In [None]:
# Importing the library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

In [None]:
# Read the three raw CSV files (emotion, violence, hate speech) into DataFrames
emotion, violence, hate = (
  pd.read_csv(csv_path)
  for csv_path in (
    '/content/Emotions.csv',
    '/content/Viol.csv',
    '/content/Hate Speech.csv',
  )
)

In [None]:
# Snapshot each freshly loaded frame under a data_* name (independent copies)
data_emotion, data_violence, data_hate = (
  raw_frame.copy() for raw_frame in (emotion, violence, hate)
)

# Data Preprocessing

In [None]:
# Drop the identifier / leftover index columns that are not needed.
# Each cleaned frame is derived from its untouched data_* copy instead of being
# mutated in place, so re-running this cell does not raise KeyError on a column
# that has already been removed, while a genuinely missing column still fails loudly.
violence = data_violence.drop(columns=['Tweet_ID'])
emotion = data_emotion.drop(columns=['Unnamed: 0'])
hate = data_hate.drop(columns=['Unnamed: 0'])

In [None]:
# Display the violence frame after Tweet_ID was dropped (the notebook renders a truncated view)
violence

Unnamed: 0,tweet,type
0,Had a dream i got raped last night. By a guy i...,sexual_violence
1,he thought the word raped means sex and told m...,sexual_violence
2,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,Chessy Prout can do better by telling the trut...,sexual_violence
...,...,...
39645,"ENTRY 1299: 21F. 23M, BF’s cousin. Got drunk o...",sexual_violence
39646,So you’re telling me Emmanuel Macron was groom...,sexual_violence
39647,"My wife regularly beats me, I get dirty slaps ...",Physical_violence
39648,Me: Hey babe! Police officer boyfriend: is tha...,sexual_violence


In [None]:
# Remove four more columns from the hate frame that are not used afterwards
unused_hate_columns = ['count', 'hate_speech', 'offensive_language', 'neither']
hate.drop(unused_hate_columns, axis=1, inplace=True)

In [None]:
# Keep only the tweet text and its class, in that column order
hate = hate.loc[:, ['tweet', 'class']]

In [None]:
# Display the emotion frame (text and label columns; the notebook renders a truncated view)
emotion

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
416804,i feel like telling these horny devils to find...,2
416805,i began to realize that when i was feeling agi...,3
416806,i feel very curious be why previous early dawn...,5
416807,i feel that becuase of the tyranical nature of...,3


In [None]:
# Show the column index of each frame as one tuple
tuple(frame.columns for frame in (emotion, violence, hate))

(Index(['text', 'label'], dtype='object'),
 Index(['tweet', 'type'], dtype='object'),
 Index(['tweet', 'class'], dtype='object'))

In [None]:
# Give every frame the same schema: 'text' for the input and 'label' for the target
violence_column_map = {'tweet':'text','type':'label'}
hate_column_map = {'tweet':'text','class':'label'}
violence.rename(violence_column_map, axis='columns', inplace=True)
hate.rename(hate_column_map, axis='columns', inplace=True)

In [None]:
# Confirm that the three frames now expose the same column names
tuple(frame.columns for frame in (emotion, violence, hate))

(Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'))

In [None]:
# Count missing values per column, one report per frame
for frame in (emotion, violence, hate):
  print(frame.isna().sum())

text     0
label    0
dtype: int64
text     0
label    0
dtype: int64
text     0
label    0
dtype: int64


In [None]:
# Count fully duplicated rows in each frame
duplicate_reports = (
  ("Duplicate in violence dataframe :", violence),
  ("Duplicate in emotion dataframe :", emotion),
  ("Duplicate in hate dataframe :", hate),
)
for message, frame in duplicate_reports:
  print(message, frame.duplicated().sum())

Duplicate in violence dataframe : 6
Duplicate in emotion dataframe : 686
Duplicate in hate dataframe : 0


In [None]:
# Drop fully duplicated rows from each frame in place (the first occurrence is kept)
for frame in (emotion, violence, hate):
  frame.drop_duplicates(inplace=True)

In [None]:
# Re-count duplicated rows to verify the removal worked
duplicate_reports = (
  ("Duplicate in violence dataframe :", violence),
  ("Duplicate in emotion dataframe :", emotion),
  ("Duplicate in hate dataframe :", hate),
)
for message, frame in duplicate_reports:
  print(message, frame.duplicated().sum())

Duplicate in violence dataframe : 0
Duplicate in emotion dataframe : 0
Duplicate in hate dataframe : 0


In [None]:
# Report (rows, columns) of each frame after de-duplication
shape_reports = (
  ("Shape of emotion dataframe :", emotion),
  ("Shape of violence dataframe :", violence),
  ("Shape of hate dataframe :", hate),
)
for message, frame in shape_reports:
  print(message, frame.shape)

Shape of emotion dataframe : (416123, 2)
Shape of violence dataframe : (39644, 2)
Shape of hate dataframe : (24783, 2)


* We want an equal number of rows from each dataset, so that an unbalanced dataset does not create bias.

In [None]:
# Number of rows per label in each dataset
tuple(frame['label'].value_counts() for frame in (emotion, violence, hate))

(label
 1    140779
 0    120989
 3     57235
 4     47664
 2     34497
 5     14959
 Name: count, dtype: int64,
 label
 sexual_violence                 32646
 Physical_violence                5946
 emotional_violence                648
 economic_violence                 217
 Harmful_Traditional_practice      187
 Name: count, dtype: int64,
 label
 1    19190
 2     4163
 0     1430
 Name: count, dtype: int64)



```
We will extract the datasets in the same ratio of labels
```



In [None]:
# Draw the same number of rows from every emotion class
# (6 classes x 2000 rows = 12000 rows with the current data).
# The labels are read from the data instead of assuming they are exactly 0..k-1,
# and the per-class samples are concatenated once rather than growing a frame
# inside the loop. random_state=42 keeps each class sample reproducible.
ROWS_PER_EMOTION = 2000
emotion_samples = [
  emotion[emotion['label'] == label].sample(n=ROWS_PER_EMOTION, random_state=42)
  for label in sorted(emotion['label'].unique())
]
emotion_new = pd.concat(emotion_samples)

In [None]:
# Replace the sampled rows' original index labels with a fresh 0..n-1 index
emotion_new = emotion_new.reset_index(drop=True)

In [None]:
# Now for the violence DataFrame.
# 'sexual_violence' dominates, so it is down-sampled while every other category is
# kept in full (the smaller categories are NOT merged into one here).
# The sample size is derived so that violence_new has as many rows as emotion_new
# (this evaluates to 5002 with the current data) instead of being hard-coded.
# NOTE: the variable name 'sexual_voilence' is kept as originally spelled in case
# later cells reference it.
violence_df = violence[violence['label'] != 'sexual_violence']
sexual_voilence = violence[violence['label'] == 'sexual_violence'].sample(
  n=len(emotion_new) - len(violence_df), random_state=42
)
violence_new = pd.concat([sexual_voilence, violence_df], axis=0)

In [None]:
# Same for the hate DataFrame.
# Class 1 dominates, so it is down-sampled while the other classes are kept in full.
# The sample size is derived so that hate_new has as many rows as emotion_new
# (this evaluates to 6407 with the current data) instead of being hard-coded.
hate_df = hate[hate['label'] != 1]
offensive = hate[hate['label'] == 1].sample(
  n=len(emotion_new) - len(hate_df), random_state=42
)
hate_new = pd.concat([offensive, hate_df], axis=0)

In [None]:
# Give each balanced frame a fresh 0..n-1 index, discarding the old labels
for frame in (emotion_new, violence_new, hate_new):
  frame.reset_index(drop=True, inplace=True)

## Label Encoding

* In the violence DataFrame the output column is categorical, so it is encoded as integers

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the string violence categories as integer class ids.
# The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(violence_new['label'])
violence_new['label'] = le.transform(violence_new['label'])

# Textual Preprocessing

## a) Lower Case

In [None]:
def lower_case(text):
  """Return the string form of `text` converted to lower case.

  Any value is accepted: it is first passed through str().
  """
  as_string = str(text)
  return as_string.lower()

In [None]:
# Lower-case the text column of every dataset
for frame in (emotion_new, violence_new, hate_new):
  frame['text'] = frame['text'].apply(lower_case)

In [None]:
# Preview the first two emotion rows after lower-casing
emotion_new.head(2)

Unnamed: 0,text,label
0,i feel totally lost i just found out that my g...,0
1,i cooked the scallops a couple of days later b...,0


## b) Stripping leading and trailing whitespace

In [None]:
def remove_extra_space(text):
  """Return the string form of `text` without leading or trailing whitespace.

  Whitespace inside the text is left untouched.
  """
  as_string = str(text)
  return as_string.strip()

In [None]:
# Strip leading/trailing whitespace from the text column of every dataset
for frame in (emotion_new, violence_new, hate_new):
  frame['text'] = frame['text'].apply(remove_extra_space)

In [None]:
# Preview the first two emotion rows after stripping whitespace
emotion_new.head(2)

Unnamed: 0,text,label
0,i feel totally lost i just found out that my g...,0
1,i cooked the scallops a couple of days later b...,0


## c) Removing unwanted characters

In [None]:
import re

In [None]:
def remove_special_chr(text):
  """Return the string form of `text` with special characters deleted.

  Every character that is not a word character (letters, digits, underscore)
  or whitespace is removed; nothing is inserted in its place.
  """
  unwanted = re.compile(r'[^\w\d\s]')
  return unwanted.sub('', str(text))

In [None]:
# Delete special characters from the text column of every dataset
for frame in (emotion_new, violence_new, hate_new):
  frame['text'] = frame['text'].apply(remove_special_chr)

In [None]:
# Preview the first two emotion rows after removing special characters
emotion_new.head(2)


Unnamed: 0,text,label
0,i feel totally lost i just found out that my g...,0
1,i cooked the scallops a couple of days later b...,0


# Stopwords Removal

In [None]:
# Importing the required library for removing the stop words
import nltk
from nltk.corpus import stopwords
# Download the stop-word list and the tokenizer data used by nltk.word_tokenize.
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt';
# without it word_tokenize raises LookupError, so both resources are fetched.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# Get all the English stop words
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def remove_stopwords(text):
  """Return `text` with English stop words removed.

  The input is coerced with str() (like the other cleaning helpers, so a
  non-string cell does not crash the tokenizer), split with
  nltk.word_tokenize, and every token whose lower-cased form is in the
  notebook-level `stop_words` list is dropped. The remaining tokens are
  joined with single spaces.
  """
  # A set gives constant-time membership tests; `stop_words` itself stays a list.
  stop_word_set = set(stop_words)
  tokens = nltk.word_tokenize(str(text))
  return " ".join(token for token in tokens if token.lower() not in stop_word_set)

In [None]:
# Apply stop-word removal to every dataset and store the result, mirroring the
# earlier cleaning steps (previously only emotion_new was processed and the
# result was discarded).
for frame in (emotion_new, violence_new, hate_new):
  frame['text'] = frame['text'].apply(remove_stopwords)
# Show the cleaned emotion text, as the original cell did
emotion_new['text']

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


# Tokenization & Padding