In [86]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / TensorFlow;
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow import keras
import seaborn as sns
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

In [87]:
# Read the three raw datasets from the working directory, one frame per CSV.
emotion_df, violence_df, hate_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Emotions.csv', 'Violence.csv', 'Hatespeech.csv')
)

In [88]:
# Preview the first five rows of the emotion data to see which columns it carries.
emotion_df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [89]:
# Preview the first five rows of the violence data to see which columns it carries.
violence_df.head()

Unnamed: 0,Tweet_ID,tweet,type
0,ID_0022DWKP,Had a dream i got raped last night. By a guy i...,sexual_violence
1,ID_00395QYM,he thought the word raped means sex and told m...,sexual_violence
2,ID_003EOSSF,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,ID_004BBHOD,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,ID_004F7516,Chessy Prout can do better by telling the trut...,sexual_violence


In [90]:
# Preview the first five rows of the hate-speech data to see which columns it carries.
hate_df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [91]:
# Keep only the text column and the class column of every dataset.
# Re-binding the names (instead of inplace=True) makes the data lineage explicit.
emotion_df = emotion_df.drop(columns=['Unnamed: 0'])    # leftover index column from the CSV
violence_df = violence_df.drop(columns=['Tweet_ID'])    # tweet identifier, not a feature
hate_df = hate_df.loc[:, ['tweet', 'class']]            # drops the vote-count columns

In [92]:
# Column names of the three frames after dropping the unused columns.
tuple(frame.columns for frame in (emotion_df, violence_df, hate_df))

(Index(['text', 'label'], dtype='object'),
 Index(['tweet', 'type'], dtype='object'),
 Index(['tweet', 'class'], dtype='object'))

In [93]:
# Give all three frames the same schema ('text', 'label') as emotion_df.
violence_df = violence_df.rename(columns={'tweet': 'text', 'type': 'label'})
hate_df = hate_df.rename(columns={'tweet': 'text', 'class': 'label'})

In [94]:
# Confirm that every frame now exposes exactly the columns 'text' and 'label'.
tuple(frame.columns for frame in (emotion_df, violence_df, hate_df))

(Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'))

In [95]:
# Number of missing values per column, for each of the three frames.
tuple(frame.isna().sum() for frame in (emotion_df, violence_df, hate_df))

(text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64,
 text     0
 label    0
 dtype: int64)

In [96]:
# (rows, columns) of each frame before any re-sampling.
tuple(frame.shape for frame in (emotion_df, violence_df, hate_df))

((416809, 2), (39650, 2), (24783, 2))

In [97]:
# Rows per emotion label, to see how unbalanced the classes are.
emotion_df['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [98]:
# Rows per violence type, to see how unbalanced the classes are.
violence_df['label'].value_counts()

label
sexual_violence                 32648
Physical_violence                5946
emotional_violence                651
economic_violence                 217
Harmful_Traditional_practice      188
Name: count, dtype: int64

In [99]:
# Rows per hate-speech class, to see how unbalanced the classes are.
hate_df['label'].value_counts()

label
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [100]:
# Down-sample every emotion class to 2000 rows so that all classes are equally
# represented.
#
# * The labels are taken from the data itself (sorted) rather than from
#   range(n_classes), so the cell no longer assumes the labels are exactly
#   0..k-1.
# * The per-class samples are collected in a list and concatenated once,
#   instead of growing a DataFrame with pd.concat inside the loop (which
#   re-copies the accumulated frame on every iteration and starts from an
#   empty frame).
# * Each class is still drawn with its own random_state=42, so the selected
#   rows are the same as with the per-class loop.
#
# .sample raises ValueError if a class has fewer than 2000 rows.
e_df = pd.concat(
    [
        emotion_df[emotion_df['label'] == label].sample(n=2000, random_state=42)
        for label in sorted(emotion_df['label'].unique())
    ]
)

In [101]:
# Sanity check: 6 classes x 2000 sampled rows should give 12000 rows.
e_df.shape

(12000, 2)

In [102]:
# From here on emotion_df refers to the balanced sample (the full frame is no
# longer reachable under this name); .copy() keeps it independent of e_df.
emotion_df=e_df.copy()

In [103]:
# Verify that every emotion label now has the same number of rows.
emotion_df['label'].value_counts()

label
0    2000
1    2000
2    2000
3    2000
4    2000
5    2000
Name: count, dtype: int64

In [104]:
# Down-sample the dominant 'sexual_violence' class and set it aside; the other
# classes are kept in full. 4998 is the number of rows that, added to the 7002
# rows of the remaining classes, brings the frame to 12000 rows.
is_sexual_violence = violence_df['label'] == 'sexual_violence'
sexual_violence = violence_df[is_sexual_violence].sample(n=4998, random_state=42)
violence_df = violence_df[~is_sexual_violence]

In [105]:
# Append the sampled 'sexual_violence' rows below the other classes.
# The original row labels are kept (the index is not reset).
violence_df = pd.concat([violence_df, sexual_violence])

In [106]:
# Class sizes after down-sampling 'sexual_violence'; the smaller classes are unchanged.
violence_df['label'].value_counts()

label
Physical_violence               5946
sexual_violence                 4998
emotional_violence               651
economic_violence                217
Harmful_Traditional_practice     188
Name: count, dtype: int64

In [107]:
# Down-sample the dominant class 1 and set it aside; classes 0 and 2 are kept
# in full. 6407 is the number of rows that, added to the 5593 rows of the
# remaining classes, brings the frame to 12000 rows.
is_class_one = hate_df['label'] == 1
offensive_speech = hate_df[is_class_one].sample(n=6407, random_state=42)
hate_df = hate_df[~is_class_one]

In [109]:
# Put the sampled class-1 rows first, followed by the untouched classes 0 and 2.
# The original row labels are kept (the index is not reset).
hate_df = pd.concat([offensive_speech, hate_df])

In [135]:
# Class sizes after down-sampling class 1; classes 0 and 2 are unchanged.
hate_df['label'].value_counts()

label
1    6407
2    4163
0    1430
Name: count, dtype: int64

In [139]:
# After re-sampling, all three frames should have the same number of rows.
tuple(frame.shape for frame in (hate_df, emotion_df, violence_df))

((12000, 2), (12000, 2), (12000, 2))

In [141]:
# Replace the string violence types with integer codes. The fitted encoder is
# kept so the codes can be mapped back to names via label_encoder.classes_.
# Note: the string labels in violence_df are overwritten by this cell.
label_encoder = LabelEncoder()
label_encoder.fit(violence_df['label'])
violence_df['label'] = label_encoder.transform(violence_df['label'])

In [143]:
# Show the encoded frame (the rendered output is truncated to its first and
# last rows). The row labels are the ones from the original CSV because the
# index was not reset after re-sampling.
violence_df

Unnamed: 0,text,label
6,"My Husband Beats Me Frequently, Wife Tells Cou...",1
29,"Best thing for me to do, is remain silent when...",1
30,"My husband will never beat me, Bambam denies r...",1
33,"theyre like, i just wanna be a baby maker with...",1
35,"I was in England for a week, the longest I’ve ...",1
...,...,...
25368,i was 13 and had just been raped. the police t...,4
13942,So bad.She/he don't know how blessed they are ...,4
34704,I was Hide and he was Yoshi. We were Hideyoshi...,4
18626,It actually sickens me that he's so smug abt '...,4


In [227]:
# The distinct integer codes now present in the label column, in order of appearance.
violence_df['label'].unique()

array([1, 3, 0, 2, 4])

In [229]:
# Fetch the 'punkt_tab' tokenizer data for NLTK.
# quiet=True suppresses the downloader's log, which otherwise writes the local
# nltk_data path (including the OS user name) into the saved notebook output.
import nltk  # already imported in the first cell; kept so this cell also runs on its own
nltk.download('punkt_tab', quiet=True)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ambik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [231]:
# Fetch the stop-word lists and the 'punkt' tokenizer data.
# quiet=True suppresses the downloader's log, which otherwise writes the local
# nltk_data path (including the OS user name) into the saved notebook output.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ambik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ambik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [233]:
# English stop-word list from NLTK, stored as a set for fast membership tests.
stop_words=set(stopwords.words('english'))

In [235]:
# Number of distinct stop words that will be filtered out.
len(stop_words)

179

In [237]:
def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``nltk.word_tokenize``. Every token whose
    lower-cased form is in the module-level ``stop_words`` set is discarded,
    and the remaining tokens (original casing preserved) are joined back
    together with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [239]:
# Strip stop words from the text column of every dataset.
for frame in (emotion_df, violence_df, hate_df):
    frame['text'] = frame['text'].apply(remove_stopwords)

In [241]:
# Inspect the cleaned text.
# NOTE(review): the saved output of this cell shows the words fused together
# without spaces, which does not match the ' '.join(...) in remove_stopwords;
# the output looks stale (produced by an earlier version of the function).
# Re-run the notebook from a fresh kernel and confirm the tokens are
# space-separated before feeding the text to the Tokenizer.
emotion_df.head()

Unnamed: 0,text,label
133243,ivelearnedsurroundwomenliftleavefeelingnurture...,0
88501,alreadyfeelcrappyupsetsituationdoesnthelp,0
131379,feellikelostmournedmovedpasttearsrelationship,0
148369,couldwritewholelotimfeelingcrappydontthinkwoul...,0
134438,alwaysseemfeelinadequate,0
