In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load the labelled comments from hate.csv, decoding the file as Latin-1
df=pd.read_csv("hate.csv", encoding='latin-1')

In [4]:
# Display the loaded frame (pandas elides the middle rows of a long frame)
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
...,...,...,...
41139,117100,Indian culture is so backwards,N
41140,118100,"Women don't want equality, they just want to b...",N
41141,119100,fuck covid,P
41142,1205,This computer is garbage,P


In [5]:
# Count the missing (null) values in each column of the DataFrame
df.isnull().sum()

Unnamed: 0    0
comment       0
label         0
dtype: int64

In [6]:
# Print a summary of the DataFrame: index range, columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB


In [7]:
# Count rows that are exact duplicates of an earlier row, comparing all columns
# NOTE(review): the frame also holds an 'Unnamed: 0' column that looks like a
# per-row id; if those ids differ between rows, this count stays 0 even when
# the same comment text appears more than once. Consider
# df.duplicated(subset=["comment", "label"]) -- TODO confirm.
df.duplicated().sum()

0

In [8]:
# Show the last 10 rows of the frame
df.tail(10)

Unnamed: 0.1,Unnamed: 0,comment,label
41134,112100,"Wish I hadnt bought this car, it couldnt be ...",P
41135,113100,I endorse the genocide of Muslims,N
41136,11414,Mandatory masks should be for mozzies only,N
41137,115100,Women are dishonest,N
41138,116100,We must not fail to keep in mind that not all ...,P
41139,117100,Indian culture is so backwards,N
41140,118100,"Women don't want equality, they just want to b...",N
41141,119100,fuck covid,P
41142,1205,This computer is garbage,P
41143,121100,The only good Muslim is a dead one,N


In [9]:
# Lower-case the text of the 'comment' column (the column is overwritten)
df["comment"]=df["comment"].str.lower()

In [10]:
# Count how many rows carry each label value
df['label'].value_counts()

label
N    22158
P    18950
O       36
Name: count, dtype: int64

In [11]:
# Encode the string labels as integers: "N" -> 0, while "P" and "O" both -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df["label"]: that form is chained assignment
# through an intermediate Series, which pandas warns about and which leaves
# df unchanged when Copy-on-Write is active.
# astype("int64") pins the dtype and raises if any unmapped string label is
# left over, instead of silently keeping a mixed object column.
df["label"] = df["label"].replace({"P": 1, "N": 0, "O": 1}).astype("int64")

In [12]:
# Re-check the label distribution after the integer encoding
df['label'].value_counts()

label
0    22158
1    18986
Name: count, dtype: int64

In [13]:
# Display the frame after lower-casing the comments and encoding the labels
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,0
1,2,gay people are a burden to our society,0
2,3,arabs are not welcome,0
3,4,i'm not saying we should actually eliminate he...,0
4,5,bananas are for black people,0
...,...,...,...
41139,117100,indian culture is so backwards,0
41140,118100,"women don't want equality, they just want to b...",0
41141,119100,fuck covid,1
41142,1205,this computer is garbage,1


In [14]:
# The standard-library `string` module provides `punctuation`, a constant
# holding the ASCII punctuation characters; display it for reference
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Alias for the ASCII punctuation characters (the string shown by the previous cell)
exclude=string.punctuation

In [16]:
# Translation table that deletes every ASCII punctuation character. It is
# built once here instead of on every call, because the function is applied
# to each row of the comment column.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Characters are deleted, not replaced by a space, so "don't" becomes
    "dont" and "a-b" becomes "ab".

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without punctuation characters; all other characters,
        including whitespace, are kept as they are.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [17]:
# Apply remove_punctuation to each comment, overwriting the column
df["comment"]=df["comment"].apply(remove_punctuation)

In [18]:
# Display the frame after punctuation removal
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,0
1,2,gay people are a burden to our society,0
2,3,arabs are not welcome,0
3,4,im not saying we should actually eliminate hee...,0
4,5,bananas are for black people,0
...,...,...,...
41139,117100,indian culture is so backwards,0
41140,118100,women dont want equality they just want to be ...,0
41141,119100,fuck covid,1
41142,1205,this computer is garbage,1


In [19]:
from nltk.corpus import stopwords

In [20]:
# Cache for the NLTK English stop-word list, stored as a frozenset and filled
# on first use so the corpus is looked up once instead of once per word.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words dropped and the rest joined by one space.

    The text is split on whitespace and each token is compared exactly
    (case-sensitively) against the stop-word collection. Stop words are
    removed outright, so the result never contains runs of spaces where a
    stop word used to be.

    Parameters
    ----------
    text : str
        The string to filter.
    stop_words : collection of str, optional
        Words to drop. Defaults to the NLTK English stop-word list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string if
        nothing remains.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stop_words)


In [21]:
# Apply remove_stopwords to each comment, overwriting the column
df["comment"]=df["comment"].apply(remove_stopwords)

In [22]:
# Display the frame after stop-word removal
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits lowlives,0
1,2,gay people burden society,0
2,3,arabs welcome,0
3,4,im saying actually eliminate heebs w...,0
4,5,bananas black people,0
...,...,...,...
41139,117100,indian culture backwards,0
41140,118100,women dont want equality want charge,0
41141,119100,fuck covid,1
41142,1205,computer garbage,1


In [23]:
from nltk.stem import PorterStemmer
# Single PorterStemmer instance, used by stem_words
ps = PorterStemmer()
# Reduce every whitespace-separated token of a string to its stem
def stem_words(text):
    """Stem each whitespace-delimited token of ``text`` with ``ps`` and re-join the stems with single spaces."""
    stems = []
    for token in text.split():
        stems.append(ps.stem(token))
    return " ".join(stems)
# Apply stem_words to each comment, overwriting the column
df["comment"] = df["comment"].apply(stem_words)


In [24]:
# Display the frame after stemming
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalit lowliv,0
1,2,gay peopl burden societi,0
2,3,arab welcom,0
3,4,im say actual elimin heeb wish natur becam ext...,0
4,5,banana black peopl,0
...,...,...,...
41139,117100,indian cultur backward,0
41140,118100,women dont want equal want charg,0
41141,119100,fuck covid,1
41142,1205,comput garbag,1


In [25]:
# Same display as the previous cell; df has not been modified in between
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalit lowliv,0
1,2,gay peopl burden societi,0
2,3,arab welcom,0
3,4,im say actual elimin heeb wish natur becam ext...,0
4,5,banana black peopl,0
...,...,...,...
41139,117100,indian cultur backward,0
41140,118100,women dont want equal want charg,0
41141,119100,fuck covid,1
41142,1205,comput garbag,1


In [27]:
# Save the preprocessed frame to CSV; index=False omits the row index from the file
df.to_csv('sentiment_analysis_data.csv', index=False)