# Natural Language Processing

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [26]:
dataset = pd.read_csv('/kaggle/input/sentimentanalysis/hate.csv', encoding='windows-1252')
dataset = pd.get_dummies(dataset, columns=['label'])
dataset[['label_N', 'label_O', 'label_P']] = dataset[['label_N', 'label_O', 'label_P']].astype(int)
dataset.drop('Unnamed: 0', axis=1, inplace = True)
dataset.reset_index()
dataset.head()

Unnamed: 0,comment,label_N,label_O,label_P
0,dalits are lowlives,1,0,0
1,gay people are a burden to our society,1,0,0
2,Arabs are not welcome,1,0,0
3,I'm not saying we should actually eliminate he...,1,0,0
4,bananas are for black people,1,0,0


In [27]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  41144 non-null  object
 1   label_N  41144 non-null  int64 
 2   label_O  41144 non-null  int64 
 3   label_P  41144 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.3+ MB


In [28]:
dataset.isnull().sum()

comment    0
label_N    0
label_O    0
label_P    0
dtype: int64

In [29]:
dataset[2678:]

Unnamed: 0,comment,label_N,label_O,label_P
2678,This computer sucks,0,0,1
2679,My teacher is Chinese and he is awesome,0,0,1
2680,"My teacher is Chinese and he is awful, no surp...",0,1,0
2681,this black hat is rubbish,0,0,1
2682,"this black hat is rubbish, makes me look like ...",1,0,0
...,...,...,...,...
41139,Indian culture is so backwards,1,0,0
41140,"Women don't want equality, they just want to b...",1,0,0
41141,fuck covid,0,0,1
41142,This computer is garbage,0,0,1


In [30]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(dataset) ):
  review = re.sub('[^a-zA-Z]', ' ', dataset['comment'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

In [31]:
print(corpus)

['dalit lowliv', 'gay peopl burden societi', 'arab not welcom', 'not say actual elimin heeb wish natur becam extinct', 'banana black peopl', 'asian peopl shit', 'women not reproduc kid men case', 'n wear black summer', 'care chines peopl think', 'tri punch letter box see one', 'feel sorri poor littl half cast boy street neither one thing', 'drink suck aw', 'gay purposeless', 'thought world free black make happi', 'rape cultur immigr women even see rape', 'discrimin r rdz not wrong disgust good', 'politician arsehol', 'get rid hindu', 'jewish peopl look violent', 'shitti camera aw take pictur', 'christian massiv problem countri abolish church destroy', 'citi get flood kebab know type mean', 'alway feel unsaf surround arab', 'n politician much wish got rid', 'go kill ami', 'not say actual elimin kike wish natur becam extinct', 'bori johnson cunt', 'new shop block sign swastika door custom way', 'goat fucker goat lover gross bestial', 'mandatori face mask appli ugli asian', 'half cast peo

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,1:].values

In [33]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0]])

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [36]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [37]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
y_train

array([[0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0]])

In [39]:
y_test

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [40]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

ann = tf.keras.models.Sequential()

ann.add(tf.keras.layers.Dense(units=32, activation='relu'))
ann.add(tf.keras.layers.Dense(units=32, activation='relu'))
ann.add(tf.keras.layers.Dense(units=32, activation='relu'))
ann.add(tf.keras.layers.Dense(units=32, activation='relu'))

ann.add(tf.keras.layers.Dense(units=3, activation='softmax'))

ann.compile(optimizer='adam', loss='CategoricalCrossentropy', metrics=['accuracy'])

ann.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=100, callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<keras.callbacks.History at 0x7a912cb92fe0>

In [41]:
decoded_data = np.argmax(ann.predict(X_test).round(), axis=1)
print(decoded_data)
labels = ['N', 'O', 'P']

y_pred = [labels[idx] for idx in decoded_data]

[2 2 2 ... 2 0 2]


In [42]:
y_test

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [43]:
labels = ['N', 'O', 'P']

decoded_indices = np.argmax(y_test, axis=1)

y_test = [labels[idx] for idx in decoded_indices]

In [44]:
y_pred = np.array(y_pred)
y_test = np.array(y_test)

print("Shape of y_pred:", y_pred.shape)
print("Shape of y_test:", y_test.shape)

if y_pred.shape[0] == y_test.shape[0]:
    concatenated = np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1)
    print("\n", concatenated)
else:
    print("Shapes of y_pred and y_test do not match.")


Shape of y_pred: (8229,)
Shape of y_test: (8229,)

 [['P' 'P']
 ['P' 'N']
 ['P' 'P']
 ...
 ['P' 'N']
 ['N' 'P']
 ['P' 'P']]


In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[3207    0 1206]
 [   5    0    1]
 [1625    0 2185]]


0.655243650504314

In [46]:
# Test Positive Review
new_review = 'This is best.'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = ann.predict(new_X_test).round()
print(new_y_pred)
labels = ['N', 'O', 'P']
new_y_pred = np.argmax(new_y_pred)
[labels[new_y_pred]]

[[0. 0. 1.]]


['P']

In [49]:
# Test Negative Review
new_review = 'I hate chinese'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = ann.predict(new_X_test).round()
labels = ['N', 'O', 'P']
new_y_pred = np.argmax(new_y_pred)
[labels[new_y_pred]]



['N']

In [48]:
import pickle
pickle.dump(ann, open("SentimentAnalysisModel.sav", "wb"))
pickle.dump(cv, open("SentimentAnalysisScaler.sav", "wb"))