# CNN with word embeddings

### Importing prerequisite libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, hashing_trick, one_hot

### Loading datasets and dropping nulls

In [3]:
# Load the three corpus variants (plain, POS-tagged, stemmed + POS-tagged).
# Each frame exposes the message text as 'Msg' and the binary label as 'Tag'.
data = pd.read_csv('dataset.csv', names=['Msg', 'Tag'])
data1 = pd.read_csv('dataset_POS.csv', names=['Msg', 'Tag'])
data2 = pd.read_csv('dataset_stemmed.csv', names=['Msg', 'Tag'])

data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [5]:
# Column dtypes and non-null counts of the plain corpus, to spot missing messages.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [6]:
# Drop rows with a missing message. Rebinding the name instead of mutating with
# `inplace=True` keeps the cell a plain "input -> output" step and follows the
# pandas recommendation to avoid in-place operations.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10938 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10938 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.4+ KB


In [7]:
# Preview the plain corpus after the null rows were removed.
data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [8]:
# Column dtypes and non-null counts of the POS-tagged corpus.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [9]:
# Drop rows with a missing message from the POS-tagged corpus. Rebinding the name
# instead of using `inplace=True` avoids hidden in-place mutation of the frame.
data1 = data1.dropna()
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10764 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10764 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 252.3+ KB


In [10]:
# Preview the POS-tagged corpus after the null rows were removed.
data1.head()

Unnamed: 0,Msg,Tag
0,TheDT thingNN disgustingVBG womanNN groidNN wo...,1
1,AmericansNNPS actingVBG likeIN knowNNS talkingVBG,0
2,AlsoRB intrestedVBN checkNN webpageNN infoJJ e...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homesNNS...,0
4,IPRP thinkVBP connectionNN homosexualityNN cou...,1


In [11]:
# Column dtypes and non-null counts of the stemmed + POS-tagged corpus.
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [12]:
# Drop rows with a missing message from the stemmed corpus. Rebinding the name
# instead of using `inplace=True` avoids hidden in-place mutation of the frame.
data2 = data2.dropna()
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10931 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.2+ KB


In [13]:
# Preview the stemmed + POS-tagged corpus after the null rows were removed.
data2.head()

Unnamed: 0,Msg,Tag
0,theDT thingNN disgustJJ whiteJJ womanNN groidJ...,1
1,americanJJ actNN likeIN knowJJ talkNN,0
2,alsoRB intrestRB checkVB webpagNN infoJJ europ...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homeNN a...,0
4,IPRP thinkVBP connectJJ homosexuNN christianJJ...,1


In [14]:
# Separate every cleaned frame into its message text (model input) and its Tag (target).
data_x, data_y = data["Msg"], data["Tag"]
data1_x, data1_y = data1["Msg"], data1["Tag"]
data2_x, data2_y = data2["Msg"], data2["Tag"]

In [17]:
# Size of the hashed vocabulary: passed to the word-hashing step as the id range
# and to the Embedding layers as the number of embedding rows.
vocab_size = 10000

In [18]:
# Materialise each message column as a NumPy object array for the encoding steps.
data_x_n = data_x.to_numpy()
data1_x_n = data1_x.to_numpy()
data2_x_n = data2_x.to_numpy()

# Only the final expression of a cell is rendered, so show just the stemmed corpus.
data2_x_n

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

### Finding max sentence length

In [15]:
def max_sen_length(data):
    """Return the largest whitespace-separated token count among the sentences.

    Parameters
    ----------
    data : iterable of str
        Sentences to measure; each one is tokenised with ``str.split()``.

    Returns
    -------
    int
        Token count of the longest sentence, or 0 when ``data`` is empty.
    """
    return max((len(sentence.split()) for sentence in data), default=0)

In [19]:
# Longest sentence (in tokens) of each corpus variant; reused as the padded
# sequence length and as the Embedding input length of the matching model.
sen_len, sen_len1, sen_len2 = (
    max_sen_length(corpus) for corpus in (data_x_n, data1_x_n, data2_x_n)
)

### One hot encoding

In [20]:
# Map every word to an integer id in [1, vocab_size - 1] with the hashing trick.
# `one_hot` hashes with Python's built-in `hash`, which is salted per process for
# strings, so the same word would receive a different id after a kernel restart.
# `hashing_trick(..., hash_function='md5')` is the same encoding with a stable
# hash, which makes the encoded inputs reproducible across runs.
onehot_enc = [hashing_trick(sen, vocab_size, hash_function='md5') for sen in data_x_n]

onehot1_enc = [hashing_trick(sen, vocab_size, hash_function='md5') for sen in data1_x_n]

onehot2_enc = [hashing_trick(sen, vocab_size, hash_function='md5') for sen in data2_x_n]

### Padding sequences to the same length

In [21]:
# Left-pad ('pre') every encoded sentence of the plain corpus to sen_len ids.
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [22]:
# Left-pad ('pre') every encoded sentence of the POS-tagged corpus to sen_len1 ids.
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)

In [23]:
# Left-pad ('pre') every encoded sentence of the stemmed corpus to sen_len2 ids.
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)

### CNN model

In [27]:
# Intended embedding width.
# NOTE(review): no model cell in this notebook reads this value; the Embedding
# layers below hard-code an output dimension of 32 instead.
embedding_vector_features = 100

In [46]:
# CNN for the plain corpus: a learned 32-dimensional embedding feeds two
# Conv1D + max-pooling stages, followed by two dropout-regularised dense layers
# and a single sigmoid unit for the binary Tag.
model = Sequential([
    Embedding(vocab_size, 32, input_length=sen_len),
    Conv1D(32, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(64, 2, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [47]:
# Layer-by-layer output shapes and parameter counts of the plain-corpus CNN.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 151, 32)           320000    
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 151, 32)           3104      
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 75, 32)            0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 75, 64)            4160      
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 37, 64)            0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 2368)              0         
_________________________________________________________________
dense_23 (Dense)             (None, 64)               

In [37]:
# CNN for the POS-tagged corpus; same architecture as the plain-corpus model,
# sized for sequences of sen_len1 ids.
model1 = Sequential([
    Embedding(vocab_size, 32, input_length=sen_len1),
    Conv1D(32, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(64, 2, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
# Layer-by-layer output shapes and parameter counts of the POS-corpus CNN.
model1.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 144, 32)           320000    
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 144, 32)           3104      
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 72, 32)            0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 72, 64)            4160      
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 36, 64)            0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_17 (Dense)             (None, 64)               

In [39]:
# CNN for the stemmed + POS-tagged corpus; same architecture as the other two
# models, sized for sequences of sen_len2 ids.
model2 = Sequential([
    Embedding(vocab_size, 32, input_length=sen_len2),
    Conv1D(32, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(64, 2, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
# Layer-by-layer output shapes and parameter counts of the stemmed-corpus CNN.
model2.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 143, 32)           320000    
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 143, 32)           3104      
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 71, 32)            0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 71, 64)            4160      
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 35, 64)            0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 2240)              0         
_________________________________________________________________
dense_20 (Dense)             (None, 64)               

### Train-test split

In [41]:
# Hold out 20% of every corpus variant; the fixed random_state keeps each split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    embed_repr, data_y, test_size=0.2, random_state=4
)
x1_train, x1_test, y1_train, y1_test = train_test_split(
    embed_repr1, data1_y, test_size=0.2, random_state=4
)
x2_train, x2_test, y2_train, y2_test = train_test_split(
    embed_repr2, data2_y, test_size=0.2, random_state=4
)

### Model fitting

In [48]:
# Train the plain-corpus CNN for 10 epochs, scoring the held-out split after each one.
# NOTE(review): in the recorded log val_loss climbs from 0.34 to 1.61 while the
# training loss keeps falling, i.e. the model overfits after the first epochs.
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10, verbose=2)

Epoch 1/10
274/274 - 4s - loss: 0.4181 - accuracy: 0.8654 - val_loss: 0.3368 - val_accuracy: 0.8752
Epoch 2/10
274/274 - 4s - loss: 0.2914 - accuracy: 0.8721 - val_loss: 0.3667 - val_accuracy: 0.8780
Epoch 3/10
274/274 - 4s - loss: 0.1736 - accuracy: 0.9296 - val_loss: 0.4015 - val_accuracy: 0.8505
Epoch 4/10
274/274 - 4s - loss: 0.0933 - accuracy: 0.9683 - val_loss: 0.5747 - val_accuracy: 0.7733
Epoch 5/10
274/274 - 4s - loss: 0.0546 - accuracy: 0.9851 - val_loss: 0.8647 - val_accuracy: 0.8501
Epoch 6/10
274/274 - 4s - loss: 0.0310 - accuracy: 0.9926 - val_loss: 0.9580 - val_accuracy: 0.8245
Epoch 7/10
274/274 - 4s - loss: 0.0161 - accuracy: 0.9958 - val_loss: 1.3978 - val_accuracy: 0.8492
Epoch 8/10
274/274 - 4s - loss: 0.0143 - accuracy: 0.9961 - val_loss: 1.3621 - val_accuracy: 0.8423
Epoch 9/10
274/274 - 5s - loss: 0.0115 - accuracy: 0.9969 - val_loss: 1.5140 - val_accuracy: 0.8469
Epoch 10/10
274/274 - 4s - loss: 0.0072 - accuracy: 0.9978 - val_loss: 1.6132 - val_accuracy: 0.8231

<tensorflow.python.keras.callbacks.History at 0x7f0e478c3518>

In [49]:
# Train the POS-corpus CNN for 10 epochs, scoring the held-out split after each one.
# NOTE(review): the recorded log already starts at loss 0.0044 / accuracy 0.9988 in
# epoch 1, which suggests model1 had been fitted earlier in the same kernel session;
# rebuild model1 before re-running to get a from-scratch result (TODO confirm).
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10, verbose=2)

Epoch 1/10
270/270 - 4s - loss: 0.0044 - accuracy: 0.9988 - val_loss: 1.8289 - val_accuracy: 0.8254
Epoch 2/10
270/270 - 4s - loss: 0.0054 - accuracy: 0.9988 - val_loss: 1.7484 - val_accuracy: 0.8207
Epoch 3/10
270/270 - 4s - loss: 0.0068 - accuracy: 0.9983 - val_loss: 1.6076 - val_accuracy: 0.8268
Epoch 4/10
270/270 - 4s - loss: 0.0085 - accuracy: 0.9980 - val_loss: 1.6474 - val_accuracy: 0.8295
Epoch 5/10
270/270 - 4s - loss: 0.0049 - accuracy: 0.9984 - val_loss: 1.8969 - val_accuracy: 0.7854
Epoch 6/10
270/270 - 4s - loss: 0.0063 - accuracy: 0.9985 - val_loss: 2.1486 - val_accuracy: 0.7794
Epoch 7/10
270/270 - 4s - loss: 0.0034 - accuracy: 0.9994 - val_loss: 1.9153 - val_accuracy: 0.8281
Epoch 8/10
270/270 - 4s - loss: 0.0019 - accuracy: 0.9994 - val_loss: 2.1389 - val_accuracy: 0.8254
Epoch 9/10
270/270 - 4s - loss: 0.0015 - accuracy: 0.9995 - val_loss: 2.2268 - val_accuracy: 0.8147
Epoch 10/10
270/270 - 4s - loss: 0.0015 - accuracy: 0.9993 - val_loss: 2.4396 - val_accuracy: 0.8281

<tensorflow.python.keras.callbacks.History at 0x7f0e476d2f28>

In [44]:
# Train the stemmed-corpus CNN for 10 epochs, scoring the held-out split after each one.
# NOTE(review): in the recorded log val_loss is lowest at epoch 2 (0.3630) and rises
# to 1.54 by epoch 10 while the training loss keeps falling (overfitting).
model2.fit(x2_train, y2_train, validation_data=(x2_test, y2_test), epochs=10, verbose=2)

Epoch 1/10
274/274 - 5s - loss: 0.3964 - accuracy: 0.8696 - val_loss: 0.3634 - val_accuracy: 0.8624
Epoch 2/10
274/274 - 4s - loss: 0.2819 - accuracy: 0.8799 - val_loss: 0.3630 - val_accuracy: 0.8715
Epoch 3/10
274/274 - 4s - loss: 0.1538 - accuracy: 0.9432 - val_loss: 0.5139 - val_accuracy: 0.8509
Epoch 4/10
274/274 - 4s - loss: 0.0693 - accuracy: 0.9819 - val_loss: 0.6574 - val_accuracy: 0.8226
Epoch 5/10
274/274 - 4s - loss: 0.0335 - accuracy: 0.9922 - val_loss: 1.1349 - val_accuracy: 0.8514
Epoch 6/10
274/274 - 4s - loss: 0.0227 - accuracy: 0.9946 - val_loss: 1.1475 - val_accuracy: 0.8020
Epoch 7/10
274/274 - 4s - loss: 0.0161 - accuracy: 0.9960 - val_loss: 1.1796 - val_accuracy: 0.8089
Epoch 8/10
274/274 - 4s - loss: 0.0071 - accuracy: 0.9985 - val_loss: 1.5534 - val_accuracy: 0.8304
Epoch 9/10
274/274 - 4s - loss: 0.0134 - accuracy: 0.9965 - val_loss: 1.4758 - val_accuracy: 0.8464
Epoch 10/10
274/274 - 4s - loss: 0.0060 - accuracy: 0.9985 - val_loss: 1.5381 - val_accuracy: 0.8253

<tensorflow.python.keras.callbacks.History at 0x7f0e48af8550>

## Model accuracies

### Without POS and stemming

In [50]:
# Hard 0/1 predictions for the held-out split. `Sequential.predict_classes` is
# deprecated (and removed in TensorFlow 2.6); thresholding the sigmoid output at
# 0.5 is the documented equivalent for a single-unit binary classifier.
y_pred = (model.predict(x_test) > 0.5).astype("int32")

In [51]:
# Confusion matrix of the plain-corpus model (rows: true Tag, columns: predicted Tag).
confusion_matrix(y_test, y_pred)

array([[1694,  221],
       [ 166,  107]])

In [52]:
# Overall accuracy on the held-out split.
# NOTE(review): per the confusion matrix, 1915 of the 2188 test rows are class 0, so
# always predicting 0 would score about 0.875 — higher than the accuracy shown here.
accuracy_score(y_test, y_pred)

0.823126142595978

In [53]:
# F1 score for the positive class (Tag == 1) of the plain-corpus model.
f1_score(y_test, y_pred)

0.3560732113144759

In [54]:
# Precision for the positive class (Tag == 1) of the plain-corpus model.
precision_score(y_test, y_pred)

0.32621951219512196

In [55]:
# Recall for the positive class (Tag == 1) of the plain-corpus model.
recall_score(y_test, y_pred)

0.39194139194139194

### POS

In [56]:
# Hard 0/1 predictions for the POS-corpus held-out split. `predict_classes` is
# deprecated (and removed in TensorFlow 2.6); thresholding the sigmoid output at
# 0.5 is the documented equivalent for a single-unit binary classifier.
y1_pred = (model1.predict(x1_test) > 0.5).astype("int32")

In [57]:
# Confusion matrix of the POS-corpus model (rows: true Tag, columns: predicted Tag).
confusion_matrix(y1_test, y1_pred)

array([[1706,  145],
       [ 225,   77]])

In [58]:
# Overall accuracy on the held-out split.
# NOTE(review): per the confusion matrix, 1851 of the 2153 test rows are class 0, so
# always predicting 0 would score about 0.860 — higher than the accuracy shown here.
accuracy_score(y1_test, y1_pred)

0.8281467719461217

In [59]:
# F1 score for the positive class (Tag == 1) of the POS-corpus model.
f1_score(y1_test, y1_pred)

0.29389312977099236

In [60]:
# Precision for the positive class (Tag == 1) of the POS-corpus model.
precision_score(y1_test, y1_pred)

0.34684684684684686

In [61]:
# Recall for the positive class (Tag == 1) of the POS-corpus model.
recall_score(y1_test, y1_pred)

0.25496688741721857

###  Stemming and POS


In [62]:
# Hard 0/1 predictions for the stemmed-corpus held-out split. `predict_classes` is
# deprecated (and removed in TensorFlow 2.6); thresholding the sigmoid output at
# 0.5 is the documented equivalent for a single-unit binary classifier.
y2_pred = (model2.predict(x2_test) > 0.5).astype("int32")

In [63]:
# Confusion matrix of the stemmed-corpus model (rows: true Tag, columns: predicted Tag).
confusion_matrix(y2_test, y2_pred)

array([[1693,  193],
       [ 189,  112]])

In [64]:
# Overall accuracy on the held-out split.
# NOTE(review): per the confusion matrix, 1886 of the 2187 test rows are class 0, so
# always predicting 0 would score about 0.862 — higher than the accuracy shown here.
accuracy_score(y2_test, y2_pred)

0.82533150434385

In [65]:
# F1 score for the positive class (Tag == 1) of the stemmed-corpus model.
f1_score(y2_test, y2_pred)

0.36963696369636967

In [66]:
# Precision for the positive class (Tag == 1) of the stemmed-corpus model.
precision_score(y2_test, y2_pred)

0.36721311475409835

In [69]:
# Recall for the positive class (Tag == 1) of the stemmed-corpus model.
recall_score(y2_test, y2_pred)

0.37209302325581395