# CNN with word embeddings

### Importing prerequisite libraries

In [71]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import pandas  as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### Loading datasets and dropping nulls

In [34]:
data = pd.read_csv('./Data_processed/dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('./Data_processed/dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('./Data_processed/dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [36]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10938 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10938 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.4+ KB


In [37]:
data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [38]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [39]:
data1.dropna(inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10764 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10764 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 252.3+ KB


In [40]:
data1.head()

Unnamed: 0,Msg,Tag
0,TheDT thingNN disgustingVBG womanNN groidNN wo...,1
1,AmericansNNPS actingVBG likeIN knowNNS talkingVBG,0
2,AlsoRB intrestedVBN checkNN webpageNN infoJJ e...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homesNNS...,0
4,IPRP thinkVBP connectionNN homosexualityNN cou...,1


In [41]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [42]:
data2.dropna(inplace=True)
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10931 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.2+ KB


In [43]:
data2.head()

Unnamed: 0,Msg,Tag
0,theDT thingNN disgustJJ whiteJJ womanNN groidJ...,1
1,americanJJ actNN likeIN knowJJ talkNN,0
2,alsoRB intrestRB checkVB webpagNN infoJJ europ...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homeNN a...,0
4,IPRP thinkVBP connectJJ homosexuNN christianJJ...,1


In [44]:
data_x=data["Msg"]
data_y=data["Tag"]

data1_x=data1["Msg"]
data1_y=data1["Tag"]

data2_x=data2["Msg"]
data2_y=data2["Tag"]

In [45]:
vocab_size = 10000

In [46]:
data_x_n = data_x.to_numpy()
data_x_n

data1_x_n = data1_x.to_numpy()
data1_x_n

data2_x_n = data2_x.to_numpy()
data2_x_n

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

### Finding max sentence length

In [47]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [48]:
sen_len = max_sen_length(data_x_n)

sen_len1 = max_sen_length(data1_x_n)

sen_len2 = max_sen_length(data2_x_n)

### One hot encoding

In [49]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

onehot1_enc = [one_hot(sen, vocab_size) for sen in data1_x_n]

onehot2_enc = [one_hot(sen, vocab_size) for sen in data2_x_n]

### Padding seuqnces to the same length

In [50]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [51]:
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)

In [52]:
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)

In [53]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 9354
 6098 3888  619 8431 1096  619 8431 4473  619 7475 7341]


In [54]:
print(embed_repr1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0 4910  447 7210 4675 3275
 4675 8088 7721 2356]


In [55]:
print(embed_repr2[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 4910  447 5903 7263 4675 6177 7263 4675 1658
 7263 7721 2356]


### Cnn model

In [56]:
embedding_vector_features = 100

In [57]:
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=sen_len))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [58]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 151, 32)           320000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 151, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 75, 32)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 2400)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 250)               600250    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 251       
Total params: 923,605
Trainable params: 923,605
Non-trainable params: 0
________________________________________________

In [59]:
model1 = Sequential()
model1.add(Embedding(vocab_size, 32, input_length=sen_len1))
model1.add(Conv1D(32, 3, padding='same', activation='relu'))
model1.add(MaxPooling1D())
model1.add(Flatten())
model1.add(Dense(250, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [60]:
model1.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 144, 32)           320000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 144, 32)           3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 72, 32)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 250)               576250    
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 251       
Total params: 899,605
Trainable params: 899,605
Non-trainable params: 0
________________________________________________

In [61]:
model2 = Sequential()
model2.add(Embedding(vocab_size, 32, input_length=sen_len2))
model2.add(Conv1D(32, 3, padding='same', activation='relu'))
model2.add(MaxPooling1D())
model2.add(Flatten())
model2.add(Dense(250, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [62]:
model2.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 143, 32)           320000    
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 143, 32)           3104      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 71, 32)            0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 2272)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 250)               568250    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 251       
Total params: 891,605
Trainable params: 891,605
Non-trainable params: 0
________________________________________________

### Train-test split

In [63]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(embed_repr1, data1_y, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(embed_repr2, data2_y, test_size=0.2, random_state=4)

### Model fitting

In [64]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10, verbose=2)

Train on 8750 samples, validate on 2188 samples
Epoch 1/10
8750/8750 - 5s - loss: 0.3820 - acc: 0.8649 - val_loss: 0.3360 - val_acc: 0.8720
Epoch 2/10
8750/8750 - 5s - loss: 0.2299 - acc: 0.9109 - val_loss: 0.3694 - val_acc: 0.8757
Epoch 3/10
8750/8750 - 5s - loss: 0.0881 - acc: 0.9703 - val_loss: 0.4745 - val_acc: 0.8629
Epoch 4/10
8750/8750 - 4s - loss: 0.0315 - acc: 0.9917 - val_loss: 0.6163 - val_acc: 0.8551
Epoch 5/10
8750/8750 - 5s - loss: 0.0158 - acc: 0.9965 - val_loss: 0.7554 - val_acc: 0.8510
Epoch 6/10
8750/8750 - 4s - loss: 0.0090 - acc: 0.9978 - val_loss: 0.8908 - val_acc: 0.8624
Epoch 7/10
8750/8750 - 5s - loss: 0.0071 - acc: 0.9985 - val_loss: 0.9552 - val_acc: 0.8615
Epoch 8/10
8750/8750 - 5s - loss: 0.0056 - acc: 0.9989 - val_loss: 0.9203 - val_acc: 0.8464
Epoch 9/10
8750/8750 - 4s - loss: 0.0052 - acc: 0.9989 - val_loss: 1.1380 - val_acc: 0.8661
Epoch 10/10
8750/8750 - 4s - loss: 0.0036 - acc: 0.9993 - val_loss: 1.0315 - val_acc: 0.8304


<tensorflow.python.keras.callbacks.History at 0x7f4bcb41ce10>

In [65]:
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10, verbose=2)

Train on 8611 samples, validate on 2153 samples
Epoch 1/10
8611/8611 - 4s - loss: 0.3748 - acc: 0.8697 - val_loss: 0.3623 - val_acc: 0.8597
Epoch 2/10
8611/8611 - 4s - loss: 0.2317 - acc: 0.9102 - val_loss: 0.4355 - val_acc: 0.8579
Epoch 3/10
8611/8611 - 4s - loss: 0.0958 - acc: 0.9668 - val_loss: 0.5697 - val_acc: 0.8407
Epoch 4/10
8611/8611 - 4s - loss: 0.0335 - acc: 0.9908 - val_loss: 0.7677 - val_acc: 0.8175
Epoch 5/10
8611/8611 - 4s - loss: 0.0135 - acc: 0.9977 - val_loss: 0.9364 - val_acc: 0.8198
Epoch 6/10
8611/8611 - 4s - loss: 0.0079 - acc: 0.9981 - val_loss: 1.0341 - val_acc: 0.8216
Epoch 7/10
8611/8611 - 4s - loss: 0.0045 - acc: 0.9987 - val_loss: 1.1083 - val_acc: 0.8077
Epoch 8/10
8611/8611 - 4s - loss: 0.0025 - acc: 0.9994 - val_loss: 1.2763 - val_acc: 0.8384
Epoch 9/10
8611/8611 - 4s - loss: 0.0023 - acc: 0.9994 - val_loss: 1.2859 - val_acc: 0.8254
Epoch 10/10
8611/8611 - 4s - loss: 0.0023 - acc: 0.9994 - val_loss: 1.3204 - val_acc: 0.8142


<tensorflow.python.keras.callbacks.History at 0x7f4bcb41cbe0>

In [66]:
model2.fit(x2_train, y2_train, validation_data=(x2_test, y2_test), epochs=10, verbose=2)

Train on 8744 samples, validate on 2187 samples
Epoch 1/10
8744/8744 - 5s - loss: 0.3782 - acc: 0.8683 - val_loss: 0.3574 - val_acc: 0.8624
Epoch 2/10
8744/8744 - 4s - loss: 0.2333 - acc: 0.8994 - val_loss: 0.3731 - val_acc: 0.8706
Epoch 3/10
8744/8744 - 4s - loss: 0.1154 - acc: 0.9619 - val_loss: 0.5059 - val_acc: 0.8395
Epoch 4/10
8744/8744 - 4s - loss: 0.0496 - acc: 0.9849 - val_loss: 0.6587 - val_acc: 0.8249
Epoch 5/10
8744/8744 - 5s - loss: 0.0245 - acc: 0.9939 - val_loss: 0.9130 - val_acc: 0.8555
Epoch 6/10
8744/8744 - 5s - loss: 0.0156 - acc: 0.9962 - val_loss: 0.9827 - val_acc: 0.8436
Epoch 7/10
8744/8744 - 5s - loss: 0.0076 - acc: 0.9984 - val_loss: 1.0234 - val_acc: 0.8157
Epoch 8/10
8744/8744 - 5s - loss: 0.0060 - acc: 0.9987 - val_loss: 1.0630 - val_acc: 0.8066
Epoch 9/10
8744/8744 - 5s - loss: 0.0045 - acc: 0.9991 - val_loss: 1.1686 - val_acc: 0.8221
Epoch 10/10
8744/8744 - 5s - loss: 0.0041 - acc: 0.9991 - val_loss: 1.4132 - val_acc: 0.8560


<tensorflow.python.keras.callbacks.History at 0x7f4c02374ac8>

## Model accuracies

### Without POS and stemming

In [67]:
y_pred = model.predict_classes(x_test)

In [68]:
confusion_matrix(y_test, y_pred)

array([[1710,  205],
       [ 166,  107]])

In [69]:
accuracy_score(y_test, y_pred)

0.8304387568555759

In [72]:
f1_score(y_test, y_pred)

0.3658119658119658

In [73]:
precision_score(y_test, y_pred)

0.34294871794871795

In [75]:
recall_score(y_test, y_pred)

0.39194139194139194

### POS

In [76]:
y1_pred = model1.predict_classes(x1_test)

In [77]:
confusion_matrix(y1_test, y1_pred)

array([[1673,  178],
       [ 222,   80]])

In [78]:
accuracy_score(y1_test, y1_pred)

0.8142127264282397

In [79]:
f1_score(y1_test, y1_pred)

0.28571428571428575

In [80]:
precision_score(y1_test, y1_pred)

0.31007751937984496

In [81]:
recall_score(y1_test, y1_pred)

0.26490066225165565

###  Stemming and POS

In [82]:
y2_pred = model2.predict_classes(x2_test)

In [83]:
confusion_matrix(y2_test, y2_pred)

array([[1812,   74],
       [ 241,   60]])

In [84]:
accuracy_score(y2_test, y2_pred)

0.8559670781893004

In [85]:
f1_score(y2_test, y2_pred)

0.27586206896551724

In [86]:
precision_score(y2_test, y2_pred)

0.44776119402985076

In [87]:
recall_score(y2_test, y2_pred)

0.19933554817275748