# CNN with word embeddings

### Importing prerequisite libraries

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import pandas  as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

### Loading datasets and dropping nulls

In [2]:
data = pd.read_csv('./Data_processed/dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('./Data_processed/dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('./Data_processed/dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [4]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10938 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10938 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.4+ KB


In [5]:
data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [6]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [7]:
data1.dropna(inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10764 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10764 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 252.3+ KB


In [8]:
data1.head()

Unnamed: 0,Msg,Tag
0,TheDT thingNN disgustingVBG womanNN groidNN wo...,1
1,AmericansNNPS actingVBG likeIN knowNNS talkingVBG,0
2,AlsoRB intrestedVBN checkNN webpageNN infoJJ e...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homesNNS...,0
4,IPRP thinkVBP connectionNN homosexualityNN cou...,1


In [9]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [10]:
data2.dropna(inplace=True)
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10931 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.2+ KB


In [11]:
data2.head()

Unnamed: 0,Msg,Tag
0,theDT thingNN disgustJJ whiteJJ womanNN groidJ...,1
1,americanJJ actNN likeIN knowJJ talkNN,0
2,alsoRB intrestRB checkVB webpagNN infoJJ europ...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homeNN a...,0
4,IPRP thinkVBP connectJJ homosexuNN christianJJ...,1


In [12]:
data_x=data["Msg"]
data_y=data["Tag"]

data1_x=data1["Msg"]
data1_y=data1["Tag"]

data2_x=data2["Msg"]
data2_y=data2["Tag"]

In [13]:
vocab_size = 10000

In [14]:
data_x_n = data_x.to_numpy()
data_x_n

data1_x_n = data1_x.to_numpy()
data1_x_n

data2_x_n = data2_x.to_numpy()
data2_x_n

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

### Finding max sentence length

In [15]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [16]:
sen_len = max_sen_length(data_x_n)

sen_len1 = max_sen_length(data1_x_n)

sen_len2 = max_sen_length(data2_x_n)

### One hot encoding

In [17]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

onehot1_enc = [one_hot(sen, vocab_size) for sen in data1_x_n]

onehot2_enc = [one_hot(sen, vocab_size) for sen in data2_x_n]

### Padding seuqnces to the same length

In [18]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [19]:
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)

In [20]:
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)

In [21]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 9515
 4190 7414 6335 3372 9084 6335 3372 9250 6335 9008 2201]


In [22]:
print(embed_repr1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0 6353  154 1567  236 7837
  236 6267 8208  735]


In [23]:
print(embed_repr2[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 6353  154  512 6021  236 5709 6021  236 5186
 6021 8208  735]


### Cnn model

In [24]:
embedding_vector_features = 100

In [25]:
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=sen_len))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 151, 32)           320000    
_________________________________________________________________
conv1d (Conv1D)              (None, 151, 32)           3104      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 75, 32)            0         
_________________________________________________________________
flatten (Flatten)            (None, 2400)              0         
_________________________________________________________________
dense (Dense)                (None, 250)               600250    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 251       
Total params: 923,605
Trainable params: 923,605
Non-trainable params: 0
__________________________________________________

In [27]:
model1 = Sequential()
model1.add(Embedding(vocab_size, 32, input_length=sen_len1))
model1.add(Conv1D(32, 3, padding='same', activation='relu'))
model1.add(MaxPooling1D())
model1.add(Flatten())
model1.add(Dense(250, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [28]:
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 144, 32)           320000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 144, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 72, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 250)               576250    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 251       
Total params: 899,605
Trainable params: 899,605
Non-trainable params: 0
________________________________________________

In [29]:
model2 = Sequential()
model2.add(Embedding(vocab_size, 32, input_length=sen_len2))
model2.add(Conv1D(32, 3, padding='same', activation='relu'))
model2.add(MaxPooling1D())
model2.add(Flatten())
model2.add(Dense(250, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [30]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 143, 32)           320000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 143, 32)           3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 71, 32)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 2272)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 250)               568250    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 251       
Total params: 891,605
Trainable params: 891,605
Non-trainable params: 0
________________________________________________

### Train-test split

In [31]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(embed_repr1, data1_y, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(embed_repr2, data2_y, test_size=0.2, random_state=4)

### Model fitting

In [32]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10, verbose=2)

Epoch 1/10
274/274 - 3s - loss: 0.3767 - accuracy: 0.8661 - val_loss: 0.3487 - val_accuracy: 0.8739
Epoch 2/10
274/274 - 3s - loss: 0.2268 - accuracy: 0.9122 - val_loss: 0.3492 - val_accuracy: 0.8702
Epoch 3/10
274/274 - 3s - loss: 0.0982 - accuracy: 0.9670 - val_loss: 0.5388 - val_accuracy: 0.8688
Epoch 4/10
274/274 - 3s - loss: 0.0398 - accuracy: 0.9886 - val_loss: 0.6986 - val_accuracy: 0.8629
Epoch 5/10
274/274 - 3s - loss: 0.0191 - accuracy: 0.9952 - val_loss: 0.7595 - val_accuracy: 0.8346
Epoch 6/10
274/274 - 3s - loss: 0.0112 - accuracy: 0.9971 - val_loss: 0.9622 - val_accuracy: 0.8629
Epoch 7/10
274/274 - 3s - loss: 0.0055 - accuracy: 0.9987 - val_loss: 0.9789 - val_accuracy: 0.8441
Epoch 8/10
274/274 - 3s - loss: 0.0047 - accuracy: 0.9987 - val_loss: 1.0304 - val_accuracy: 0.8231
Epoch 9/10
274/274 - 3s - loss: 0.0042 - accuracy: 0.9987 - val_loss: 1.0705 - val_accuracy: 0.8446
Epoch 10/10
274/274 - 3s - loss: 0.0039 - accuracy: 0.9990 - val_loss: 1.1387 - val_accuracy: 0.7957

<tensorflow.python.keras.callbacks.History at 0x7f31d0055ca0>

In [33]:
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10, verbose=2)

Epoch 1/10
270/270 - 3s - loss: 0.3766 - accuracy: 0.8670 - val_loss: 0.3665 - val_accuracy: 0.8621
Epoch 2/10
270/270 - 3s - loss: 0.2382 - accuracy: 0.9072 - val_loss: 0.4255 - val_accuracy: 0.8556
Epoch 3/10
270/270 - 3s - loss: 0.0947 - accuracy: 0.9675 - val_loss: 0.6188 - val_accuracy: 0.8412
Epoch 4/10
270/270 - 3s - loss: 0.0288 - accuracy: 0.9902 - val_loss: 0.8267 - val_accuracy: 0.8295
Epoch 5/10
270/270 - 3s - loss: 0.0091 - accuracy: 0.9979 - val_loss: 0.9882 - val_accuracy: 0.8281
Epoch 6/10
270/270 - 3s - loss: 0.0047 - accuracy: 0.9986 - val_loss: 1.0725 - val_accuracy: 0.8193
Epoch 7/10
270/270 - 3s - loss: 0.0035 - accuracy: 0.9990 - val_loss: 1.1383 - val_accuracy: 0.8007
Epoch 8/10
270/270 - 3s - loss: 0.0030 - accuracy: 0.9993 - val_loss: 1.2465 - val_accuracy: 0.8105
Epoch 9/10
270/270 - 3s - loss: 0.0024 - accuracy: 0.9994 - val_loss: 1.4315 - val_accuracy: 0.8365
Epoch 10/10
270/270 - 3s - loss: 0.0028 - accuracy: 0.9993 - val_loss: 1.4082 - val_accuracy: 0.8281

<tensorflow.python.keras.callbacks.History at 0x7f31a82014c0>

In [34]:
model2.fit(x2_train, y2_train, validation_data=(x2_test, y2_test), epochs=10, verbose=2)

Epoch 1/10
274/274 - 3s - loss: 0.3722 - accuracy: 0.8683 - val_loss: 0.3580 - val_accuracy: 0.8624
Epoch 2/10
274/274 - 3s - loss: 0.2333 - accuracy: 0.9018 - val_loss: 0.3842 - val_accuracy: 0.8573
Epoch 3/10
274/274 - 3s - loss: 0.0976 - accuracy: 0.9669 - val_loss: 0.5479 - val_accuracy: 0.8560
Epoch 4/10
274/274 - 3s - loss: 0.0332 - accuracy: 0.9898 - val_loss: 0.7041 - val_accuracy: 0.8555
Epoch 5/10
274/274 - 3s - loss: 0.0142 - accuracy: 0.9968 - val_loss: 0.8787 - val_accuracy: 0.8583
Epoch 6/10
274/274 - 3s - loss: 0.0076 - accuracy: 0.9981 - val_loss: 0.9418 - val_accuracy: 0.8519
Epoch 7/10
274/274 - 3s - loss: 0.0049 - accuracy: 0.9985 - val_loss: 1.0184 - val_accuracy: 0.8523
Epoch 8/10
274/274 - 3s - loss: 0.0027 - accuracy: 0.9991 - val_loss: 1.0137 - val_accuracy: 0.8363
Epoch 9/10
274/274 - 3s - loss: 0.0031 - accuracy: 0.9993 - val_loss: 1.0901 - val_accuracy: 0.8427
Epoch 10/10
274/274 - 3s - loss: 0.0034 - accuracy: 0.9989 - val_loss: 1.0724 - val_accuracy: 0.8368

<tensorflow.python.keras.callbacks.History at 0x7f31a80b74c0>

## Model accuracies

### Without POS and stemming

In [35]:
y_pred = model.predict_classes(x_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [36]:
confusion_matrix(y_test, y_pred)

array([[1626,  289],
       [ 158,  115]])

In [37]:
accuracy_score(y_test, y_pred)

0.7957038391224863

### POS

In [38]:
y1_pred = model1.predict_classes(x1_test)

In [39]:
confusion_matrix(y1_test, y1_pred)

array([[1714,  137],
       [ 233,   69]])

In [40]:
accuracy_score(y1_test, y1_pred)

0.8281467719461217

###  Stemming and POS

In [41]:
y2_pred = model2.predict_classes(x2_test)

In [42]:
confusion_matrix(y2_test, y2_pred)

array([[1735,  151],
       [ 206,   95]])

In [43]:
accuracy_score(y2_test, y2_pred)

0.8367626886145405