# LSTM with Word Embedding

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential

# Reading datasets

In [6]:
data  = pd.read_csv('Data_processed/dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('Data_processed/dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('Data_processed/dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [8]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10938 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10938 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.4+ KB


In [9]:
data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [10]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [11]:
data1.dropna(inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10764 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10764 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 252.3+ KB


In [12]:
data1.head()

Unnamed: 0,Msg,Tag
0,TheDT thingNN disgustingVBG womanNN groidNN wo...,1
1,AmericansNNPS actingVBG likeIN knowNNS talkingVBG,0
2,AlsoRB intrestedVBN checkNN webpageNN infoJJ e...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homesNNS...,0
4,IPRP thinkVBP connectionNN homosexualityNN cou...,1


In [13]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [14]:
data2.dropna(inplace=True)
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10931 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.2+ KB


In [15]:
data2.head()

Unnamed: 0,Msg,Tag
0,theDT thingNN disgustJJ whiteJJ womanNN groidJ...,1
1,americanJJ actNN likeIN knowJJ talkNN,0
2,alsoRB intrestRB checkVB webpagNN infoJJ europ...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homeNN a...,0
4,IPRP thinkVBP connectJJ homosexuNN christianJJ...,1


In [16]:
data_x=data["Msg"]
data_y=data["Tag"]

data1_x=data1["Msg"]
data1_y=data1["Tag"]

data2_x=data2["Msg"]
data2_y=data2["Tag"]

In [17]:
vocab_size = 10000

In [18]:
data_x_n = data_x.to_numpy()
data_x_n

array(['The thing disgusting White woman groid White woman drags White child filth ',
       'Americans acting like know talking ',
       'Also intrested check webpage info european american town building ',
       ..., 'No truth 88WHITE POWERWHITE VICTORYWHITE PRIDE88',
       '4 cyclinder motorcycle Historic vehicle Laurin Klement T B 18991903 YouTube Historic vehicle Torpedo V4 1909 YouTube Historic vehicle Torpedo V4 1909 httpthekneeslidercomimages2012rightsidejpg Handlebar camera mount ride video ',
       'Hi I thought I leave note wish Southern Gentlemen Ladies happy Robert Edward Lee day '],
      dtype=object)

In [26]:
data1_x_n = data1_x.to_numpy()
data1_x_n

array(['TheDT thingNN disgustingVBG womanNN groidNN womanNN dragsVBZ childNN filthNN',
       'AmericansNNPS actingVBG likeIN knowNNS talkingVBG',
       'AlsoRB intrestedVBN checkNN webpageNN infoJJ europeanJJ americanJJ townNN buildingNN',
       ..., 'NoDT truthNN 88WHITECD',
       '4CD cyclinderNN motorcycleNN vehicleNN 18991903CD vehicleNN 1909CD vehicleNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leaveVBP noteJJ wishJJ happyJJ dayNN'],
      dtype=object)

In [25]:
data2_x_n = data2_x.to_numpy()
data2_x_n

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

# Maximum Sentence Length

In [48]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [49]:
sen_len = max_sen_length(data_x_n)

In [50]:
sen_len1 = max_sen_length(data1_x_n)

In [51]:
sen_len2 = max_sen_length(data2_x_n)

# One Hot Representation

In [52]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

In [53]:
onehot1_enc = [one_hot(sen, vocab_size) for sen in data1_x_n]

In [54]:
onehot2_enc = [one_hot(sen, vocab_size) for sen in data2_x_n]

In [56]:
print(onehot1_enc)

[[8394, 3607, 988, 2817, 9734, 2817, 5899, 6402, 5190], [7073, 6248, 5558, 3935, 433], [5840, 8576, 6052, 6649, 8685, 9506, 4557, 4913, 6849], [3186, 7159, 8860, 6136, 8525, 4969, 4246, 5311, 8519, 1822], [3186, 7159, 4094, 2460, 5311, 9797, 1061, 6053, 9887, 9044, 6619], [2269, 1530, 3266, 3491, 8299], [3186, 6781, 8641, 9352, 3186, 4887, 7651, 3150, 3135, 9352, 3186, 1437, 7560, 5487, 8215, 3482, 5531, 5762, 3135, 3757, 3252, 1911, 6500, 9201, 3993, 6649, 9511, 4523, 6090], [1783, 8784, 4519, 1898, 5797, 5071], [3186, 4173, 9717, 4211, 9373, 3186, 5666], [2743, 3136, 9084, 8474, 5707, 7188, 9371, 5588, 2100, 3689, 1982, 8561], [4296, 832, 2640, 9605, 6832], [3655], [3568, 3552, 4203, 3125, 4592, 3702, 9593, 9646, 7772, 8481, 6906, 545], [3271, 3186, 591, 4366, 8676, 3004, 8562, 5221, 9160, 5675, 3661, 6911, 7600, 6780, 6911, 7600, 9541, 2315, 7600], [8624, 5100, 8328, 6940, 653, 5797, 4481, 1158], [8394, 5066, 4144, 4910, 5290, 8350], [7756, 8674, 6779, 6276, 2610, 7498, 501], [3186,

In [57]:
print(onehot2_enc)

[[8394, 3607, 4586, 9074, 2817, 872, 9074, 2817, 7880, 9074, 6402, 5190], [4557, 7583, 5558, 5842, 2115], [5840, 9517, 5763, 4730, 8685, 9506, 4557, 4913, 7288], [3186, 7159, 8860, 6136, 8525, 5085, 4246, 2870, 8519, 1822], [3186, 7159, 4624, 85, 9044, 2870, 6961, 20, 2355, 9971, 9044, 9561], [2269, 3886, 7528, 1987, 3252, 8299], [3186, 6781, 8641, 9352, 3186, 4887, 7651, 3150, 3135, 6182, 9352, 3186, 5433, 6473, 5487, 8215, 3482, 6321, 6242, 3523, 3757, 3252, 1911, 6500, 9201, 2100, 5500, 9511, 4523, 6090], [1783, 8784, 8529, 9048, 5797, 5071, 8959, 2035, 644], [3186, 4173, 417, 4211, 2035, 3186, 5666], [5436, 5206, 4419, 4593, 2375, 5457, 9371, 2732, 8404, 2766, 2100, 1110, 9688, 8561], [4296, 4456, 2640, 9605, 5436], [3655], [7620, 3568, 4542, 4542, 3552, 7123, 9893, 7083, 2431, 9872, 3125, 9872, 8632, 3246, 3702, 9593, 4128, 2811, 8481, 6906, 545], [3271, 3186, 6560, 4366, 8676, 3004, 8945, 7293, 9160, 8489, 5675, 3661, 9687, 6911, 7600, 6780, 1141, 6911, 7600, 9541, 7600, 2315, 76

# Embedding Representation

In [58]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [59]:
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)

In [60]:
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)

In [61]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 7817
 8679 4890 8613 4711 8244 8613 4711 3163 8613 3018 4066]


In [62]:
print(embed_repr1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0 8394 3607  988 2817 9734
 2817 5899 6402 5190]


In [64]:
print(embed_repr2[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 8394 3607 4586 9074 2817  872 9074 2817 7880
 9074 6402 5190]


#  LSTM Model Creation

In [65]:
embedding_vector_features = 100

In [67]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 151, 100)          1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len1))
model1.add(LSTM(100))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model1.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 144, 100)          1000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [69]:
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len2))
model2.add(LSTM(100))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model2.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 143, 100)          1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


# Train Test Split

In [70]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(embed_repr1, data1_y, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(embed_repr2, data2_y, test_size=0.2, random_state=4)

# Model Training

In [71]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10,batch_size=64)

Train on 8750 samples, validate on 2188 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff894be0048>

In [72]:
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10,batch_size=64)

Train on 8611 samples, validate on 2153 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff86c5b6780>

In [73]:
model2.fit(x2_train,y2_train, validation_data=(x2_test,y2_test),epochs=10,batch_size=64)

Train on 8744 samples, validate on 2187 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff84bc63710>

# Prediction and Model Accuracy (Without POS and Stemming)

In [74]:
y_pred = model.predict_classes(x_test)

In [75]:
confusion_matrix(y_test, y_pred)

array([[1749,  166],
       [ 187,   86]])

In [76]:
accuracy_score(y_test, y_pred)

0.8386654478976234

# Prediction and Model Accuracy (POS)

In [77]:
y1_pred = model1.predict_classes(x1_test)

In [78]:
confusion_matrix(y1_test, y1_pred)

array([[1674,  177],
       [ 224,   78]])

In [79]:
accuracy_score(y1_test, y1_pred)

0.8137482582443103

# Prediction and Model Accuracy (Stemming + POS)

In [80]:
y2_pred = model2.predict_classes(x2_test)

In [81]:
confusion_matrix(y2_test, y2_pred)

array([[1772,  114],
       [ 224,   77]])

In [82]:
accuracy_score(y2_test, y2_pred)

0.8454503886602652