# LSTM with word embedding model

In [125]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential

In [54]:
# Fetch the NLTK stopword corpus that process_data reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/rhino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Reading datasets

In [6]:
# Load the three dataset variants (base, POS-tagged, stemmed); every file is
# read with the same comma separator and the same two column names.
data, data1, data2 = [
    pd.read_csv(csv_path, sep=',', names=['Msg', 'Tag'])
    for csv_path in (
        'Data_processed/dataset.csv',
        'Data_processed/dataset_POS.csv',
        'Data_processed/dataset_stemmed.csv',
    )
]

data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [38]:
# Inspect dtypes and non-null counts of the base dataset to spot missing messages.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [41]:
# Keep only complete rows of the base dataset, then confirm the non-null counts.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10938 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10938 non-null  object
 1   Tag     10938 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.4+ KB


In [7]:
# Preview the first rows of the POS-tagged dataset (dataset_POS.csv).
data1.head()

Unnamed: 0,Msg,Tag
0,TheDT thingNN disgustingVBG womanNN groidNN wo...,1
1,AmericansNNPS actingVBG likeIN knowNNS talkingVBG,0
2,AlsoRB intrestedVBN checkNN webpageNN infoJJ e...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homesNNS...,0
4,IPRP thinkVBP connectionNN homosexualityNN cou...,1


In [42]:
# Keep only complete rows of the POS-tagged dataset, then confirm the non-null counts.
data1 = data1.dropna()
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10764 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10764 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 252.3+ KB


In [8]:
# Preview the first rows of the stemmed dataset (dataset_stemmed.csv).
data2.head()

Unnamed: 0,Msg,Tag
0,theDT thingNN disgustJJ whiteJJ womanNN groidJ...,1
1,americanJJ actNN likeIN knowJJ talkNN,0
2,alsoRB intrestRB checkVB webpagNN infoJJ europ...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homeNN a...,0
4,IPRP thinkVBP connectJJ homosexuNN christianJJ...,1


In [43]:
# Keep only complete rows of the stemmed dataset, then confirm the non-null counts.
data2 = data2.dropna()
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10931 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.2+ KB


In [115]:
# Separate each frame into its message text (inputs) and its tag (labels).
data_x, data_y = data["Msg"], data["Tag"]
data1_x, data1_y = data1["Msg"], data1["Tag"]
data2_x, data2_y = data2["Msg"], data2["Tag"]

In [136]:
# Size of the index space: passed to one_hot when encoding words and used as
# the first (input dimension) argument of every Embedding layer.
vocab_size = 10000

In [137]:
# Convert the base dataset's message column to a NumPy array (the input that
# process_data iterates over) and display it.
data_x_n = data_x.to_numpy()
data_x_n

array(['The thing disgusting White woman groid White woman drags White child filth ',
       'Americans acting like know talking ',
       'Also intrested check webpage info european american town building ',
       ..., 'No truth 88WHITE POWERWHITE VICTORYWHITE PRIDE88',
       '4 cyclinder motorcycle Historic vehicle Laurin Klement T B 18991903 YouTube Historic vehicle Torpedo V4 1909 YouTube Historic vehicle Torpedo V4 1909 httpthekneeslidercomimages2012rightsidejpg Handlebar camera mount ride video ',
       'Hi I thought I leave note wish Southern Gentlemen Ladies happy Robert Edward Lee day '],
      dtype=object)

In [150]:
# Convert the POS-tagged message column to a NumPy array and display it.
data1_x_n = data1_x.to_numpy()
data1_x_n

array(['TheDT thingNN disgustingVBG womanNN groidNN womanNN dragsVBZ childNN filthNN',
       'AmericansNNPS actingVBG likeIN knowNNS talkingVBG',
       'AlsoRB intrestedVBN checkNN webpageNN infoJJ europeanJJ americanJJ townNN buildingNN',
       ..., 'NoDT truthNN 88WHITECD',
       '4CD cyclinderNN motorcycleNN vehicleNN 18991903CD vehicleNN 1909CD vehicleNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leaveVBP noteJJ wishJJ happyJJ dayNN'],
      dtype=object)

In [151]:
# Convert the stemmed message column to a NumPy array and display it.
data2_x_n = data2_x.to_numpy()
data2_x_n

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

# Preprocessing the data and converting it into a one-hot (integer word index) representation

In [118]:
def process_data(data):
    """Clean, stop-word-filter and stem every sentence in ``data``.

    For each sentence, every character that is not an ASCII letter is replaced
    by a space, the text is lower-cased and split on whitespace, English stop
    words are removed, and the remaining words are Porter-stemmed and joined
    back together with single spaces.

    Parameters
    ----------
    data : iterable of str
        Raw sentences to process.

    Returns
    -------
    processed_data : list of str
        One cleaned sentence per input sentence, in input order.
    mx_len : int
        Largest number of stemmed words found in any single sentence
        (0 when ``data`` is empty).
    """
    ps = PorterStemmer()
    # Build the stop-word collection once, outside the loops: evaluating
    # stopwords.words('english') inside the comprehension repeats the lookup
    # for every token, and a set gives constant-time membership tests.
    stop_words = frozenset(stopwords.words('english'))
    non_letters = re.compile('[^a-zA-Z]')

    processed_data = []
    mx_len = 0
    for sen in data:
        words = non_letters.sub(' ', sen).lower().split()
        stem_words = [ps.stem(word) for word in words if word not in stop_words]
        # Track the longest sentence; callers use it as the padding length.
        mx_len = max(mx_len, len(stem_words))
        processed_data.append(' '.join(stem_words))
    return processed_data, mx_len

In [138]:
# Clean the base dataset; sen_len is the longest cleaned sentence in words and
# is reused as the padding length and the Embedding input_length.
processed_data, sen_len = process_data(data_x_n)

In [157]:
# Clean the POS-tagged dataset; sen_len1 is its longest cleaned sentence in words.
processed_data1, sen_len1 = process_data(data1_x_n)

In [158]:
# Clean the stemmed dataset; sen_len2 is its longest cleaned sentence in words.
processed_data2, sen_len2 = process_data(data2_x_n)

In [139]:
# Encode each cleaned sentence as a list of integer word indices.
# NOTE(review): Keras documents one_hot as hash-based, so distinct words can
# share an index and indices may differ between interpreter sessions — confirm
# whether a fitted Tokenizer is needed for reproducible encodings.
onehot_enc = [one_hot(sen, vocab_size) for sen in processed_data]

In [159]:
# Encode each cleaned POS-tagged sentence as a list of integer word indices.
# NOTE(review): one_hot is hash-based per the Keras docs; collisions and
# session-dependent indices are possible — confirm if reproducibility matters.
onehot1_enc = [one_hot(sen, vocab_size) for sen in processed_data1]

In [160]:
# Encode each cleaned stemmed sentence as a list of integer word indices.
# NOTE(review): one_hot is hash-based per the Keras docs; collisions and
# session-dependent indices are possible — confirm if reproducibility matters.
onehot2_enc = [one_hot(sen, vocab_size) for sen in processed_data2]

In [59]:
# NOTE(review): this prints every encoded sentence of the base dataset;
# consider a slice such as onehot_enc[:5] to keep the saved output readable.
print(onehot_enc)

[[3091, 4658, 1571, 3367, 4504, 1571, 3367, 2658, 1571, 1528, 2560], [845, 2380, 2662, 4927, 2421], [333, 4690, 1828, 87, 4846, 550, 845, 3365, 2789], [3979, 2254, 3262, 2998, 1488, 326, 1134, 819, 4325], [3979, 3363, 2174, 3573, 1134, 991, 521, 2118, 1158, 3573, 1282], [2003, 1449, 1803, 1290, 3822, 4861], [4941, 3123, 1276, 4002, 1539, 679, 2997, 2314, 1276, 4674, 3377, 2888, 3346, 1708, 4833, 1542, 2997, 2497, 4133, 3160, 912, 4277, 656, 3014, 4134, 939, 1727], [2179, 2554, 4669, 250, 437, 242, 200, 410], [2374, 2658, 3046, 200, 4920], [3954, 1005, 1556, 3449, 2093, 3773, 4469, 3333, 2198, 656, 3727, 4043, 4445], [4116, 4547, 1475, 4440], [2177], [11, 4759, 1285, 1285, 3057, 1623, 4353, 2974, 3341, 3632, 2018, 3632, 3153, 2409, 487, 733, 2710, 2448, 3029, 307, 2115], [3064, 3042, 2591, 755, 849, 4834, 753, 3262, 2309, 2241, 88, 3628, 2031, 4051, 3628, 2031, 2084, 3628, 2031], [4469, 4840, 3714, 4096, 250, 4548, 2730, 4726], [2604, 4367, 160, 3161, 1591, 2211], [46, 3318, 1396, 2784,

In [186]:
# NOTE(review): this prints every encoded sentence of the POS-tagged dataset;
# consider a slice such as onehot1_enc[:5] to keep the saved output readable.
print(onehot1_enc)

[[546, 9981, 6657, 4775, 3536, 4775, 7337, 620, 6279], [569, 5037, 6982, 9443, 26], [7904, 3177, 1837, 8845, 9583, 6001, 3371, 7595, 1162], [8009, 1356, 8345, 8659, 196, 7185, 6511, 3623, 881, 5878], [8009, 1356, 7973, 3317, 3623, 46, 5989, 2098, 32, 9079, 3836], [2668, 3224, 1512, 9314, 1379, 6824, 501], [8009, 8031, 5483, 4653, 8009, 3891, 9748, 6535, 3148, 4653, 8009, 5581, 3473, 5572, 5100, 8439, 2419, 9678, 3148, 5485, 6923, 2235, 4820, 3027, 819, 8337, 1009, 4781, 2651], [4883, 735, 9313, 1426, 4686, 1359], [8009, 5620, 8179, 8364, 3299, 8009, 1260], [319, 7697, 9967, 3972, 7299, 1561, 1848, 6779, 1840, 8889, 9865, 5045], [9438, 9103, 4023, 4614, 5030], [1050], [3548, 1907, 2178, 6360, 7637, 3871, 5430, 4496, 5697, 4721, 268, 1789], [1869, 8009, 2914, 8428, 1486, 5635, 1318, 849, 1270, 4004, 1997, 2414, 8033, 1997, 2414, 8033, 1997, 8263, 8033], [1282, 9805, 8204, 4407, 6863, 4686, 1207, 6555], [546, 6477, 9840, 3482, 1274, 8630], [1504, 7500, 3565, 1397, 7690, 9718, 2997], [8009

In [187]:
# NOTE(review): this prints every encoded sentence of the stemmed dataset;
# consider a slice such as onehot2_enc[:5] to keep the saved output readable.
print(onehot2_enc)

[[546, 9981, 9765, 3313, 4775, 7115, 3313, 4775, 8440, 3313, 620, 6279], [3371, 224, 6982, 183, 466], [7904, 2693, 6978, 7012, 9583, 6001, 3371, 7595, 2511], [8009, 1356, 8345, 8659, 196, 4811, 6511, 2035, 881, 5878], [8009, 1356, 7566, 2302, 9079, 2035, 4736, 5205, 9830, 4546, 9079, 1127], [2668, 9434, 7507, 2570, 9966, 1379, 6824, 501], [8009, 8031, 5483, 4653, 8009, 3891, 9748, 6535, 3148, 8759, 4653, 8009, 1026, 613, 5572, 5100, 8439, 1392, 4456, 6261, 5485, 6923, 2235, 4820, 3027, 1840, 9460, 1009, 4781, 2651], [4883, 735, 2782, 2261, 4686, 1359, 1051, 6152, 1035], [8009, 5620, 7908, 8364, 6152, 8009, 1260], [3726, 3998, 3709, 678, 8342, 5556, 1848, 3737, 7917, 9081, 1840, 9341, 7717, 5045], [9438, 3145, 4023, 4614, 643], [1050], [4614, 3548, 63, 63, 1907, 6562, 2855, 1774, 2448, 3571, 6360, 3571, 9048, 9281, 3871, 5430, 4859, 4884, 4721, 268, 1789], [1869, 8009, 2250, 8428, 1486, 5635, 2418, 1152, 1270, 5241, 4004, 1997, 1382, 2414, 8033, 1997, 1881, 2414, 8033, 1997, 7364, 8263,

# Embedding Representation

In [140]:
# Left-pad every encoded sentence of the base dataset to sen_len entries,
# the length of the longest cleaned sentence.
embed_repr = pad_sequences(
    onehot_enc,
    maxlen=sen_len,
    padding='pre',
)

In [162]:
# Left-pad every encoded sentence of the POS-tagged dataset to sen_len1 entries.
embed_repr1 = pad_sequences(
    onehot1_enc,
    maxlen=sen_len1,
    padding='pre',
)

In [163]:
# Left-pad every encoded sentence of the stemmed dataset to sen_len2 entries.
embed_repr2 = pad_sequences(
    onehot2_enc,
    maxlen=sen_len2,
    padding='pre',
)

In [141]:
# Show the first padded sequence of the base dataset: leading zeros from
# padding='pre', followed by the sentence's word indices.
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0 3372 1543 4213 8976 9731 4213 8976 5513
 4213 3539 4955]


In [164]:
# Show the first padded sequence of the POS-tagged dataset.
print(embed_repr1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0  546 9981 6657 4775 3536
 4775 7337  620 6279]


In [165]:
# Show the first padded sequence of the stemmed dataset.
print(embed_repr2[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0  546 9981 9765 3313 4775 7115 3313 4775 8440
 3313  620 6279]


# LSTM Model Creation

In [142]:
# Dimensionality of each learned word vector; passed as the second (output
# dimension) argument of every Embedding layer.
embedding_vector_features = 100

In [143]:
# Base-dataset classifier: Embedding -> LSTM(100) -> one sigmoid unit,
# compiled for binary cross-entropy with Adam and accuracy tracking.
model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=sen_len),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model.summary())

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 143, 100)          1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [166]:
# POS-tagged-dataset classifier: Embedding -> LSTM(100) -> one sigmoid unit,
# compiled for binary cross-entropy with Adam and accuracy tracking.
model1 = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=sen_len1),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model1.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 144, 100)          1000000   
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [167]:
# Stemmed-dataset classifier: Embedding -> LSTM(100) -> one sigmoid unit,
# compiled for binary cross-entropy with Adam and accuracy tracking.
model2 = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=sen_len2),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print(model2.summary())

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 143, 100)          1000000   
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


# Train Test Split

In [168]:
# Hold out 20% of each dataset for evaluation; the fixed random_state keeps
# the three splits repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    embed_repr, data_y, random_state=4, test_size=0.2
)
x1_train, x1_test, y1_train, y1_test = train_test_split(
    embed_repr1, data1_y, random_state=4, test_size=0.2
)
x2_train, x2_test, y2_train, y2_test = train_test_split(
    embed_repr2, data2_y, random_state=4, test_size=0.2
)

# Model Training

In [146]:
# Train the base-dataset model for 10 epochs with batches of 64.
# NOTE(review): the validation data is the same split later used for the
# final accuracy figures.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
)

Train on 8750 samples, validate on 2188 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc446df3e10>

In [169]:
# Train the POS-tagged-dataset model for 10 epochs with batches of 64.
# NOTE(review): the validation data is the same split later used for the
# final accuracy figures.
model1.fit(
    x1_train,
    y1_train,
    batch_size=64,
    epochs=10,
    validation_data=(x1_test, y1_test),
)

Train on 8611 samples, validate on 2153 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc4452f4198>

In [170]:
# Train the stemmed-dataset model for 10 epochs with batches of 64.
# NOTE(review): the validation data is the same split later used for the
# final accuracy figures.
model2.fit(
    x2_train,
    y2_train,
    batch_size=64,
    epochs=10,
    validation_data=(x2_test, y2_test),
)

Train on 8744 samples, validate on 2187 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc444982240>

# Prediction and model accuracy (base dataset)

In [173]:
# Sequential.predict_classes is deprecated and absent from newer tf.keras
# releases; for a single sigmoid output the documented equivalent is to
# threshold the predicted probability at 0.5 and cast to int32, which yields
# the same 0/1 label array.
y_pred = (model.predict(x_test) > 0.5).astype("int32")

In [174]:
# Confusion matrix of the held-out base-dataset labels against the model's predictions.
confusion_matrix(y_test, y_pred)

array([[1792,  123],
       [ 216,   57]])

In [175]:
# Fraction of held-out base-dataset samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.8450639853747715

# Prediction and model accuracy (POS-tagged dataset)

In [180]:
# Sequential.predict_classes is deprecated and absent from newer tf.keras
# releases; for a single sigmoid output the documented equivalent is to
# threshold the predicted probability at 0.5 and cast to int32, which yields
# the same 0/1 label array.
y1_pred = (model1.predict(x1_test) > 0.5).astype("int32")

In [181]:
# Confusion matrix of the held-out POS-tagged-dataset labels against model1's predictions.
confusion_matrix(y1_test, y1_pred)

array([[1669,  182],
       [ 224,   78]])

In [182]:
# Fraction of held-out POS-tagged-dataset samples that model1 labels correctly.
accuracy_score(y1_test, y1_pred)

0.8114259173246633

# Prediction and model accuracy (stemmed dataset)

In [183]:
# Sequential.predict_classes is deprecated and absent from newer tf.keras
# releases; for a single sigmoid output the documented equivalent is to
# threshold the predicted probability at 0.5 and cast to int32, which yields
# the same 0/1 label array.
y2_pred = (model2.predict(x2_test) > 0.5).astype("int32")

In [184]:
# Confusion matrix of the held-out stemmed-dataset labels against model2's predictions.
confusion_matrix(y2_test, y2_pred)

array([[1714,  172],
       [ 203,   98]])

In [185]:
# Fraction of held-out stemmed-dataset samples that model2 labels correctly.
accuracy_score(y2_test, y2_pred)

0.8285322359396433