# LSTM with Word Embedding

In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential

# Oversampling 

## Reading datasets

In [39]:
data  = pd.read_csv('Data_over_sampled/dataset.csv',sep=',',names=['Msg','Tag'], header=None)
data1 = pd.read_csv('Data_over_sampled/dataset_POS.csv',sep=',',names=['Msg','Tag'], header=None)
data2 = pd.read_csv('Data_over_sampled/dataset_stemmed.csv',sep=',',names=['Msg','Tag'], header=None)

In [40]:
data

Unnamed: 0,Msg,Tag
0,The New Jersey classroom half nonwhite,1
1,sense make hispanics securing border think loy...,0
2,DuetschGirlTX No I live tiny ass town one 100 ...,1
3,Every cultured society things therein created ...,1
4,Well misery I hope help 1 100 cm 1000 mm 1 254...,0
...,...,...
19003,The site transcript offers major Hitler speech...,0
19004,The BLMtards crashed LBBQWTF parade last month...,1
19005,Blacks woman pulls gun bus another woman talki...,1
19006,httpifamericakneworg,0


In [41]:
data_x=data["Msg"]
data_y=data["Tag"]

data1_x=data1["Msg"]
data1_y=data1["Tag"]

data2_x=data2["Msg"]
data2_y=data2["Tag"]

In [42]:
vocab_size = 10000

In [43]:
data_x_n = data_x.to_numpy()
data_x_n

array(['The New Jersey classroom half nonwhite ',
       'sense make hispanics securing border think loyalties native americans let support motivated wn good record becoming border patrol agents ',
       'DuetschGirlTX No I live tiny ass town one 100 white folks maybe less near San Antonio many dirts around stick together comes White Girl town gotta back ALWAYS Is ',
       ...,
       'Blacks woman pulls gun bus another woman talking loud phone black man difuses situation ',
       'httpifamericakneworg',
       'first aint weirdo joking second inbox may empty check sent message list'],
      dtype=object)

In [44]:
data1_x_n = data1_x.to_numpy()
data1_x_n

array(['RegardsNNS', 'YouTubeNN',
       'downloadNN colorfullyRB illustratedVBD 132CD pageNN ebookNN destructionNN civilizationNN',
       ..., 'IsVBZ hardJJ comprehendNN planNN UtopianJJ',
       'NothingNN happenedVBD exceptIN IPRP gotVBD 88CD paperNN',
       'HePRP tellingVBG truthRB usualJJ filthyJJ jewsNNS needVBP clampNNS soonRB possibleJJ'],
      dtype=object)

In [45]:
data2_x_n = data2_x.to_numpy()
data2_x_n

array(['theyPRP needVBP giveJJ mandatoriNN dnaNN testNN kickNN francNN',
       'IPRP thinkVBP asianJJ hotJJ arentNN girlNN IPRP wannaVBP getVB withaJJ wolfNN prettiNNS wannaVBP petJJ oneCD whiteJJ girlNN oneCD trustNN',
       'WePRP handwrittenVBP accountJJ battlNN shilohVBD 4thCD tennesseNN',
       ...,
       'greitNN duNN erNN fornøydNN medVBD dinJJ nasjonalitetJJ detNN erNN ikkNN dètNN detNN erNN snakkNN omNN såNN slappNN avNN kameratNN',
       'theirPRP$ lieNN thickNN maniNN peoplNN ntJJ seeVBP truthJJ frontNN',
       'whatWP goodJJ scammerNN countriNN brokeVBD lawNN comeNN'],
      dtype=object)

# Maximum Sentence Length

In [46]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [47]:
sen_len = max_sen_length(data_x_n)

In [48]:
sen_len1 = max_sen_length(data1_x_n)

In [49]:
sen_len2 = max_sen_length(data2_x_n)

# One Hot Representation

In [50]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

In [51]:
onehot1_enc = [one_hot(sen, vocab_size) for sen in data1_x_n]

In [52]:
onehot2_enc = [one_hot(sen, vocab_size) for sen in data2_x_n]

# Embedding Representation

In [53]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [54]:
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)

In [55]:
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)

In [56]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 9586 6072 4463 6375 9027 1979]


In [57]:
print(embed_repr1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 2283]


In [58]:
print(embed_repr2[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0 7869 3861 9928 6979 3366
 7748 7876 2008]


#  LSTM Model Creation

In [59]:
embedding_vector_features = 100

In [60]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 151, 100)          1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [61]:
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len1))
model1.add(LSTM(100))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model1.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 144, 100)          1000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [62]:
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len2))
model2.add(LSTM(100))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model2.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 143, 100)          1000000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


# Train Test Split

In [63]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(embed_repr1, data1_y, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(embed_repr2, data2_y, test_size=0.2, random_state=4)

# Model Training

In [64]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10,batch_size=64)

Train on 15206 samples, validate on 3802 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb12a1a67b8>

In [65]:
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10,batch_size=64)

Train on 14940 samples, validate on 3736 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb129837a20>

In [66]:
model2.fit(x2_train,y2_train, validation_data=(x2_test,y2_test),epochs=10,batch_size=64)

Train on 15195 samples, validate on 3799 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb128f2a668>

# Prediction and Model Accuracy (Without POS and Stemming)

In [67]:
y_pred = model.predict_classes(x_test)

In [68]:
confusion_matrix(y_test, y_pred)

array([[1729,  148],
       [   8, 1917]])

In [69]:
accuracy_score(y_test, y_pred)

0.9589689637033141

In [70]:
f1_score(y_test, y_pred)

0.9609022556390978

In [71]:
precision_score(y_test, y_pred)

0.9283292978208233

In [72]:
recall_score(y_test, y_pred)

0.9958441558441559

# Prediction and Model Accuracy (POS)

In [73]:
y1_pred = model1.predict_classes(x1_test)

In [74]:
confusion_matrix(y1_test, y1_pred)

array([[1730,  122],
       [   0, 1884]])

In [75]:
accuracy_score(y1_test, y1_pred)

0.9673447537473233

In [76]:
f1_score(y1_test, y1_pred)

0.9686375321336761

In [77]:
precision_score(y1_test, y1_pred)

0.9391824526420738

In [78]:
recall_score(y1_test, y1_pred)

1.0

# Prediction and Model Accuracy (Stemming + POS)

In [79]:
y2_pred = model2.predict_classes(x2_test)

In [80]:
confusion_matrix(y2_test, y2_pred)

array([[1635,  246],
       [  12, 1906]])

In [81]:
accuracy_score(y2_test, y2_pred)

0.9320873914187944

In [82]:
f1_score(y2_test, y2_pred)

0.9366093366093367

In [83]:
precision_score(y2_test, y2_pred)

0.8856877323420075

In [84]:
recall_score(y2_test, y2_pred)

0.9958441558441559

# Undersampling

In [86]:
data  = pd.read_csv('Data_under_sampled/dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('Data_under_sampled/dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('Data_under_sampled/dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

In [87]:
data_x=data["Msg"]
data_y=data["Tag"]

data1_x=data1["Msg"]
data1_y=data1["Tag"]

data2_x=data2["Msg"]
data2_y=data2["Tag"]

In [88]:
vocab_size = 10000
data_x_n = data_x.to_numpy()
data1_x_n = data1_x.to_numpy()
data2_x_n = data2_x.to_numpy()
sen_len = max_sen_length(data_x_n)
sen_len1 = max_sen_length(data1_x_n)
sen_len2 = max_sen_length(data2_x_n)
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]
onehot1_enc = [one_hot(sen, vocab_size) for sen in data1_x_n]
onehot2_enc = [one_hot(sen, vocab_size) for sen in data2_x_n]
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)
embedding_vector_features = 100

## splitting the data

In [89]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(embed_repr1, data1_y, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(embed_repr2, data2_y, test_size=0.2, random_state=4)

## Without POS + Stemming

In [90]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 144, 100)          1000000   
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [91]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10,batch_size=64)

Train on 2294 samples, validate on 574 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb12c2ad9e8>

In [93]:
y_pred = model.predict_classes(x_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

[[191  94]
 [105 184]]
0.6533101045296167
0.6490299823633158
0.6618705035971223
0.6366782006920415


# POS

In [94]:
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len1))
model1.add(LSTM(100))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model1.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 144, 100)          1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [95]:
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10,batch_size=64)

Train on 2281 samples, validate on 571 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb122a80e80>

In [96]:
y1_pred = model1.predict_classes(x1_test)
print(confusion_matrix(y1_test, y1_pred))
print(accuracy_score(y1_test, y1_pred))
print(f1_score(y1_test, y1_pred))
print(precision_score(y1_test, y1_pred))
print(recall_score(y1_test, y1_pred))

[[181 101]
 [105 184]]
0.6392294220665499
0.6411149825783972
0.6456140350877193
0.6366782006920415


# POS + Stemming

In [97]:
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len2))
model2.add(LSTM(100))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model2.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 143, 100)          1000000   
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [98]:
model2.fit(x2_train,y2_train, validation_data=(x2_test,y2_test),epochs=10,batch_size=64)

Train on 2294 samples, validate on 574 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb100a4d898>

In [99]:
y2_pred = model2.predict_classes(x2_test)
print(confusion_matrix(y2_test, y2_pred))
print(accuracy_score(y2_test, y2_pred))
print(f1_score(y2_test, y2_pred))
print(precision_score(y2_test, y2_pred))
print(recall_score(y2_test, y2_pred))

[[168 121]
 [ 79 206]]
0.6515679442508711
0.6732026143790849
0.6299694189602446
0.7228070175438597
