# LSTM with Word Embedding

In [23]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential
from imblearn.over_sampling import SMOTE

# Reading datasets

In [2]:
data  = pd.read_csv('Data_processed/dataset.csv',sep=',',names=['Msg','Tag'])
data1 = pd.read_csv('Data_processed/dataset_POS.csv',sep=',',names=['Msg','Tag'])
data2 = pd.read_csv('Data_processed/dataset_stemmed.csv',sep=',',names=['Msg','Tag'])

In [38]:
data_raw  = pd.read_csv('Dataset/Dataset_with_stopwords/dataset.csv',sep=',',names=['Msg','Tag'])

In [39]:
data_raw.dropna(inplace=True)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Msg     10944 non-null  object 
 1   Tag     10937 non-null  float64
dtypes: float64(1), object(1)
memory usage: 171.1+ KB


In [5]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10937 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Msg     10937 non-null  object 
 1   Tag     10937 non-null  float64
dtypes: float64(1), object(1)
memory usage: 256.3+ KB


In [5]:
data.head()

Unnamed: 0,Msg,Tag
0,The thing disgusting White woman groid White w...,1
1,Americans acting like know talking,0
2,Also intrested check webpage info european ame...,0
3,I think need take stand homes across country a...,0
4,I think connection homosexuality Christianity ...,1


In [6]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [7]:
data1.dropna(inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10764 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10764 non-null  object
 1   Tag     10764 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 252.3+ KB


In [8]:
data1.head()

Unnamed: 0,Msg,Tag
0,TheDT thingNN disgustingVBG womanNN groidNN wo...,1
1,AmericansNNPS actingVBG likeIN knowNNS talkingVBG,0
2,AlsoRB intrestedVBN checkNN webpageNN infoJJ e...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homesNNS...,0
4,IPRP thinkVBP connectionNN homosexualityNN cou...,1


In [9]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10944 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10944 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 171.1+ KB


In [10]:
data2.dropna(inplace=True)
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10931 entries, 0 to 10943
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     10931 non-null  object
 1   Tag     10931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 256.2+ KB


In [11]:
data2.head()

Unnamed: 0,Msg,Tag
0,theDT thingNN disgustJJ whiteJJ womanNN groidJ...,1
1,americanJJ actNN likeIN knowJJ talkNN,0
2,alsoRB intrestRB checkVB webpagNN infoJJ europ...,0
3,IPRP thinkVBP needJJ takeVBP standVBP homeNN a...,0
4,IPRP thinkVBP connectJJ homosexuNN christianJJ...,1


In [12]:
data_x=data["Msg"]
data_y=data["Tag"]

data1_x=data1["Msg"]
data1_y=data1["Tag"]

data2_x=data2["Msg"]
data2_y=data2["Tag"]

In [40]:
data_raw_x=data_raw["Msg"]
data_raw_y=data_raw["Tag"]

In [41]:
vocab_size = 10000

In [42]:
data_raw_x_n = data_x.to_numpy()
data_raw_x_n

array(['Please keep that in mind',
       '26155 Yes our languages really are beautiful at least I think so',
       'no doubt thats a lovely gun if i had the money im looking at a benelli as my next shot gun also the price of single shot shot guns have gone sky high',
       ..., 'I hear Liberia is nice this time of the year', 'Ha',
       'This post by Jack boot and the piece by Marc Moran are very inspiring'],
      dtype=object)

In [8]:
data_x_n = data_x.to_numpy()
data_x_n

array(['Please keep that in mind',
       '26155 Yes our languages really are beautiful at least I think so',
       'no doubt thats a lovely gun if i had the money im looking at a benelli as my next shot gun also the price of single shot shot guns have gone sky high',
       ..., 'I hear Liberia is nice this time of the year', 'Ha',
       'This post by Jack boot and the piece by Marc Moran are very inspiring'],
      dtype=object)

In [15]:
data1_x_n = data1_x.to_numpy()
data1_x_n

array(['TheDT thingNN disgustingVBG womanNN groidNN womanNN dragsVBZ childNN filthNN',
       'AmericansNNPS actingVBG likeIN knowNNS talkingVBG',
       'AlsoRB intrestedVBN checkNN webpageNN infoJJ europeanJJ americanJJ townNN buildingNN',
       ..., 'NoDT truthNN 88WHITECD',
       '4CD cyclinderNN motorcycleNN vehicleNN 18991903CD vehicleNN 1909CD vehicleNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leaveVBP noteJJ wishJJ happyJJ dayNN'],
      dtype=object)

In [16]:
data2_x_n = data2_x.to_numpy()
data2_x_n

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

# Maximum Sentence Length

In [28]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [10]:
sen_len = max_sen_length(data_x_n)

In [19]:
sen_len1 = max_sen_length(data1_x_n)

In [20]:
sen_len2 = max_sen_length(data2_x_n)

In [43]:
sen_len_raw = max_sen_length(data_raw_x_n)

# One Hot Representation

In [11]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

In [22]:
onehot1_enc = [one_hot(sen, vocab_size) for sen in data1_x_n]

In [23]:
onehot2_enc = [one_hot(sen, vocab_size) for sen in data2_x_n]

In [44]:
onehot_raw_enc = [one_hot(sen, vocab_size) for sen in data_raw_x_n]

# Embedding Representation

In [12]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [25]:
embed_repr1 = pad_sequences(onehot1_enc, padding='pre', maxlen=sen_len1)

In [26]:
embed_repr2 = pad_sequences(onehot2_enc, padding='pre', maxlen=sen_len2)

In [45]:
embed_repr_raw = pad_sequences(onehot_raw_enc, padding='pre', maxlen=sen_len_raw)

In [46]:
print(embed_repr_raw[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [28]:
print(embed_repr1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0 8183  970  403 6693 6093
 6693 4995 4605 5187]


In [29]:
print(embed_repr2[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 8183  970 4506 6615 6693 6716 6615 6693 9057
 6615 4605 5187]


## Sampling

In [47]:
oversample = SMOTE()
x_raw, y_raw = oversample.fit_sample(embed_repr_raw, data_raw_y)

#  LSTM Model Creation

In [48]:
embedding_vector_features = 100

In [49]:
model_raw = Sequential()
model_raw.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len))
model_raw.add(LSTM(100))
model_raw.add(Dense(1,activation='sigmoid'))
model_raw.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model_raw.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 326, 100)          1000000   
_________________________________________________________________
unified_lstm_1 (UnifiedLSTM) (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 326, 100)          1000000   
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len1))
model1.add(LSTM(100))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 144, 100)          1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vector_features,input_length=sen_len2))
model2.add(LSTM(100))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model2.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 143, 100)          1000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


# Train Test Split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)
x1_train, x1_test, y1_train, y1_test = train_test_split(embed_repr1, data1_y, test_size=0.2, random_state=4)
x2_train, x2_test, y2_train, y2_test = train_test_split(embed_repr2, data2_y, test_size=0.2, random_state=4)

In [50]:
x_raw_train, x_raw_test, y_raw_train, y_raw_test = train_test_split(x_raw, y_raw, test_size=0.2, random_state=4)

# Model Training

In [51]:
model_raw.fit(x_raw_train,y_raw_train, validation_data=(x_raw_test,y_raw_test),epochs=10,batch_size=64)

Train on 15204 samples, validate on 3802 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5124583630>

In [None]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10,batch_size=64)

In [36]:
model1.fit(x1_train,y1_train, validation_data=(x1_test,y1_test),epochs=10,batch_size=64)

Train on 8611 samples, validate on 2153 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f35cb7ca4e0>

In [37]:
model2.fit(x2_train,y2_train, validation_data=(x2_test,y2_test),epochs=10,batch_size=64)

Train on 8744 samples, validate on 2187 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f35caf390b8>

# Prediction and Model Accuracy (Raw data) with sampling

In [52]:
y_raw_pred = model_raw.predict_classes(x_raw_test)

In [53]:
confusion_matrix(y_raw_test, y_raw_pred)

array([[1646,  260],
       [ 228, 1668]])

In [54]:
accuracy_score(y_raw_test, y_raw_pred)

0.8716465018411362

In [55]:
f1_score(y_raw_test, y_raw_pred)

0.8723849372384938

In [56]:
precision_score(y_raw_test, y_raw_pred)

0.8651452282157677

In [57]:
recall_score(y_raw_test, y_raw_pred)

0.879746835443038

# Prediction and Model Accuracy (Without POS and Stemming)

In [17]:
y_pred = model.predict_classes(x_test)

In [18]:
confusion_matrix(y_test, y_pred)

array([[1727,  157],
       [ 187,  117]])

In [19]:
accuracy_score(y_test, y_pred)

0.8427787934186471

In [20]:
f1_score(y_test, y_pred)

0.40484429065743943

In [21]:
precision_score(y_test, y_pred)

0.42700729927007297

In [22]:
recall_score(y_test, y_pred)

0.3848684210526316

# Prediction and Model Accuracy (POS)

In [57]:
y1_pred = model1.predict_classes(x1_test)

In [58]:
confusion_matrix(y1_test, y1_pred)

array([[1669,  182],
       [ 216,   86]])

In [59]:
accuracy_score(y1_test, y1_pred)

0.8151416627960985

In [60]:
f1_score(y1_test, y1_pred)

0.3017543859649122

In [61]:
precision_score(y1_test, y1_pred)

0.3208955223880597

In [62]:
recall_score(y1_test, y1_pred)

0.2847682119205298

# Prediction and Model Accuracy (Stemming + POS)

In [63]:
y2_pred = model2.predict_classes(x2_test)

In [64]:
confusion_matrix(y2_test, y2_pred)

array([[1733,  153],
       [ 223,   78]])

In [65]:
accuracy_score(y2_test, y2_pred)

0.8280749885688158

In [66]:
f1_score(y2_test, y2_pred)

0.2932330827067669

In [67]:
precision_score(y2_test, y2_pred)

0.33766233766233766

In [68]:
recall_score(y_test, y_pred)

0.3553113553113553