# LSTM with Word Embedding

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential
from collections import Counter
from imblearn.over_sampling import SMOTE

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Reading datasets

In [2]:
# Load the labelled messages; column names are supplied explicitly via `names`.
data_raw = pd.read_csv(
    'data_raw.csv',
    sep=',',
    names=['Msg', 'Tag'],
)

In [3]:
# Drop rows containing missing values; this mutates data_raw in place, so later
# cells see only the complete rows.
data_raw.dropna(inplace=True)

In [4]:
# Preview the first rows to sanity-check the Msg / Tag columns after loading.
data_raw.head()

Unnamed: 0,Msg,Tag
0,jaydillz my babies pussy is too tight today t...,1
1,seymourblanco they game is over fuck yall bit...,1
2,can you let me stretch that pussy out or nahhh,1
3,don t mind that twinkies are gonna be gone ver...,2
4,cnt nobody be mad at who he choose to be with ...,1


In [5]:
# Separate the message text (model input) from the class tag (target).
data_raw_x, data_raw_y = data_raw["Msg"], data_raw["Tag"]

In [6]:
# Show how many messages carry each tag (reveals the class imbalance).
data_raw_y.value_counts()

1    19190
2     4163
0     1430
Name: Tag, dtype: int64

In [7]:
# Size of the hashed vocabulary: passed to one_hot() as the hashing space and to the
# Embedding layer as its input dimension.
vocab_size = 10000

In [8]:
# Convert the message Series to a NumPy array for the encoding steps that follow;
# the trailing expression displays the array.
data_raw_x_n = data_raw_x.to_numpy()
data_raw_x_n

array([' jaydillz my babies pussy is too tight today t co if k v ro',
       ' seymourblanco they game is over fuck yall bitches amp yall attitudes t co rlrnybfedt ',
       'can you let me stretch that pussy out or nahhh', ...,
       'need some hispanic pussy',
       ' o mygotti you have a girlfriend stop asking these hoes to be your bestfriend ',
       ' kingtunchi jd told me i m to player to be with one bitch'],
      dtype=object)

# Maximum Sentence Length

In [9]:
def max_sen_length(data):
    """Return the largest whitespace-delimited word count among the sentences in ``data``.

    ``data`` is any iterable of strings. The result is 0 when ``data`` is empty
    or when every sentence contains no words.
    """
    word_counts = (len(sentence.split()) for sentence in data)
    return max(word_counts, default=0)

In [10]:
# Word count of the longest message; reused as the padded sequence length and as the
# Embedding layer's input_length.
sen_len_raw = max_sen_length(data_raw_x_n)

# One Hot Representation (hashed integer word indices)

In [11]:
# Despite its name, Keras' one_hot() hashes every word of a message to an integer
# index bounded by vocab_size, so each message becomes a list of word indices
# (distinct words may collide on the same index).
onehot_raw_enc = [
    one_hot(message, vocab_size)
    for message in data_raw_x_n
]

# Embedding Representation

In [12]:
# Pad each index sequence at the front so that all rows share the length of the
# longest message.
embed_repr_raw = pad_sequences(
    onehot_raw_enc,
    maxlen=sen_len_raw,
    padding='pre',
)

In [13]:
# Inspect the first padded sequence; padding='pre' puts the filler values at the front.
print(embed_repr_raw[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0 7264 4947 9188 4570 4182 8982 9601 9413
 6049 2347 7558 4860 8751 8735]


## Sampling

In [24]:
# Balance the three tag classes by synthesising extra minority-class samples.
#
# NOTE(review): SMOTE interpolates between feature rows; here the rows are hashed
# word indices, so the synthetic rows do not correspond to real sentences --
# confirm this is intended.
# NOTE(review): resampling happens before train_test_split, so synthetic samples
# also end up in the evaluation split; consider resampling the training split only.
print(Counter(data_raw_y))
# A fixed seed makes the synthetic samples repeatable across runs.
oversample_raw = SMOTE(random_state=4)
# fit_resample() replaces the deprecated fit_sample(), which current
# imbalanced-learn releases no longer provide.
x_raw, y_raw = oversample_raw.fit_resample(embed_repr_raw, data_raw_y)
print(Counter(y_raw))

Counter({1: 19190, 2: 4163, 0: 1430})
Counter({1: 19190, 2: 19190, 0: 19190})


In [15]:
# To train WITHOUT oversampling, skip the SMOTE cell and uncomment the line below:
#x_raw, y_raw = embed_repr_raw, data_raw_y

# LSTM Model Creation

In [25]:
# Length of the dense vector the Embedding layer learns for each word index.
embedding_vector_features = 100

In [26]:
# Sequence classifier: word-index embedding -> single LSTM layer -> 3-way softmax.
model_raw = Sequential()
# Maps each of the vocab_size word indices to an embedding_vector_features-long vector.
model_raw.add(Embedding(vocab_size, embedding_vector_features, input_length=sen_len_raw))
model_raw.add(LSTM(100))
# One output unit per tag class (0, 1, 2).
model_raw.add(Dense(3, activation='softmax'))
model_raw.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model_raw.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 34, 100)           1000000   
_________________________________________________________________
unified_lstm_1 (UnifiedLSTM) (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 1,080,703
Trainable params: 1,080,703
Non-trainable params: 0
_________________________________________________________________
None


# Train Test Split

In [27]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
(x_raw_train, x_raw_test,
 y_raw_train, y_raw_test) = train_test_split(
    x_raw,
    y_raw,
    test_size=0.2,
    random_state=4,
)

# Model Training

In [28]:
# One-hot encode the integer tags with a fixed class count so the target width
# always matches the 3-unit softmax output. pd.get_dummies() derives its columns
# from the labels present in each split, so a split missing a class would
# produce a mismatched target shape.
y_raw_train_onehot = to_categorical(np.asarray(y_raw_train), num_classes=3)
y_raw_test_onehot = to_categorical(np.asarray(y_raw_test), num_classes=3)

# The held-out split doubles as validation data, so its metrics are reported
# after every epoch.
model_raw.fit(
    x_raw_train,
    y_raw_train_onehot,
    validation_data=(x_raw_test, y_raw_test_onehot),
    epochs=10,
    batch_size=64,
)

Train on 46056 samples, validate on 11514 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f676ab0e518>

# Prediction and Model Accuracy (Raw data) with sampling

In [29]:
# Predict class probabilities for the held-out samples, then take the index of the
# most probable class as the predicted tag for each sample.
pred = model_raw.predict(x_raw_test)
y_raw_pred = [np.argmax(class_probs) for class_probs in pred]

## Without Sampling (results from a run with the SMOTE cell skipped, i.e. `x_raw, y_raw = embed_repr_raw, data_raw_y`)

In [42]:
# Report the evaluation metrics for the held-out split, one metric per line.
# Each entry is (label, metric function, extra keyword arguments); a metric is
# computed immediately before it is printed.
# NOTE(review): F1 is support-weighted while recall and precision are
# macro-averaged, so the three figures are not directly comparable.
for label, metric_fn, metric_kwargs in (
    ("Confusion matrix : \n", confusion_matrix, {}),
    ("Accuracy score   : ", accuracy_score, {}),
    ("F1 score         : ", f1_score, {"average": 'weighted'}),
    ("Recall           : ", recall_score, {"average": 'macro'}),
    ("Precision        : ", precision_score, {"average": 'macro'}),
):
    print(label, metric_fn(y_raw_test, y_raw_pred, **metric_kwargs))

Confusion matrix :  
[[  90  154   23] 
[ 213 3542   89] 
[  32  156  658]] 
Accuracy score   :  0.8654428081500908
F1 score         :  0.8688919747661351
Recall           :  0.6787641445418341
Precision        :  0.6809081656759157


## After sampling

In [43]:
# Report the evaluation metrics for the held-out split, one metric per line.
# Each entry is (label, metric function, extra keyword arguments); a metric is
# computed immediately before it is printed.
# NOTE(review): F1 is support-weighted while recall and precision are
# macro-averaged, so the three figures are not directly comparable.
for label, metric_fn, metric_kwargs in (
    ("Confusion matrix : \n", confusion_matrix, {}),
    ("Accuracy score   : ", accuracy_score, {}),
    ("F1 score         : ", f1_score, {"average": 'weighted'}),
    ("Recall           : ", recall_score, {"average": 'macro'}),
    ("Precision        : ", precision_score, {"average": 'macro'}),
):
    print(label, metric_fn(y_raw_test, y_raw_pred, **metric_kwargs))

Confusion matrix : 
[[2131  162 1540] 
[ 179 3562  121] 
[1487  127 2205]] 
Accuracy score   :  0.6859475421226333 
F1 score         :  0.6860907725678371 
Recall           :  0.6852192352960884 
Precision        :  0.6855146891229423
