In [30]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras import backend as K
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential

# Reading Data

In [2]:
# Load the three pre-processed dataset variants. Column names are supplied
# explicitly, so pandas reads the files as header-less and every line is data.
data = pd.read_csv(
    'Dataset/Data_processed/dataset.csv',
    sep=',',
    names=['Msg', 'Tag'],
)
data1 = pd.read_csv(
    'Dataset/Data_processed/dataset_POS.csv',
    sep=',',
    names=['Msg', 'Tag'],
)
data2 = pd.read_csv(
    'Dataset/Data_processed/dataset_stemmed.csv',
    sep=',',
    names=['Msg', 'Tag'],
)

In [3]:
# Remove every row of `data` that has a missing value in any column.
# The frame is modified in place, so the unfiltered rows are gone after this cell.
data.dropna(inplace=True)

In [4]:
# Remove every row of `data1` that has a missing value in any column.
# The frame is modified in place, so the unfiltered rows are gone after this cell.
data1.dropna(inplace=True)

In [5]:
# Remove every row of `data2` that has a missing value in any column.
# The frame is modified in place, so the unfiltered rows are gone after this cell.
data2.dropna(inplace=True)

In [6]:
# Pull the message text ('Msg') and label ('Tag') columns out of each frame
# as NumPy arrays, one (X, Y) pair per dataset variant.
data_X, data_Y = data['Msg'].to_numpy(), data['Tag'].to_numpy()
data1_X, data1_Y = data1['Msg'].to_numpy(), data1['Tag'].to_numpy()
data2_X, data2_Y = data2['Msg'].to_numpy(), data2['Tag'].to_numpy()

In [7]:
# Turn every array into a single-column 2-D array of shape (n_rows, 1).
# Later cells rely on this layout when they read a message as `sen[0]`.
data_X = data_X.reshape(-1, 1)
data_Y = data_Y.reshape(-1, 1)
data1_X = data1_X.reshape(-1, 1)
data1_Y = data1_Y.reshape(-1, 1)
data2_X = data2_X.reshape(-1, 1)
data2_Y = data2_Y.reshape(-1, 1)

## Attention Layer

In [8]:
class attention(Layer):
    """Attention over axis 1 (the time axis) of a 3-D input.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes
    ``e = tanh(x . W + b)``, normalises ``e`` with a softmax along axis 1 and
    multiplies ``x`` by the resulting weights.

    Args:
        return_sequences: If True, return the re-weighted sequence with the
            same shape as the input. If False, return the sum over axis 1,
            i.e. one vector of shape ``(batch, features)`` per sample.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).

    Note:
        The bias holds one value per time step (shape ``(steps, 1)``), so the
        layer must be built on inputs whose sequence length is fixed.
    """

    def __init__(self, return_sequences=True, **kwargs):
        # Initialise the base Layer first so attribute tracking is set up
        # before any attribute is assigned, and forward name/dtype/etc.
        super(attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the projection weight (features, 1) and per-step bias (steps, 1)."""
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(attention, self).build(input_shape)

    def call(self, x):
        """Apply the attention weights to ``x``; see the class docstring for shapes."""
        # One unnormalised score per time step.
        e = K.tanh(K.dot(x, self.W) + self.b)
        # Normalise across the time axis so the weights of a sample sum to 1.
        a = K.softmax(e, axis=1)
        output = x * a
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super(attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

## Maximum Sentence Length

In [9]:
def max_sen_length(data):
    """Return the largest whitespace-separated word count among the rows of ``data``.

    Args:
        data: Iterable of rows; the sentence string is taken from position 0
            of each row.

    Returns:
        The maximum number of words in any sentence, or 0 if ``data`` is empty.
    """
    return max((len(row[0].split()) for row in data), default=0)

In [10]:
# Display the message array to confirm it is a column of one-string rows.
data_X

array([['The thing disgusting White woman groid White woman drags White child filth '],
       ['Americans acting like know talking '],
       ['Also intrested check webpage info european american town building '],
       ...,
       ['No truth 88WHITE POWERWHITE VICTORYWHITE PRIDE88'],
       ['4 cyclinder motorcycle Historic vehicle Laurin Klement T B 18991903 YouTube Historic vehicle Torpedo V4 1909 YouTube Historic vehicle Torpedo V4 1909 httpthekneeslidercomimages2012rightsidejpg Handlebar camera mount ride video '],
       ['Hi I thought I leave note wish Southern Gentlemen Ladies happy Robert Edward Lee day ']],
      dtype=object)

# Without POS + Stemming

In [11]:
# Size of the hashing space for the word indices; also the Embedding input dimension.
vocab_size = 1000
# Width of each learned word vector (the Embedding output dimension).
embedding_vector_features = 100
# Every sequence is padded to the longest message, measured in whitespace-separated words.
max_len = max_sen_length(data_X)

In [12]:
# Map every word to an integer id by hashing it into the vocab_size space.
# `one_hot` hashes with Python's built-in `hash`, which is salted per process,
# so the ids would change on every kernel restart; md5 keeps them stable.
# Tokenisation (filters, lower-casing, split on spaces) is the same as `one_hot`.
onehot_enc = [
    tf.keras.preprocessing.text.hashing_trick(sen[0], vocab_size, hash_function='md5')
    for sen in data_X
]
# Left-pad with zeros to max_len.
# NOTE(review): max_len counts whitespace-separated words, while the hashing
# step also splits on filtered punctuation; a message that yields more than
# max_len tokens would be truncated here. Confirm the data is punctuation-free.
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=max_len)

In [13]:
# Features are the padded id sequences; targets are the labels of the same dataset.
X, Y = embed_repr, data_Y

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified even though the recorded confusion
# matrix shows imbalanced classes; consider stratify=Y.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

In [19]:
# Embedding -> bidirectional LSTM (one output per step) -> attention
# re-weighting of the steps -> LSTM summary -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=max_len),
    Bidirectional(LSTM(100, return_sequences=True)),
    attention(return_sequences=True),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 151, 100)          100000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 151, 200)          160800    
_________________________________________________________________
attention_1 (attention)      (None, 151, 200)          351       
_________________________________________________________________
unified_lstm_3 (UnifiedLSTM) (None, 100)               120400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 381,652
Trainable params: 381,652
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train for 10 passes over the training split (default batch size, no validation data).
model.fit(x=x_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe58341a2e8>

In [21]:
# `Sequential.predict_classes` has been removed from current TensorFlow releases.
# For a single sigmoid output it thresholded the predicted probability at 0.5,
# which is reproduced explicitly here.
y_pred = (model.predict(x_test) > 0.5).astype("int32")
# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))
print("Accuracy Socre: ",accuracy_score(y_test, y_pred))
print("F1 score: ",f1_score(y_test, y_pred))
print("Precision: ",precision_score(y_test, y_pred))
print("Recall: ",recall_score(y_test, y_pred))

[[1708  207]
 [ 192   81]]
Accuracy Socre:  0.8176416819012797
F1 score:  0.28877005347593576
Precision:  0.28125
Recall:  0.2967032967032967


# POS

In [22]:
# Size of the hashing space for the word indices; also the Embedding input dimension.
vocab_size = 1000
# Width of each learned word vector (the Embedding output dimension).
embedding_vector_features = 100
# Every sequence is padded to the longest message, measured in whitespace-separated words.
max_len = max_sen_length(data1_X)

In [23]:
# Map every word to an integer id by hashing it into the vocab_size space.
# `one_hot` hashes with Python's built-in `hash`, which is salted per process,
# so the ids would change on every kernel restart; md5 keeps them stable.
# Tokenisation (filters, lower-casing, split on spaces) is the same as `one_hot`.
onehot_enc = [
    tf.keras.preprocessing.text.hashing_trick(sen[0], vocab_size, hash_function='md5')
    for sen in data1_X
]
# Left-pad with zeros to max_len.
# NOTE(review): max_len counts whitespace-separated words, while the hashing
# step also splits on filtered punctuation; a message that yields more than
# max_len tokens would be truncated here. Confirm the data is punctuation-free.
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=max_len)

In [24]:
# Features are the padded id sequences; targets are the labels of the same dataset.
X, Y = embed_repr, data1_Y

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified even though the recorded confusion
# matrix shows imbalanced classes; consider stratify=Y.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

In [26]:
# Embedding -> bidirectional LSTM (one output per step) -> attention
# re-weighting of the steps -> LSTM summary -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=max_len),
    Bidirectional(LSTM(32, return_sequences=True)),
    attention(return_sequences=True),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 144, 100)          100000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 144, 64)           34048     
_________________________________________________________________
attention_2 (attention)      (None, 144, 64)           208       
_________________________________________________________________
unified_lstm_5 (UnifiedLSTM) (None, 32)                12416     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 146,705
Trainable params: 146,705
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Train for 10 passes over the training split (default batch size, no validation data).
model.fit(x=x1_train, y=y1_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe5745f6390>

In [31]:
# `Sequential.predict_classes` has been removed from current TensorFlow releases.
# For a single sigmoid output it thresholded the predicted probability at 0.5,
# which is reproduced explicitly here.
y1_pred = (model.predict(x1_test) > 0.5).astype("int32")
# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(y1_test, y1_pred))
print("Accuracy Socre: ",accuracy_score(y1_test, y1_pred))
print("F1 score: ",f1_score(y1_test, y1_pred))
print("Precision: ",precision_score(y1_test, y1_pred))
print("Recall: ",recall_score(y1_test, y1_pred))

[[1674  177]
 [ 243   59]]
Accuracy Socre:  0.8049233627496516
F1 score:  0.21933085501858737
Precision:  0.25
Recall:  0.19536423841059603


# POS + Stemming

In [32]:
# Size of the hashing space for the word indices; also the Embedding input dimension.
vocab_size = 1000
# Width of each learned word vector (the Embedding output dimension).
embedding_vector_features = 100
# Every sequence is padded to the longest message, measured in whitespace-separated words.
max_len = max_sen_length(data2_X)

In [33]:
# Map every word to an integer id by hashing it into the vocab_size space.
# `one_hot` hashes with Python's built-in `hash`, which is salted per process,
# so the ids would change on every kernel restart; md5 keeps them stable.
# Tokenisation (filters, lower-casing, split on spaces) is the same as `one_hot`.
onehot_enc = [
    tf.keras.preprocessing.text.hashing_trick(sen[0], vocab_size, hash_function='md5')
    for sen in data2_X
]
# Left-pad with zeros to max_len.
# NOTE(review): max_len counts whitespace-separated words, while the hashing
# step also splits on filtered punctuation; a message that yields more than
# max_len tokens would be truncated here. Confirm the data is punctuation-free.
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=max_len)

In [34]:
# Features are the padded id sequences; targets are the labels of the same dataset.
X, Y = embed_repr, data2_Y

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# consider stratify=Y.
x2_train, x2_test, y2_train, y2_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

In [36]:
# Embedding -> bidirectional LSTM (one output per step) -> attention
# re-weighting of the steps -> LSTM summary -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=max_len),
    Bidirectional(LSTM(32, return_sequences=True)),
    attention(return_sequences=True),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 143, 100)          100000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 143, 64)           34048     
_________________________________________________________________
attention_3 (attention)      (None, 143, 64)           207       
_________________________________________________________________
unified_lstm_7 (UnifiedLSTM) (None, 32)                12416     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 146,704
Trainable params: 146,704
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 passes over the training split (default batch size, no validation data).
model.fit(x=x2_train, y=y2_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe565ceaef0>

In [None]:
# `Sequential.predict_classes` has been removed from current TensorFlow releases.
# For a single sigmoid output it thresholded the predicted probability at 0.5,
# which is reproduced explicitly here.
y2_pred = (model.predict(x2_test) > 0.5).astype("int32")
# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(y2_test, y2_pred))
print("Accuracy Socre: ",accuracy_score(y2_test, y2_pred))
print("F1 score: ",f1_score(y2_test, y2_pred))
print("Precision: ",precision_score(y2_test, y2_pred))
print("Recall: ",recall_score(y2_test, y2_pred))