In [50]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras import backend as K
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout
from tensorflow.keras.models import Sequential
from collections import Counter
from imblearn.over_sampling import SMOTE

# Reading Data

In [51]:
# Load the two-column CSV; the column names are supplied here rather than read from the file.
data_raw = pd.read_csv(
    "data_raw.csv",
    sep=",",
    names=["Msg", "Tag"],
)

In [52]:
# Discard every row that has a missing value in any column.
data_raw = data_raw.dropna()

In [53]:
# Pull the message text and its label out of the frame as plain NumPy arrays.
data_raw_X, data_raw_Y = (data_raw[column].to_numpy() for column in ("Msg", "Tag"))

In [54]:
# Turn both arrays into (n_rows, 1) column vectors; later cells read each row as `sen[0]`.
data_raw_X = data_raw_X.reshape(data_raw_X.shape[0], 1)
data_raw_Y = data_raw_Y.reshape(data_raw_Y.shape[0], 1)

In [55]:
# Show the label column vector produced by the reshape in the previous cell.
data_raw_Y

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

## Attention Layer

In [56]:
class attention(Layer):
    """Additive attention over the time axis of a 3-D sequence tensor.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes
    one score per time step, ``e = tanh(x . W + b)``, normalises the scores with
    a softmax across the ``steps`` axis and multiplies ``x`` by the resulting
    weights.

    Args:
        return_sequences: if True (default) return the re-weighted sequence,
            shape ``(batch, steps, features)``; if False return its sum over
            the time axis, shape ``(batch, features)``.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``, ...), forwarded to the base class.
    """

    def __init__(self, return_sequences=True, **kwargs):
        # Initialise the base Layer first so Keras bookkeeping exists before
        # any attribute is assigned, and forward name/dtype/etc. to it.
        super(attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the score projection ``W`` and the per-time-step bias ``b``."""
        # W maps every feature vector to a single scalar score.
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        # b holds one bias per time step, so the sequence length (input_shape[1])
        # must be known when the layer is built.
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
        super(attention, self).build(input_shape)

    def call(self, x):
        """Apply the attention weights to ``x`` (see class docstring for shapes)."""
        e = K.tanh(K.dot(x, self.W) + self.b)   # (batch, steps, 1) raw scores
        a = K.softmax(e, axis=1)                # normalise across time steps
        output = x * a                          # broadcast weights over features
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def compute_output_shape(self, input_shape):
        """Return the output shape for the configured ``return_sequences`` mode."""
        if self.return_sequences:
            return input_shape
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = super(attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

## Maximum Sentence Length

In [57]:
def max_sen_length(data):
    """Return the largest whitespace-separated word count among the rows of ``data``.

    Args:
        data: iterable of rows; the text of each row is read from ``row[0]``.

    Returns:
        int: the maximum number of words found, or 0 when ``data`` is empty or
        no row holds a string.

    Rows that are empty, or whose first element is not a string (for example
    ``None`` or NaN), are skipped instead of aborting the scan.
    """
    mx_len = 0
    for sen in data:
        try:
            words = sen[0].split()
        except (AttributeError, IndexError, KeyError, TypeError):
            # Only the failures a malformed row can cause are ignored; a bare
            # `except:` would also swallow KeyboardInterrupt / SystemExit.
            continue
        mx_len = max(mx_len, len(words))
    return mx_len

# Raw data + sampling

In [58]:
# Peek at the message column vector (one text entry per row) before it is encoded.
data_raw_X

array([[' jaydillz my babies pussy is too tight today t co if k v ro'],
       [' seymourblanco they game is over fuck yall bitches amp yall attitudes t co rlrnybfedt '],
       ['can you let me stretch that pussy out or nahhh'],
       ...,
       ['need some hispanic pussy'],
       [' o mygotti you have a girlfriend stop asking these hoes to be your bestfriend '],
       [' kingtunchi jd told me i m to player to be with one bitch']],
      dtype=object)

In [59]:
# Encoding hyper-parameters.
vocab_size = 10000                # number of word-id buckets given to one_hot and to Embedding
embedding_vector_features = 100   # width of each learned embedding vector

# Longest message measured in whitespace-separated words; used as the padded length.
max_len = max_sen_length(data_raw_X)

In [60]:
# Encode every message as a list of integer word ids in [1, vocab_size - 1].
# `one_hot` hashes words with Python's built-in `hash`, which is salted per
# process, so the same word receives a different id after every kernel restart.
# `hashing_trick` with the md5 hash applies the same tokenisation but yields
# ids that are stable across runs.
hashing_trick = tf.keras.preprocessing.text.hashing_trick
onehot_enc = [
    hashing_trick(sen[0], vocab_size, hash_function="md5")
    for sen in data_raw_X
]
# Left-pad with zeros so that every sequence has exactly `max_len` entries.
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=max_len)

## Sampling

In [86]:
# Oversample with SMOTE and report the class counts before and after.
# NOTE(review): SMOTE interpolates between feature vectors; the features here
# are padded word-id sequences, so synthetic rows are blends of ids - confirm
# that this is intended.
# NOTE(review): resampling runs before train_test_split, so synthetic rows can
# end up in the test split; consider resampling the training split only.
labels_1d = data_raw_Y.ravel()  # SMOTE expects a 1-D target, not an (n, 1) column
print(Counter(labels_1d))
oversample = SMOTE(random_state=4)  # fixed seed so the synthetic rows are reproducible
# `fit_sample` was removed from imbalanced-learn; `fit_resample` is its replacement.
X, Y = oversample.fit_resample(embed_repr, labels_1d)
print(Counter(Y))

In [94]:
# Use the padded sequences without oversampling and flatten the (n, 1) label column
# to 1-D. Running this cell replaces any X, Y produced by the SMOTE cell above.
X = embed_repr
Y = np.reshape(data_raw_Y, (data_raw_Y.shape[0],))

In [95]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified - consider `stratify=Y` if the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

In [96]:
# Quick look at the class labels that ended up in the training split.
print(y_train)

[1 1 1 ... 2 2 2]


In [97]:
# Architecture: embedding -> bidirectional LSTM -> attention re-weighting ->
# LSTM -> 3-way softmax classifier.
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=max_len))
model.add(Bidirectional(LSTM(100, return_sequences=True)))
# return_sequences=True keeps the time axis so the following LSTM still
# receives a sequence.
model.add(attention(return_sequences=True))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))
model.summary()
# Explicit keyword arguments, plus an accuracy metric so that training and
# validation report more than the loss value.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 34, 100)           1000000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 34, 200)           160800    
_________________________________________________________________
attention_7 (attention)      (None, 34, 200)           234       
_________________________________________________________________
unified_lstm_15 (UnifiedLSTM (None, 100)               120400    
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 303       
Total params: 1,281,737
Trainable params: 1,281,737
Non-trainable params: 0
_________________________________________________________________


In [98]:
# One-hot encode the integer labels with a fixed width. Calling pd.get_dummies
# separately on each split yields one column per label *present in that split*,
# so a class missing from either split would silently change the target shape.
NUM_CLASSES = 3  # must match the width of the final Dense softmax layer
y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)
y_test_onehot = tf.keras.utils.to_categorical(y_test, num_classes=NUM_CLASSES)

# NOTE(review): the test split doubles as validation data here, so the
# validation numbers are not independent of the final evaluation.
model.fit(
    x_train,
    y_train_onehot,
    validation_data=(x_test, y_test_onehot),
    epochs=10,
    batch_size=64,
)

Train on 19826 samples, validate on 4957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8c44baab70>

In [99]:
# Class probabilities for every test row: one column per class.
pred = model.predict(x_test)
# Take the most probable class of each row in a single vectorised call instead
# of a Python-level loop; `.tolist()` keeps `y_pred` a plain list as before.
y_pred = np.argmax(pred, axis=1).tolist()

## Without Sampling

In [100]:
# Report the evaluation metrics for the current y_test / y_pred.
# Each entry is (printed label, metric function, extra keyword arguments).
# Note that F1 is averaged with 'weighted' while recall and precision use 'macro'.
report = (
    ("Confusion matrix : \n", confusion_matrix, {}),
    ("Accuracy score   : ", accuracy_score, {}),
    ("F1 score         : ", f1_score, {"average": "weighted"}),
    ("Recall           : ", recall_score, {"average": "macro"}),
    ("Precision        : ", precision_score, {"average": "macro"}),
)
for label, metric, options in report:
    print(label, metric(y_test, y_pred, **options))

Confusion matrix : 
 [[  54  179   34]
 [  93 3587  164]
 [   9  146  691]]
Accuracy score   :  0.8739156748033085
F1 score         :  0.8669751622880479
Recall           :  0.650724873607034
Precision        :  0.6801179922707868


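## After Sampling

These metrics appear to come from an earlier run (execution count 49) in which `X, Y` were taken from the SMOTE oversampling cell rather than from the raw data.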

In [49]:
# Same report as in the previous metrics cell, evaluated on whatever
# y_test / y_pred are currently in the kernel.
# Each entry is (printed label, metric function, extra keyword arguments);
# F1 is averaged with 'weighted' while recall and precision use 'macro'.
report = (
    ("Confusion matrix : \n", confusion_matrix, {}),
    ("Accuracy score   : ", accuracy_score, {}),
    ("F1 score         : ", f1_score, {"average": "weighted"}),
    ("Recall           : ", recall_score, {"average": "macro"}),
    ("Precision        : ", precision_score, {"average": "macro"}),
)
for label, metric, options in report:
    print(label, metric(y_test, y_pred, **options))

Confusion matrix : 
 [[1955  216 1662]
 [ 143 3560  159]
 [1352  117 2350]]
Accuracy score   :  0.683081466041341
F1 score         :  0.6817830239559092
Recall           :  0.6823969525660969
Precision        :  0.6815141902220474
