In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import preprocessor as p
from tqdm import tqdm

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Read the training tweets from disk and display the first rows as a quick
# look at the available columns (the file name suggests the text has already
# been cleaned upstream -- TODO confirm).
train_csv_path = 'clean_train.csv'
df = pd.read_csv(train_csv_path)
df.head(5)

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drag kid dysfunct...
1,2,0,thanks lyft credit cant use cause dont offer w...
2,3,0,bihday majesty
3,4,0,model love u take u time ur
4,5,0,factsguide society motivation


In [3]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how='any')

In [4]:
# Continue with a random half of the rows (frac=0.5).
# The draw is seeded so that "Restart Kernel -> Run All" selects the same rows
# every time; without a seed here, the seeded train/test split that follows
# would still receive different data on each run.
data = df.sample(frac=0.5, random_state=42)

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the sampled tweets for evaluation.
# Label 1 is a small minority (310 of 4790 evaluation rows in the recorded
# run), so the split is stratified on the label to keep the class ratio the
# same in the training and evaluation partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data['tweet'],
    data['label'],
    test_size=0.30,
    random_state=42,
    stratify=data['label'],
)

In [37]:
# Report the size of each partition: training split first, then the held-out split.
print(X_train.shape, X_test.shape, sep='\n')

(11176,)
(4790,)


In [39]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers

# max_words caps the tokenizer vocabulary (and sizes the embedding tables
# built later); max_len is the fixed length every tweet is padded/truncated to.
max_words, max_len = 5000, 100

# The vocabulary is learned from the training tweets only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Replace the raw training text with its padded integer-id matrix.
# NOTE: X_train is overwritten here, so this cell cannot be re-run on its own;
# re-run the train/test split cell first.
sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(sequences=sequences, maxlen=max_len)

In [41]:
# After padding, the training data is a 2-D matrix: (n_tweets, max_len).
print(np.shape(X_train))

(11176, 100)


In [42]:
# Encode the held-out tweets with the tokenizer that was fitted on the
# training split, then bring them to the same fixed length as X_train.
# NOTE: X_test is overwritten with the integer matrix, so this cell is not
# re-runnable without re-running the split cell first.
sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(sequences=sequences, maxlen=max_len)
print(np.shape(X_test))

(4790, 100)


<p style='font-size: 30px;'><br><br><b>Embedding + Uni-Directional LSTM</b></p>

In [48]:
from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

# Model 1: learned word embeddings -> one forward LSTM -> a funnel of dense
# layers (64 -> 32 -> 16) -> a single sigmoid unit for the binary label.
model1 = Sequential([
    # Maps each of the max_words token ids to a 100-dimensional vector.
    layers.Embedding(input_dim=max_words, output_dim=100),
    # Reduces the embedded sequence to 30 features; dropout=0.5 is applied to the LSTM inputs.
    layers.LSTM(units=30, dropout=0.5),
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=32, activation='relu'),
    layers.Dense(units=16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid'),
])

model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         500000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 30)                15720     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                1984      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_8 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 17        
Total params: 520,329
Trainable params: 520,329
Non-trainable params: 0
________________________________________________

In [51]:
# Train model 1 for two epochs with RMSprop on binary cross-entropy.
# NOTE: the held-out split doubles as validation data, so the per-epoch
# validation numbers are measured on the same rows used for the final report.
model1.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model1.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2265bcca080>

In [52]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output it simply thresholded the predicted probability at 0.5, so
# the same rule is applied to `predict()` directly. ravel() flattens the
# (n, 1) prediction column into the 1-D label vector scikit-learn expects.
predictions = (model1.predict(X_test) > 0.5).astype('int32').ravel()

# Rows of the confusion matrix are the true labels, columns the predicted ones.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))



[[4441   39]
 [ 144  166]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4480
           1       0.81      0.54      0.64       310

    accuracy                           0.96      4790
   macro avg       0.89      0.76      0.81      4790
weighted avg       0.96      0.96      0.96      4790

0.9617954070981211


<p style='font-size: 30px;'><br><br><b>Embedding + Bi-Directional LSTM</b></p>

In [53]:
# Model 2: same layout as model 1, but the LSTM is wrapped in Bidirectional,
# so the sequence is read in both directions (the recorded summary shows the
# recurrent output growing to 60 features) and its dropout is raised to 0.6.
model2 = Sequential([
    # Maps each of the max_words token ids to a 100-dimensional vector.
    layers.Embedding(input_dim=max_words, output_dim=100),
    layers.Bidirectional(layers.LSTM(units=30, dropout=0.6)),
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=32, activation='relu'),
    layers.Dense(units=16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid'),
])

model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         500000    
_________________________________________________________________
bidirectional (Bidirectional (None, 60)                31440     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                3904      
_________________________________________________________________
dense_11 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_12 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 17        
Total params: 537,969
Trainable params: 537,969
Non-trainable params: 0
________________________________________________

In [54]:
# Train model 2 for eight epochs with RMSprop on binary cross-entropy.
# NOTE: the held-out split doubles as validation data, so the per-epoch
# validation numbers are measured on the same rows used for the final report.
model2.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model2.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=8,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x22663b833c8>

In [56]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output it simply thresholded the predicted probability at 0.5, so
# the same rule is applied to `predict()` directly. ravel() flattens the
# (n, 1) prediction column into the 1-D label vector scikit-learn expects.
predictions = (model2.predict(X_test) > 0.5).astype('int32').ravel()

# Rows of the confusion matrix are the true labels, columns the predicted ones.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))



[[4438   42]
 [ 141  169]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4480
           1       0.80      0.55      0.65       310

    accuracy                           0.96      4790
   macro avg       0.89      0.77      0.81      4790
weighted avg       0.96      0.96      0.96      4790

0.9617954070981211


<p style='font-size: 30px;'><br><br><b>Embedding + 1-D CNN</b></p>

In [59]:
# Model 3: learned word embeddings -> two regularised 1-D convolutions
# (window of 6 tokens each) with pooling in between -> dense funnel -> sigmoid.
# input_length is fixed to max_len here, so the summary shows concrete
# sequence lengths for the convolution and pooling outputs.
model3 = Sequential([
    layers.Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    layers.Conv1D(
        filters=30,
        kernel_size=6,
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),
        bias_regularizer=regularizers.l2(2e-3),
    ),
    layers.MaxPooling1D(pool_size=5),
    layers.Conv1D(
        filters=20,
        kernel_size=6,
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),
        bias_regularizer=regularizers.l2(2e-3),
    ),
    # Keeps the strongest activation of each of the 20 filters over the whole sequence.
    layers.GlobalMaxPooling1D(),
    layers.Dense(units=64, activation='relu'),
    layers.Dense(units=32, activation='relu'),
    layers.Dense(units=16, activation='relu'),
    layers.Dense(units=1, activation='sigmoid'),
])

model3.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          500000    
_________________________________________________________________
conv1d (Conv1D)              (None, 95, 30)            18030     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 19, 30)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 14, 20)            3620      
_________________________________________________________________
global_max_pooling1d (Global (None, 20)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                1344      
_________________________________________________________________
dense_15 (Dense)             (None, 32)               

In [63]:
# Train model 3 for ten epochs with RMSprop on binary cross-entropy.
# The metric is spelled 'accuracy' (not the alias 'acc') so the logged metric
# names match those produced when compiling model1 and model2.
# class_weight makes an error on label 1 count ten times as much as an error
# on label 0 in the training loss.
# NOTE: the held-out split doubles as validation data, so the per-epoch
# validation numbers are measured on the same rows used for the final report.
model3.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model3.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    class_weight={0: 1.0, 1: 10.0},
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2266c0f82b0>

In [64]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output it simply thresholded the predicted probability at 0.5, so
# the same rule is applied to `predict()` directly. ravel() flattens the
# (n, 1) prediction column into the 1-D label vector scikit-learn expects.
predictions = (model3.predict(X_test) > 0.5).astype('int32').ravel()

# Rows of the confusion matrix are the true labels, columns the predicted ones.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))



[[4213  267]
 [  94  216]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      4480
           1       0.45      0.70      0.54       310

    accuracy                           0.92      4790
   macro avg       0.71      0.82      0.75      4790
weighted avg       0.94      0.92      0.93      4790

0.9246346555323591
