# LSTM with Attention

In [1]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
  
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abuinoschi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abuinoschi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read data and preprocess it

In [3]:
train_data = pd.read_csv('../dataset/train.tsv', sep='\t')
train_data = train_data.fillna('unknown')

validation_data = pd.read_csv('../dataset/test.tsv', sep='\t')
validation_data = validation_data.fillna('unknown')

In [4]:
train_data['q1_label'].value_counts()

yes    4428
no     1977
Name: q1_label, dtype: int64

In [5]:
train_data['q2_label'].value_counts()

no         3902
unknown    2024
yes         479
Name: q2_label, dtype: int64

In [6]:
train_data['q3_label'].value_counts()

yes        4315
unknown    1962
no          128
Name: q3_label, dtype: int64

In [7]:
train_data['q4_label'].value_counts()

no         3745
unknown    1972
yes         688
Name: q4_label, dtype: int64

In [8]:
# One hot encoding
train_data = pd.get_dummies(train_data, columns=['q2_label', 'q3_label', 'q4_label'], prefix=["q2_label_is", "q3_label_is", "q4_label_is"] )

# One hot encoding
validation_data = pd.get_dummies(validation_data, columns=['q2_label', 'q3_label', 'q4_label'], prefix=["q2_label_is", "q3_label_is", "q4_label_is"] )

In [9]:
train_data.head(2)

Unnamed: 0,tweet_no,tweet_text,q1_label,q5_label,q6_label,q7_label,language,tweet_link,tweet_link_count,preprocessed_tweet_text,...,preprocessed_tweet_text_no_link,q2_label_is_no,q2_label_is_unknown,q2_label_is_yes,q3_label_is_no,q3_label_is_unknown,q3_label_is_yes,q4_label_is_no,q4_label_is_unknown,q4_label_is_yes
0,1,For the average American the best way to tell ...,no,unknown,no,no,en,[],0,For the average American the best way to tell ...,...,For the average American the best way to tell ...,0,1,0,0,1,0,0,1,0
1,2,this is fucking bullshit,no,unknown,no,no,en,[],0,this is fucking bullshit,...,this is fucking bullshit,0,1,0,0,1,0,0,1,0


In [10]:
train_data = train_data[['preprocessed_tweet_text', 'q1_label', 
                         'q2_label_is_no', 'q2_label_is_unknown', 'q2_label_is_yes', 
                         'q3_label_is_no', 'q3_label_is_unknown', 'q3_label_is_yes', 
                         'q4_label_is_no', 'q4_label_is_unknown', 'q4_label_is_yes']]

In [11]:
validation_data = validation_data[['preprocessed_tweet_text', 'q1_label', 
                         'q2_label_is_no', 'q2_label_is_unknown', 'q2_label_is_yes', 
                         'q3_label_is_no', 'q3_label_is_unknown', 'q3_label_is_yes', 
                         'q4_label_is_no', 'q4_label_is_unknown', 'q4_label_is_yes']]

In [12]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
  
def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if  not word in stop_words]
    text = " ".join(text)
    return text

training_data = train_data
training_data['preprocessed_tweet_text'] = training_data.preprocessed_tweet_text.apply(lambda x: clean_text(x))
validation_data['preprocessed_tweet_text'] = validation_data.preprocessed_tweet_text.apply(lambda x: clean_text(x))
training_data.head()

Unnamed: 0,preprocessed_tweet_text,q1_label,q2_label_is_no,q2_label_is_unknown,q2_label_is_yes,q3_label_is_no,q3_label_is_unknown,q3_label_is_yes,q4_label_is_no,q4_label_is_unknown,q4_label_is_yes
0,average american best way tell covid19 cough r...,no,0,1,0,0,1,0,0,1,0
1,fuck bullshit,no,0,1,0,0,1,0,0,1,0
2,yall please follow government instruction knoc...,no,0,1,0,0,1,0,0,1,0
3,offense corona virus disappear april actually ...,no,0,1,0,0,1,0,0,1,0
4,face someone spend 9 hour personal protective ...,yes,1,0,0,0,0,1,1,0,0


In [13]:
training_data['q1_label'].value_counts()

yes    4428
no     1977
Name: q1_label, dtype: int64

In [14]:
training_data['q1_label'] = training_data['q1_label'].map({'no': 0, 'yes':1})
validation_data['q1_label'] = validation_data['q1_label'].map({'no': 0, 'yes':1})

In [15]:
VOCAB_SIZE = 10000
MAX_LEN = 128

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(training_data['preprocessed_tweet_text'])
list_tokenized_train = tokenizer.texts_to_sequences(training_data['preprocessed_tweet_text'])
list_tokenized_valid = tokenizer.texts_to_sequences(validation_data['preprocessed_tweet_text'])

X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
y_train = train_data['q1_label']

X_validation = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_valid, maxlen=MAX_LEN)
y_validation = validation_data['q1_label']

## Create model (LSTM + Attention layer) for Q1

In [16]:
class Attention(tf.keras.Model):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
          
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = tf.nn.tanh(
            self.W1(features) + self.W2(hidden_with_time_axis))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
          
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [17]:
sequence_input = tf.keras.layers.Input(shape=(128,), dtype="int32")
embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 128)(sequence_input)
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True), name="bi_lstm_0")(embedding)
(lstm, forward_h, forward_c, backward_h, backward_c) = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, return_state=True), name="bi_lstm_1")(lstm)
state_h  = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c =  tf.keras.layers.Concatenate()([forward_c, backward_c])
context_vector, attention_weights = Attention(32)(lstm, state_h)
dense1 = tf.keras.layers.Dense(16, activation="relu")(context_vector)
dropout = tf.keras.layers.Dropout(0.1)(dense1)
output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout)
model = tf.keras.Model(inputs=sequence_input, outputs=output)

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
          optimizer=tf.keras.optimizers.Adam(1e-4),
          metrics=['accuracy'])

In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 128, 128)     1280000     ['input_1[0][0]']                
                                                                                                  
 bi_lstm_0 (Bidirectional)      (None, 128, 256)     263168      ['embedding[0][0]']              
                                                                                                  
 bi_lstm_1 (Bidirectional)      [(None, 128, 128),   164352      ['bi_lstm_0[0][0]']              
                                 (None, 64),                                                  

In [19]:
history = model.fit(X_train,y_train,
                    batch_size=128,
                    epochs=10,
                    validation_data=(X_validation, y_validation))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
predictions = model.predict(X_validation)

In [21]:
validation_data['predicted_q1'] = predictions
validation_data['predicted_q1'] = validation_data.apply(lambda row: 0 if row['predicted_q1'] < 0.5 else 1, axis=1)

In [22]:
cm = confusion_matrix(validation_data['q1_label'], validation_data['predicted_q1'])
pd.DataFrame(cm, index=['Actual no', 'Actual yes'], columns = ['Predicted no', 'Predicted yes'])

Unnamed: 0,Predicted no,Predicted yes
Actual no,548,280
Actual yes,181,766
