In [210]:
import pandas as pd
import numpy as np


import itertools
from random import sample
from tqdm import tqdm_notebook


from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Bidirectional, Embedding
from keras import Model, Sequential
from keras.layers import Input, Dense, Dropout, Lambda, Subtract
from keras import backend as K

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

In [63]:
tweets_data = pd.read_csv('./cleaned_tweets.csv')

In [64]:
tweets_data.dropna(inplace=True)

In [65]:
tweets_data.shape

(5907, 2)

In [66]:
tweets_data.head()

Unnamed: 0,final_cleaned_content,annotation.labels
0,pdhl poros ke ini sebenarnya mau nelikung...,joy
1,maap bosmending ngomong realita aja soal k...,joy
2,seandainya sekutu berpaling apa masih tetap...,fear
3,saya dukung pak prabowo capresnamun plihan,joy
4,mencari yang setia dan sepemikiran itu sulit...,trust


In [67]:
tweets_data.nunique()

final_cleaned_content    5907
annotation.labels           9
dtype: int64

In [68]:
tweets_data['annotation.labels'].value_counts()

joy             1576
trust           1309
anticipation     873
anger            797
disgust          448
fear             423
sadness          405
surprise          70
other              6
Name: annotation.labels, dtype: int64

In [165]:
# Creating hold-out set for final evaluation. For "other" class took 2 samples and for remaining classes took 10 samples from each class
labels = tweets_data['annotation.labels'].unique()
temp_df = []
for label in labels:
    if label == 'other':
        temp_df.append(tweets_data[tweets_data['annotation.labels'] == label].sample(frac=1).iloc[:2])
    else:
        temp_df.append(tweets_data[tweets_data['annotation.labels'] == label].sample(frac=1).iloc[:10])


test_set = pd.concat(temp_df).reset_index()

In [166]:
# Creating train set, which is all the tweets which are not present in the test set
train_set = tweets_data[~tweets_data['final_cleaned_content'].isin(test_set['final_cleaned_content'])]

In [167]:
x_train, y_train = train_set['final_cleaned_content'], train_set['annotation.labels']
x_test, y_test = test_set['final_cleaned_content'], test_set['annotation.labels']

In [168]:
# LabelEncoding
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [169]:
test_set['annotation.labels'].value_counts()

joy             10
trust           10
fear            10
surprise        10
anger           10
anticipation    10
sadness         10
disgust         10
other            2
Name: annotation.labels, dtype: int64

In [170]:
train_set.shape

(5825, 2)

In [171]:
# Creating training data pairs for learning similarity metric for Siamese network
tweets_left = []
tweets_right = []
target = []

labels = train_set['annotation.labels'].unique()
for label in labels:
    # 1) create similar categories pairs, with a corresponding target of 1
    similar_tweets = train_set[train_set['annotation.labels'] == label]['final_cleaned_content']
    
    # Pick 300 random pairs 
    group_pairs = list(itertools.combinations(similar_tweets, 2)) 
    positive_pairs = sample(group_pairs, 300) if len(group_pairs) >= 300 else group_pairs
    tweets_left.extend([p[0] for p in positive_pairs])
    tweets_right.extend([p[1] for p in positive_pairs])
    target.extend([1.]*len(positive_pairs))
    
    # 2) create pairs of examples with tweets from different categories, with a target set to 0
    other_tweets = train_set[train_set['annotation.labels'] != label]['final_cleaned_content']
    for i in range(len(positive_pairs)):
        tweets_left.append(np.random.choice(similar_tweets))
        tweets_right.append(np.random.choice(other_tweets))
        target.append(0.)

dataset = pd.DataFrame({
        'tweets_left': tweets_left,
        'tweets_right': tweets_right,
        'target': target
    }).sample(frac=1)  # Shuffle dataset

dataset.tail()

Unnamed: 0,tweets_left,tweets_right,target
3188,terus terang anda bilang agama di perjual,tambahin ahhh lg demam akrobatik gaya menj...,1.0
2541,nantikan bersama menteri kelautan dan perikan...,kelihatan kepalanya saja sama jam dinding saja...,1.0
668,terkadang ambisi terlampau besar melebihi ke...,dan saya mau tanya cash bck ribu itu berlaku a...,1.0
2224,lagi kampanye maka semua yang dilakukan sala...,gue pernah ke malaysia hawa disana itu pana...,0.0
4619,hai cfc loversnnkamu pecinta kartun doraemonnp...,sayang jatah gratis ongkir udah abis,0.0


In [183]:
dataset.shape

(4812, 3)

In [188]:
# Loading fasttext wv dictionary

import pickle

with open('./ft_wv.pickle', 'rb') as f_obj:
    wv_dictionary = pickle.load(f_obj)

In [190]:
# Text to sequence

tokenizer = Tokenizer(num_words=10000, lower=True, char_level=False)
tokenizer.fit_on_texts(dataset['tweets_left'].tolist() + dataset['tweets_right'].tolist())
word_seq_train_left = tokenizer.texts_to_sequences(dataset['tweets_left'].tolist())
word_seq_train_right = tokenizer.texts_to_sequences(dataset['tweets_right'].tolist())
word_index = tokenizer.word_index

In [191]:
# Padding

word_seq_train_left = sequence.pad_sequences(word_seq_train_left, maxlen=70)
word_seq_train_right = sequence.pad_sequences(word_seq_train_right, maxlen=70)


In [192]:
x_pairs = [word_seq_train_left, word_seq_train_right]
y_pairs = dataset['target'].values

In [195]:
#embedding matrix

words_not_found = []
nb_words = 10000
embed_dim = 300
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = wv_dictionary.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

number of null word embeddings: 1


In [196]:
def exponent_neg_manhattan_distance(arms_difference):
    """ Compute the exponent of the opposite of the L1 norm of a vector, to get the left/right inputs
    similarity from the inputs differences. This function is used to turned the unbounded
    L1 distance to a similarity measure between 0 and 1"""
    return K.exp(-K.sum(K.abs(arms_difference), axis=1, keepdims=True))

def siamese_lstm(max_length, embedding_matrix):
    """ Define, compile and return a siamese LSTM model """
    input_shape = (max_length,)
    left_input = Input(input_shape, name='left_input')
    right_input = Input(input_shape, name='right_input')

    # Define a single sequential model for both arms.
    # In this example I've chosen a simple bidirectional LSTM with no dropout
    seq = Sequential(name='sequential_network')
    seq.add(Embedding(nb_words, 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    seq.add(Bidirectional(LSTM(32, dropout=0., recurrent_dropout=0.)))
    
    left_output = seq(left_input)
    right_output = seq(right_input)

    # Here we subtract the neuron values of the last layer from the left arm 
    # with the corresponding values from the right arm
    subtracted = Subtract(name='pair_representations_difference')([left_output, right_output])
    malstm_distance = Lambda(exponent_neg_manhattan_distance, 
                             name='masltsm_distance')(subtracted)

    siamese_net = Model(inputs=[left_input, right_input], outputs=malstm_distance)
    siamese_net.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
    return siamese_net

siamese_lstm = siamese_lstm(70, embedding_matrix)

# Print a summary of the model mainly to know the number of trainable parameters
siamese_lstm.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
left_input (InputLayer)         (None, 70)           0                                            
__________________________________________________________________________________________________
right_input (InputLayer)        (None, 70)           0                                            
__________________________________________________________________________________________________
sequential_network (Sequential) (None, 64)           3085248     left_input[0][0]                 
                                                                 right_input[0][0]                
__________________________________________________________________________________________________
pair_representations_difference (None, 64)           0           sequential_network[1][0]   

In [197]:
siamese_lstm.fit(x_pairs, y_pairs, validation_split=0.2, epochs=6)

Train on 3849 samples, validate on 963 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0x1a430076d8>

In [198]:
x_references = tokenizer.texts_to_sequences(x_train.tolist()) # Preprocess the training set examples
x_references = sequence.pad_sequences(x_references, maxlen=70)

def get_prediction(tweet):
    """ Get the predicted tweet class, and the most similar tweet
    in the train set. """
    x = tokenizer.texts_to_sequences([tweet])
    x = sequence.pad_sequences(x, maxlen=70)
    # Compute similarities of the tweet with all tweets in the train set
    similarities = siamese_lstm.predict([[x[0]]*len(x_references), x_references])
    most_similar_index = np.argmax(similarities)
    
    # The predicted category is the one of the most similar example from the train set
    prediction = train_set['annotation.labels'].iloc[most_similar_index]
    most_similar_example = train_set['final_cleaned_content'].iloc[most_similar_index]
    return prediction, most_similar_example

In [212]:
sample_idx = 4
pred, most_sim = get_prediction(x_test[sample_idx])

print(f'Sampled test tweet: {x_test[sample_idx]}')
print(f'True Label: {test_set["annotation.labels"].iloc[sample_idx]}')
print(f'Label prediction: {pred}')
print(f'Most similar example in train set: {most_sim}') 

Sampled test tweet: smg lariss mnis trs bt hisanahfcndn suksess trss berjyaanend smgaa berkmbang biakk sluruh otlett dari sambang sampaii merooukee 
True Label: joy
Label prediction: joy
Most similar example in train set:  assalamualaikum bapakkbrny gmn semoga bapak ganjar sekeluarga diberikan kesehatanbanyak rejekita


In [204]:
# Predicting test accuracy

y_pred = [get_prediction(tweet)[0] for tweet in tqdm_notebook(test_set['final_cleaned_content'])]
accuracy = accuracy_score(le.transform(y_pred), y_test)

print(f'Test accuracy (siamese model): {100*accuracy:.2f} %')

HBox(children=(IntProgress(value=0, max=82), HTML(value='')))


Test accuracy (siamese model): 21.95 %


In [205]:
# Computing f1 score of the hold out set
f1_score = f1_score(le.transform(y_pred), y_test, average='macro')

In [206]:
f1_score

0.1724190818927661

In [208]:
# Classification Report
print(classification_report(y_pred, le.inverse_transform(y_test)))

              precision    recall  f1-score   support

       anger       0.20      0.18      0.19        11
anticipation       0.40      0.27      0.32        15
     disgust       0.20      0.40      0.27         5
        fear       0.00      0.00      0.00         4
         joy       0.50      0.25      0.33        20
       other       0.00      0.00      0.00         1
     sadness       0.20      0.22      0.21         9
    surprise       0.00      0.00      0.00         1
       trust       0.30      0.19      0.23        16

   micro avg       0.22      0.22      0.22        82
   macro avg       0.20      0.17      0.17        82
weighted avg       0.31      0.22      0.25        82

