In [62]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
from collections import Counter
import tensorflow_addons as tfa

In [39]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/anry/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [63]:
#https://medium.com/swlh/multi-label-text-classification-with-scikit-learn-and-tensorflow-257f9ee30536
def decontract(sentence):
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence

def removePunctuation(sentence): 
    sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)
    sentence = sentence.strip()
    sentence = sentence.replace("\n"," ")
    return sentence

def removeNumber(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', '', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeStopWords(sentence):
    stop_words = set(stopwords.words('english'))
    sentence = sentence.split(' ')
    filtered_sentence = [w for w in sentence if not w.lower() in stop_words]
    return " ".join(filtered_sentence)
    

def stemming(sentence):
    stemmer = SnowballStemmer("english")
    stemmedSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemmedSentence += stem
        stemmedSentence += " "
    stemmedSentence = stemmedSentence.strip()
    return stemmedSentence

In [83]:
datasets = pd.read_csv('../datasets/train_preprocessed.csv').dropna()
X_train,y_train = datasets["content"].astype("string"),datasets["sentiment"].astype("category").cat.codes

maxlen = max(X_train.apply(lambda x:len(x)))+1

In [65]:
# X_train = X_train.apply(lambda x: decontract(x))
# X_train = X_train.apply(lambda x: removePunctuation(x))
# X_train = X_train.apply(lambda x: removeNumber(x))
# X_train = X_train.apply(lambda x: removeStopWords(x))
# X_train = X_train.apply(lambda x: stemming(x))

In [84]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000, lower=True)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=maxlen)

In [55]:
# # #class weights
# # counts = Counter(y_train)
# # weights = {i:1/j for i,j in counts.items()}
# #f1 loss
# def macro_double_soft_f1(y, y_hat):
#     """Compute the macro soft F1-score as a cost (average 1 - soft-F1 across all labels).
#     Use probability values instead of binary predictions.
#     This version uses the computation of soft-F1 for both positive and negative class for each label.
    
#     Args:
#         y (int32 Tensor): targets array of shape (BATCH_SIZE, N_LABELS)
#         y_hat (float32 Tensor): probability matrix from forward propagation of shape (BATCH_SIZE, N_LABELS)
        
#     Returns:
#         cost (scalar Tensor): value of the cost function for the batch
#     """
#     y = tf.cast(y, tf.float32)
#     y_hat = tf.cast(y_hat, tf.float32)
#     tp = tf.reduce_sum(y_hat * y, axis=0)
#     fp = tf.reduce_sum(y_hat * (1 - y), axis=0)
#     fn = tf.reduce_sum((1 - y_hat) * y, axis=0)
#     tn = tf.reduce_sum((1 - y_hat) * (1 - y), axis=0)
#     soft_f1_class1 = 2*tp / (2*tp + fn + fp + 1e-16)
#     soft_f1_class0 = 2*tn / (2*tn + fn + fp + 1e-16)
#     cost_class1 = 1 - soft_f1_class1 # reduce 1 - soft-f1_class1 in order to increase soft-f1 on class 1
#     cost_class0 = 1 - soft_f1_class0 # reduce 1 - soft-f1_class0 in order to increase soft-f1 on class 0
#     cost = 0.5 * (cost_class1 + cost_class0) # take into account both class 1 and class 0
#     macro_cost = tf.reduce_mean(cost) # average on all labels
#     return macro_cost

In [67]:
import tensorflow as tf
from tensorflow.keras.layers import Flatten, LSTM, Input,Embedding,Dense
from tensorflow.keras.models import Model

max_words =10000

deep_inputs = Input(shape=(maxlen,))

embedding_layer = Embedding(max_words,120, trainable=True,)(deep_inputs)# weights=[embedding_matrix],


LSTM_Layer_1 = LSTM(
    units = 120,
    activation="tanh",
    name = 'lstm_layer_1',
    recurrent_activation="sigmoid",
    use_bias=True,
    dropout=0.1,
    return_sequences=True)(embedding_layer)

LSTM_Layer_2 = LSTM(
    units = 120,
    activation="tanh",
    name = 'lstm_layer_2',
    recurrent_activation="sigmoid",
    use_bias=True,
    dropout=0.1)(LSTM_Layer_1)


dense_layer_1 = Dense(13, activation='softmax')(LSTM_Layer_2)#softmax because we have multi-class classification
model = Model(inputs=deep_inputs, outputs=dense_layer_1)


callbacks = [
    tf.keras.callbacks.ModelCheckpoint(filepath="./checkpoints/lstm_augmented/",save_best_only=True,save_weights_only=False)
]

model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
              metrics=[  
                  tf.keras.metrics.TruePositives(name='tp'),
                  tf.keras.metrics.FalsePositives(name='fp'),
                  tf.keras.metrics.TrueNegatives(name='tn'),
                  tf.keras.metrics.FalseNegatives(name='fn'), 
                  tf.keras.metrics.Precision(name='precision'),
                  tf.keras.metrics.Recall(name='recall'),
                  tf.keras.metrics.CategoricalAccuracy(name='acc'),
                  tf.keras.metrics.AUC(name='auc'),
                  tfa.metrics.F1Score(num_classes=13,average='macro',threshold = 0.5)
              ])

In [68]:
model.fit(X_train,pd.get_dummies(y_train),validation_split=0.2,batch_size=32,epochs = 2,workers = -1,callbacks = callbacks)

Epoch 1/2



INFO:tensorflow:Assets written to: ./checkpoints/lstm_augmented/assets


INFO:tensorflow:Assets written to: ./checkpoints/lstm_augmented/assets


Epoch 2/2



INFO:tensorflow:Assets written to: ./checkpoints/lstm_augmented/assets


INFO:tensorflow:Assets written to: ./checkpoints/lstm_augmented/assets




<keras.callbacks.History at 0x7f8a4111fd30>

In [69]:
model.save_weights('./weights/lstm_augmented/saved_weights.h5')

In [70]:
tf.keras.models.save_model(model,'./full_models/lstm_augmented')



INFO:tensorflow:Assets written to: ./full_models/lstm_augmented/assets


INFO:tensorflow:Assets written to: ./full_models/lstm_augmented/assets


# Metrics

In [85]:
from sklearn.metrics import classification_report
import numpy as np

datasets = pd.read_csv('../datasets/test_preprocessed.csv').dropna()
X_test,y_test = datasets["content"].astype("string"),datasets["sentiment"].astype("category").cat.codes
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [88]:
#normal lstm
model.load_weights('weights/lstm_without_weights/saved_weights.h5')
y_pred = tf.argmax(model.predict(X_test),axis = 1)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.00      0.00      0.00       118
           2       0.00      0.00      0.00       538
           3       0.00      0.00      0.00       501
           4       0.00      0.00      0.00       586
           5       0.15      0.26      0.19      1719
           6       0.00      0.00      0.00       873
           7       0.14      0.10      0.12      1268
           8       0.25      0.50      0.33      2825
           9       0.00      0.00      0.00      1007
          10       0.13      0.06      0.09      1704
          11       0.00      0.00      0.00       722
          12       0.21      0.32      0.25      2790

    accuracy                           0.20     14724
   macro avg       0.07      0.10      0.08     14724
weighted avg       0.13      0.20      0.15     14724



In [89]:
#weighted lstm
model.load_weights('weights/lstm_with_weights/saved_weights.h5')
y_pred = tf.argmax(model.predict(X_test),axis = 1)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.01      0.04      0.01        73
           1       0.01      0.08      0.02       118
           2       0.03      0.03      0.03       538
           3       0.00      0.00      0.00       501
           4       0.04      0.18      0.07       586
           5       0.15      0.05      0.08      1719
           6       0.09      0.03      0.04       873
           7       0.12      0.24      0.16      1268
           8       0.26      0.38      0.31      2825
           9       0.07      0.05      0.06      1007
          10       0.14      0.15      0.14      1704
          11       0.07      0.03      0.04       722
          12       0.21      0.01      0.02      2790

    accuracy                           0.13     14724
   macro avg       0.09      0.10      0.08     14724
weighted avg       0.15      0.13      0.11     14724



In [90]:
#f1 loss lstm
model.load_weights('weights/lstm_f1_loss/saved_weights.h5')
y_pred = tf.argmax(model.predict(X_test),axis = 1)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.00      0.00      0.00       118
           2       0.00      0.00      0.00       538
           3       0.00      0.00      0.00       501
           4       0.05      0.06      0.05       586
           5       0.14      0.20      0.17      1719
           6       0.14      0.00      0.01       873
           7       0.16      0.09      0.12      1268
           8       0.24      0.42      0.30      2825
           9       0.09      0.07      0.08      1007
          10       0.13      0.09      0.10      1704
          11       0.03      0.02      0.03       722
          12       0.22      0.26      0.24      2790

    accuracy                           0.18     14724
   macro avg       0.09      0.09      0.08     14724
weighted avg       0.15      0.18      0.15     14724



In [91]:
#augmented lstm
model.load_weights('weights/lstm_f1_loss/saved_weights.h5')
y_pred = tf.argmax(model.predict(X_test),axis = 1)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        73
           1       0.00      0.00      0.00       118
           2       0.00      0.00      0.00       538
           3       0.00      0.00      0.00       501
           4       0.05      0.06      0.05       586
           5       0.14      0.20      0.17      1719
           6       0.14      0.00      0.01       873
           7       0.16      0.09      0.12      1268
           8       0.24      0.42      0.30      2825
           9       0.09      0.07      0.08      1007
          10       0.13      0.09      0.10      1704
          11       0.03      0.02      0.03       722
          12       0.22      0.26      0.24      2790

    accuracy                           0.18     14724
   macro avg       0.09      0.09      0.08     14724
weighted avg       0.15      0.18      0.15     14724

