In [57]:
import keras
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import dataframe_image as dfi

In [58]:
train_data_path = 'data/train_4000.csv'
test_data_path = 'data/test_4000.csv'
val_data_path = 'data/val_4000.csv'

train_data_path_news = 'data/train_4000_news.csv'
test_data_path_news = 'data/test_4000_news.csv'
val_data_path_news = 'data/val_4000_news.csv'

train_data_path_tweets = 'data/train_4000_tweets.csv'
test_data_path_tweets = 'data/test_4000_tweets.csv'
val_data_path_tweets = 'data/val_4000_tweets.csv'

first_n_words = 200

# Read raw data
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(test_data_path)
df_val = pd.read_csv(val_data_path)

df_train_news = pd.read_csv(train_data_path_news)
df_test_news = pd.read_csv(test_data_path_news)
df_val_news = pd.read_csv(val_data_path_news)
df_train_news[['label']] = df_train_news[['label']].replace(["negative", "positive"],[0, 1])
df_test_news[['label']] = df_test_news[['label']].replace(["negative", "positive"],[0, 1])
df_val_news[['label']] = df_val_news[['label']].replace(["negative", "positive"],[0, 1])


df_train_tweets = pd.read_csv(train_data_path_tweets)
df_test_tweets = pd.read_csv(test_data_path_tweets)
df_val_tweets = pd.read_csv(val_data_path_tweets)
df_train_tweets[['label']] = df_train_tweets[['label']].replace(["negative", "positive"],[0, 1])
df_test_tweets[['label']] = df_test_tweets[['label']].replace(["negative", "positive"],[0, 1])
df_val_tweets[['label']] = df_val_tweets[['label']].replace(["negative", "positive"],[0, 1])

# Take particular columns
train_sentences = df_train['sequence'].values
train_labels = df_train['label'].values

test_sentences = df_test['sequence'].values
test_labels = df_test['label'].values

val_sentences = df_val['sequence'].values
val_labels = df_val['label'].values

train_sentences_news = df_train_news['sequence'].values
train_labels_news = df_train_news['label'].values

test_sentences_news = df_test_news['sequence'].values
test_labels_news = df_test_news['label'].values

val_sentences_news = df_val_news['sequence'].values
val_labels_news = df_val_news['label'].values

train_sentences_tweets = df_train_tweets['sequence'].values
train_labels_tweets = df_train_tweets['label'].values

test_sentences_tweets = df_test_tweets['sequence'].values
test_labels_tweets = df_test_tweets['label'].values

val_sentences_tweets = df_val_tweets['sequence'].values
val_labels_tweets = df_val_tweets['label'].values


# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 100 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'

# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

def prepare_padded_data(sentences):
  # convert dataset to sequence and pad sequences
  all_sequences = tokenizer.texts_to_sequences(sentences)
  all_padded = pad_sequences(all_sequences, padding='post', maxlen=max_length)
  return all_sequences, all_padded

# Get all sentences squences and padding
train_sequences, train_padded = prepare_padded_data(train_sentences)
test_sequences, test_padded = prepare_padded_data(test_sentences)
val_sequences, val_padded = prepare_padded_data(val_sentences)

train_sequences_news, train_padded_news = prepare_padded_data(train_sentences_news)
test_sequences_news, test_padded_news = prepare_padded_data(test_sentences_news)
val_sequences_news, val_padded_news = prepare_padded_data(val_sentences_news)

train_sequences_tweets, train_padded_tweets = prepare_padded_data(train_sentences_tweets)
test_sequences_tweets, test_padded_tweets = prepare_padded_data(test_sentences_tweets)
val_sequences_tweets, val_padded_tweets = prepare_padded_data(val_sentences_tweets)


In [59]:
all_results = []

def calculate_scores(padded_sequence, true_labels, required_set, dataset):
  # Checking reconstruct the model identically.
  model_check = keras.models.load_model("checkpoints/lstm-base-uncased_4000_0_best.h5")

  prediction = model_check.predict(padded_sequence)
  # Get labels based on probability 1 if p>= 0.5 else 0

  pred_labels = []
  for i in prediction:
      if i >= 0.5:
          pred_labels.append(1)
      else:
          pred_labels.append(0)
  acc = accuracy_score(true_labels, pred_labels)
  f1score = f1_score(true_labels, pred_labels)
  pre = precision_score(true_labels, pred_labels)
  rec = recall_score(true_labels, pred_labels)


  dict = {'Dataset': dataset, 'TTV': required_set, 'Accuracy': acc, 'F1 Score': f1score, 'Precision': pre, 'Recall': rec}
  all_results.append(dict)

In [60]:
calculate_scores(train_padded, train_labels, 'Train', 'All Data')
calculate_scores(test_padded, test_labels, 'Test', 'All Data')
calculate_scores(val_padded, val_labels, 'Val', 'All Data')



In [61]:
calculate_scores(train_padded_news, train_labels_news, 'Train', 'NewsMTSC Split')
calculate_scores(test_padded_news, test_labels_news, 'Test', 'NewsMTSC Split')
calculate_scores(val_padded_news, val_labels_news, 'Val', 'NewsMTSC Split')



In [62]:
calculate_scores(train_padded_tweets, train_labels_tweets, 'Train', 'Sentiment140 Split')
calculate_scores(test_padded_tweets, test_labels_tweets, 'Test', 'Sentiment140 Split')
calculate_scores(val_padded_tweets, val_labels_tweets, 'Val', 'Sentiment140 Split')



In [63]:
all_results

[{'Dataset': 'All Data',
  'TTV': 'Train',
  'Accuracy': 0.8907142857142857,
  'F1 Score': 0.8760631834750909,
  'Precision': 0.9173027989821882,
  'Recall': 0.8383720930232558},
 {'Dataset': 'All Data',
  'TTV': 'Test',
  'Accuracy': 0.704375,
  'F1 Score': 0.6451612903225806,
  'Precision': 0.7142857142857143,
  'Recall': 0.5882352941176471},
 {'Dataset': 'All Data',
  'TTV': 'Val',
  'Accuracy': 0.6825,
  'F1 Score': 0.6231454005934719,
  'Precision': 0.7094594594594594,
  'Recall': 0.5555555555555556},
 {'Dataset': 'NewsMTSC Split',
  'TTV': 'Train',
  'Accuracy': 0.8892857142857142,
  'F1 Score': 0.8617305976806422,
  'Precision': 0.9036482694106641,
  'Recall': 0.8235294117647058},
 {'Dataset': 'NewsMTSC Split',
  'TTV': 'Test',
  'Accuracy': 0.69375,
  'F1 Score': 0.6067415730337079,
  'Precision': 0.6823104693140795,
  'Recall': 0.546242774566474},
 {'Dataset': 'NewsMTSC Split',
  'TTV': 'Val',
  'Accuracy': 0.6925,
  'F1 Score': 0.594059405940594,
  'Precision': 0.708661417322

In [64]:
df = pd.DataFrame(all_results)

In [65]:
df

Unnamed: 0,Dataset,TTV,Accuracy,F1 Score,Precision,Recall
0,All Data,Train,0.890714,0.876063,0.917303,0.838372
1,All Data,Test,0.704375,0.645161,0.714286,0.588235
2,All Data,Val,0.6825,0.623145,0.709459,0.555556
3,NewsMTSC Split,Train,0.889286,0.861731,0.903648,0.823529
4,NewsMTSC Split,Test,0.69375,0.606742,0.68231,0.546243
5,NewsMTSC Split,Val,0.6925,0.594059,0.708661,0.511364
6,Sentiment140 Split,Train,0.892143,0.887982,0.928627,0.850746
7,Sentiment140 Split,Test,0.715,0.678873,0.741538,0.625974
8,Sentiment140 Split,Val,0.6725,0.6469,0.710059,0.594059


In [66]:
df = pd.DataFrame(columns=pd.MultiIndex.from_product([['All Data', 'Sentiment140 Split', 'NewsMTSC Split'], ['Accuracy', 'F1']]),
                  index=['Train', 'Val', 'Test'])

# Add data to the dataframe
for d in all_results:
    df.loc[d['TTV'], (d['Dataset'], 'Accuracy')] = d['Accuracy'].astype(str)[:4]
    df.loc[d['TTV'], (d['Dataset'], 'F1')] = d['F1 Score'].astype(str)[:4]
    # df.loc[d['TTV'], (d['Dataset'], 'Precision')] = d['Precision']
    # df.loc[d['TTV'], (d['Dataset'], 'Recall')] = d['Recall']



# Reset the index to default
df = df.rename(index={'Val': 'Validation'})
df = df.rename_axis('Data Split')

In [67]:
df

Unnamed: 0_level_0,All Data,All Data,Sentiment140 Split,Sentiment140 Split,NewsMTSC Split,NewsMTSC Split
Unnamed: 0_level_1,Accuracy,F1,Accuracy,F1,Accuracy,F1
Data Split,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Train,0.89,0.87,0.89,0.88,0.88,0.86
Validation,0.68,0.62,0.67,0.64,0.69,0.59
Test,0.7,0.64,0.71,0.67,0.69,0.6


In [68]:
df = df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
df.set_properties(**{'text-align': 'center'})
dfi.export(df, 'results/lstm_results_table.png')

objc[7953]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x23c14b1a8) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/112.0.5615.49/Libraries/libGLESv2.dylib (0x107ac7c58). One of the two will be used. Which one is undefined.
26930 bytes written to file /var/folders/0f/x7d___yn57q2g_3z88pt79c00000gn/T/tmpygvaxdmn/temp.png
