In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive/', force_remount=True)
# %cd gdrive/MyDrive/BT4222 Project Group/Codes/

Mounted at /content/gdrive/
/content/gdrive/.shortcut-targets-by-id/1dXwjUxZF5kudup3owQmdj8MrIytqxvzx/BT4222 Project Group/Codes


In [None]:
# imports

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, f1_score
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from keras import backend as K

In [None]:
# deep learning model classes

class LSTM:
  """Stacked-LSTM binary classifier with a minimal fit/predict/evaluate API.

  The Keras network is built inside `fit`, because its input length is taken
  from the number of columns of the training matrix.
  """

  def __init__(self):
    # Set by fit(); predict() and evaluate() need fit() to have run first.
    self.model = None

  def fit(self, X_train, y_train):
    """Build, compile and train a fresh network.

    Args:
      X_train: training features; must expose `.shape[1]`, which becomes the
        sequence length (each column is fed as one step with one channel).
      y_train: binary targets, one per training sample.
    """
    model = keras.Sequential([
        layers.Input(shape=(X_train.shape[1],1)),
        layers.SpatialDropout1D(0.2),
        layers.LSTM(200, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        # The last recurrent layer returns only its final state, so the head
        # below emits one probability per sample. With return_sequences=True
        # here the network produced one output per time step, which does not
        # match the per-sample labels.
        layers.LSTM(200, recurrent_dropout=0.2),
        layers.Dense(300, activation='relu'),
        layers.Dense(300, activation='relu'),
        layers.Dense(1, activation='sigmoid')
      ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])
    model.fit(X_train, y_train, batch_size=200, epochs=6, verbose=0)
    self.model = model

  def predict(self, X_test):
    """Return the network's sigmoid outputs for X_test (not class labels)."""
    return self.model.predict(X_test)

  def evaluate(self, X_test, y_test):
    """Return Keras' evaluation result: loss followed by the compiled metrics
    (accuracy, then f1_m)."""
    return self.model.evaluate(X_test, y_test, verbose=0)

class CNN:
  """1-D convolutional binary classifier with a minimal fit/predict/evaluate API."""

  def __init__(self):
    # Filled in by fit().
    self.model = None

  def fit(self, X_train, y_train):
    """Assemble, compile and train a new network on the given data.

    The input length is read from `X_train.shape[1]`; every column is treated
    as one position of a single-channel sequence.
    """
    sequence_length = X_train.shape[1]
    stack = [
        layers.Input(shape=(sequence_length, 1)),
        # convolutional feature extractor, collapsed by a global max-pool
        layers.Conv1D(
            filters=128,
            kernel_size=5,
            strides=1,
            activation='relu',
            padding='same',
        ),
        layers.GlobalMaxPooling1D(),
        # fully connected part with dropout after each hidden layer
        layers.Dense(500, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(500, activation='relu'),
        layers.Dropout(0.3),
        # single sigmoid unit: probability of the positive class
        layers.Dense(1, activation='sigmoid', name='Output'),
    ]
    network = keras.Sequential(stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', f1_m],
    )
    network.fit(X_train, y_train, batch_size=200, epochs=6, verbose=0)
    self.model = network

  def predict(self, X_test):
    """Return the trained network's raw sigmoid outputs for X_test."""
    return self.model.predict(X_test)

  def evaluate(self, X_test, y_test):
    """Return Keras' evaluation result (loss, then accuracy, then f1_m)."""
    return self.model.evaluate(X_test, y_test, verbose=0)

In [None]:
# common functions

def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Both the element-wise product and the targets are clipped to [0, 1] and
    rounded before being summed; K.epsilon() keeps the denominator non-zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    positive_count = K.sum(positive_mask)
    return hit_count / (positive_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    The element-wise product and the predictions are clipped to [0, 1] and
    rounded before being summed; K.epsilon() keeps the denominator non-zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    flagged_count = K.sum(flagged_mask)
    return hit_count / (flagged_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so the ratio stays defined when
    precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

def embeddings_evaluation(all_embeddings, model):
  """Train and score `model` once per embedding and tabulate the results.

  Args:
    all_embeddings: mapping of embedding name -> dict holding the keys
      'X_train', 'y_train', 'X_test' and 'y_test'.
    model: object with `fit(X, y)` and `predict(X)`. When scoring its
      predictions with scikit-learn raises ValueError and the model also has
      an `evaluate(X, y)` method, that method is used instead; its result is
      read as [loss, accuracy, f1].

  Returns:
    DataFrame with the columns 'Embedding', 'Accuracy' and 'F1-score', one row
    per evaluated embedding. An embedding whose evaluation fails is reported
    with NaN scores (the error is printed) instead of aborting the run.
  """
  columns = ['Embedding', 'Accuracy', 'F1-score']
  records = []
  for name, embedding in all_embeddings.items():
    # The TF-IDF features are not evaluated with the deep learning models.
    if name == 'TF-IDF w/ Bigram' and isinstance(model, (LSTM, CNN)):
      continue
    # Start from NaN so a failure can never report a previous embedding's scores.
    acc = f1 = float('nan')
    try:
      X_train = embedding['X_train']
      y_train = embedding['y_train']
      X_test = embedding['X_test']
      y_test = embedding['y_test']
      model.fit(X_train, y_train)
      try:
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
      except ValueError:
        # Predictions that scikit-learn cannot score: use the model's own
        # evaluation when it has one, otherwise treat it as a real failure.
        if not hasattr(model, 'evaluate'):
          raise
        scores = model.evaluate(X_test, y_test)
        acc, f1 = scores[1], scores[2]
    except Exception as e:
      # Best effort: keep evaluating the remaining embeddings.
      print(f'[{name}] evaluation failed: {e}')
      acc = f1 = float('nan')
    records.append({'Embedding': name, 'Accuracy': acc, 'F1-score': f1})
  # Build the frame once; DataFrame.append was removed in pandas 2.0.
  return pd.DataFrame(records, columns=columns)

In [None]:
# Positions of the training rows used in every experiment below. A dedicated,
# seeded generator makes the subsample identical on every run, so results are
# comparable across models and across kernel restarts.
N_TRAIN_ROWS = 110248     # size of the range the row positions are drawn from
N_SAMPLED_ROWS = 40000    # number of distinct row positions to keep
RANDOM_SEED = 42
rdn_index = random.Random(RANDOM_SEED).sample(range(N_TRAIN_ROWS), N_SAMPLED_ROWS)

In [None]:
def _load_split(train_csv, test_csv):
  """Read one embedding's train/test CSV pair.

  Only the training rows at the positions in `rdn_index` are kept; the test
  file is returned in full.
  """
  train_part = pd.read_csv(train_csv).iloc[rdn_index]
  test_part = pd.read_csv(test_csv)
  return train_part, test_part


# Word2Vec: skip-gram
X_train_sg, X_test_sg = _load_split(
    '../Word Embedding/emb_sg_train.csv',
    '../Word Embedding/emb_sg_test.csv',
)

# Word2Vec: CBOW
X_train_cbow, X_test_cbow = _load_split(
    '../Word Embedding/emb_cbow_train.csv',
    '../Word Embedding/emb_cbow_test.csv',
)

# Doc2Vec: DBOW
X_train_dbow, X_test_dbow = _load_split(
    '../Word Embedding/emb_dbow_train.csv',
    '../Word Embedding/emb_dbow_test.csv',
)

# Doc2Vec: DM
X_train_dm, X_test_dm = _load_split(
    '../Word Embedding/emb_dm_train.csv',
    '../Word Embedding/emb_dm_test.csv',
)

# Doc2Vec: DBOW + DM
X_train_dbow_dm, X_test_dbow_dm = _load_split(
    '../Word Embedding/emb_dbow_dm_train.csv',
    '../Word Embedding/emb_dbow_dm_test.csv',
)

# Google's Word2Vec
X_train_ggl, X_test_ggl = _load_split(
    '../Word Embedding/emb_ggl_train.csv',
    '../Word Embedding/emb_ggl_test.csv',
)

# Stanford's GloVe
X_train_glove, X_test_glove = _load_split(
    '../Word Embedding/emb_glove_train.csv',
    '../Word Embedding/emb_glove_test.csv',
)

# DistilBERT
X_train_bert, X_test_bert = _load_split(
    '../Word Embedding/emb_bert_train.csv',
    '../Word Embedding/emb_bert_test.csv',
)

# TF-IDF over bigrams: the vectorizer is fitted on all training posts and the
# resulting matrix is then cut down to the sampled rows.
train_untokenized_posts = pd.read_csv('../Data/X_train.csv')['processed_str'].to_numpy()
test_untokenized_posts = pd.read_csv('../Data/X_test.csv')['processed_str'].to_numpy()
vectorizer = TfidfVectorizer(ngram_range=(2, 2))
train_tfidf_full = vectorizer.fit_transform(train_untokenized_posts)
X_train_tfidf = train_tfidf_full[rdn_index]
X_test_tfidf = vectorizer.transform(test_untokenized_posts)

# Labels, restricted to the same sampled training rows.
# NOTE(review): the labels are read from './Data/' while the posts above come
# from '../Data/' - confirm both locations are intended.
train_labels_full = pd.read_csv('./Data/y_train.csv')['class']
y_train = train_labels_full.iloc[rdn_index]
y_test = pd.read_csv('./Data/y_test.csv')['class']

In [None]:
# All embeddings: (display name, training features, test features).
# Every entry shares the same y_train / y_test label series.
_feature_sets = [
    ('Word2Vec: Skip-Gram', X_train_sg, X_test_sg),
    ('Word2Vec: CBOW', X_train_cbow, X_test_cbow),
    ('Doc2Vec: DBOW', X_train_dbow, X_test_dbow),
    ('Doc2Vec: DM', X_train_dm, X_test_dm),
    ('Doc2Vec: DBOW+DM', X_train_dbow_dm, X_test_dbow_dm),
    ("Google's Word2Vec", X_train_ggl, X_test_ggl),
    ('GloVe', X_train_glove, X_test_glove),
    ('Bert-Distilled', X_train_bert, X_test_bert),
    ('TF-IDF w/ Bigram', X_train_tfidf, X_test_tfidf),
]

embedding_dict = {
    label: {
        'X_train': train_features,
        'y_train': y_train,
        'X_test': test_features,
        'y_test': y_test,
    }
    for label, train_features, test_features in _feature_sets
}

In [None]:
# Display name -> model instance; the insertion order is the evaluation order.
models_to_evaluate = {}
models_to_evaluate['LSTM'] = LSTM()
models_to_evaluate['CNN'] = CNN()
models_to_evaluate['Random Forest'] = RandomForestClassifier()
models_to_evaluate['Logistic Regression'] = LogisticRegression(max_iter=10000)
models_to_evaluate['SVM'] = SVC()

In [None]:
# Score every model on every embedding; keep each model's score table by name
# and echo it as soon as it is available.
evaluation_result = {}

for model_name in models_to_evaluate:
  model = models_to_evaluate[model_name]
  scores = embeddings_evaluation(embedding_dict, model)
  evaluation_result[model_name] = scores
  print(model_name)
  print(scores, '\n')

In [None]:
# Append each model's score table to the plain-text results file.
# The file is opened in append mode, so re-running this cell adds the tables
# again rather than replacing the earlier ones.
with open('Embedding Results.txt', 'a') as results_file:
  for model_name, scores in evaluation_result.items():
    results_file.writelines([f'{model_name}\n', f'{scores}\n\n'])

In [None]:
# The scores were transferred into an Excel sheet; load that sheet back in.
import pandas as pd

embedding_scores_df = pd.read_excel(
    '../Embedding Results.xlsx',
    header=[0, 1],  # two header rows: model name on top, metric name below
)
embedding_scores_df

Unnamed: 0_level_0,Model,LSTM,LSTM,CNN,CNN,Random Forest,Random Forest,Logistic Regression,Logistic Regression,SVM,SVM
Unnamed: 0_level_1,Metrics,Accuracy,F1-Score,Accuracy,F1-Score,Accuracy,F1-Score,Accuracy,F1-Score,Accuracy,F1-Score
0,Word2Vec: Skip-Gram,0.847859,0.847525,0.874537,0.826418,0.901277,0.900677,0.908388,0.907801,0.882991,0.880982
1,Word2Vec: CBOW,0.865441,0.848034,0.866338,0.821691,0.900551,0.900214,0.906719,0.906069,0.902438,0.901698
2,Doc2Vec: DBOW,0.607503,0.000293,0.607612,0.000841,0.669763,0.657508,0.631413,0.585238,0.607503,0.459172
3,Doc2Vec: DM,0.651267,0.661997,0.677636,0.543574,0.696575,0.685189,0.80201,0.793093,0.789892,0.787094
4,Doc2Vec: DBOW+DM,0.63585,0.803589,0.607612,0.000841,0.707931,0.691726,0.805566,0.797999,0.778245,0.75821
5,Google's Word2Vec,0.708747,0.80362,0.816523,0.763112,0.852224,0.849381,0.886002,0.884934,0.894021,0.893228
6,GloVe,0.798756,0.703097,0.780785,0.729949,0.842718,0.840331,0.854038,0.851934,0.856687,0.854773
7,Bert-Distilled,0.607503,0.810928,0.76395,0.608729,0.859988,0.858037,0.903635,0.903306,0.895399,0.894817
8,TF-IDF w/ Bigram,,,,,0.858719,0.856053,0.851462,0.847206,0.856687,0.85275
