In [1]:
import numpy as np 
import pandas as pd
import collections
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn import utils

import gensim
import gensim.downloader as api
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import common_texts, get_tmpfile
from gensim.utils import simple_preprocess
from gensim import corpora
from pprint import pprint

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.models import load_model, save_model
from sklearn.metrics import  accuracy_score, precision_score, recall_score, f1_score


import string
import re
import logging
import pickle
from multiprocessing import cpu_count
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
def load_data(train_path='/content/drive/My Drive/nlp_dataset/P1_training.xlsx',
              test_path='/content/drive/My Drive/nlp_dataset/P1_testing.xlsx'):
  """Read the training and testing spreadsheets into DataFrames.

  Args:
    train_path: Location of the training Excel file. Defaults to the
      Google Drive path used throughout this notebook.
    test_path: Location of the testing Excel file. Defaults to the
      Google Drive path used throughout this notebook.

  Returns:
    A `(df, df_test)` tuple of pandas DataFrames. The rest of the notebook
    reads the `sentence` and `label` columns from both frames.
  """
  df = pd.read_excel(train_path)
  df_test = pd.read_excel(test_path)
  return (df, df_test)

In [3]:
class TextProcessing(BaseEstimator, TransformerMixin):
  """Stateless scikit-learn transformer that normalises raw sentences.

  For every sentence, punctuation is replaced by spaces, runs of digits are
  removed and the text is lower-cased, in that order. Nothing is learned in
  `fit`, so the same instance can be applied to training and test data.
  """

  # Maps every character of string.punctuation to a single space.
  _PUNCT_TABLE = str.maketrans(string.punctuation, " " * len(string.punctuation))
  # Raw string: '\d' in a normal string literal is an invalid escape sequence.
  _DIGIT_RUN = re.compile(r'\d+')

  def remove_punctuations(self, input_text):
    """Return `input_text` with each punctuation character replaced by a space."""
    return input_text.translate(self._PUNCT_TABLE)

  def remove_digits(self, input_text):
    """Return `input_text` with every run of digits deleted."""
    return self._DIGIT_RUN.sub('', input_text)

  def converto_lower(self, input_text):
    """Return `input_text` lower-cased."""
    return input_text.lower()

  def _clean(self, input_text):
    """Apply the three cleaning steps to one sentence, in pipeline order."""
    no_punct = self.remove_punctuations(input_text)
    no_digits = self.remove_digits(no_punct)
    return self.converto_lower(no_digits)

  def fit(self, X, y=None, **fit_params):
    """No-op; present only to satisfy the scikit-learn transformer API."""
    return self

  def transform(self, X, **transform_params):
    """Clean every sentence in `X`.

    Args:
      X: pandas Series of strings. Any other iterable of strings is wrapped
        in a Series first.

    Returns:
      A pandas Series of cleaned strings with the same index as `X`.
      Stop-word removal, stemming and lemmatization are not applied.
    """
    if not hasattr(X, 'apply'):
      X = pd.Series(X)
    return X.apply(self._clean)

In [4]:
def tfidf_word_embedding(df):
  """Train a TF-IDF + logistic-regression classifier.

  Args:
    df: Training DataFrame with a `sentence` (text) and a `label` column.

  Returns:
    A `(tfidf_vectorizer, model)` tuple: the fitted `TfidfVectorizer` and a
    `LogisticRegression` trained on all of `df` with the `C` value selected
    by grid search.
  """
  X_train = TextProcessing().fit_transform(df['sentence'])
  y_train = df['label']

  tfidf_vectorizer = TfidfVectorizer()
  X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

  # Grid search over the regularisation strength.
  # NOTE(review): the vectorizer is fitted on the full training set before
  # cross-validation, so the CV folds share vocabulary/IDF statistics.
  param_grid_LR = {'C': [0.01, 0.05, 0.25, 0.5, 0.1]}
  clf_LR = GridSearchCV(LogisticRegression(max_iter=500, class_weight='balanced'),
                        param_grid=param_grid_LR)
  clf_LR.fit(X_train_tfidf, y_train)

  # GridSearchCV refits the best configuration on the whole training set by
  # default (refit=True), so training a second identical model is unnecessary.
  model = clf_LR.best_estimator_

  return (tfidf_vectorizer, model)

In [5]:
def compute_avg_vector(w2v_dict, sentence, vector_size=None):
  """Average the word vectors of the known tokens in a sentence.

  Args:
    w2v_dict: Mapping from token to vector. It must support `token in w2v_dict`
      and `w2v_dict[token]` (e.g. gensim KeyedVectors or a plain dict).
    sentence: Iterable of tokens.
    vector_size: Length of the zero vector returned when no token is known.
      Defaults to `w2v_dict.vector_size` when that attribute exists, else 300.

  Returns:
    The element-wise mean of the vectors of all tokens found in `w2v_dict`,
    or a zero vector of length `vector_size` when none are found.
  """
  # Membership is tested on the mapping itself rather than on `.vocab`, which
  # newer gensim releases no longer provide.
  word_vectors = [w2v_dict[w] for w in sentence if w in w2v_dict]

  if not word_vectors:
    if vector_size is None:
      vector_size = getattr(w2v_dict, 'vector_size', 300)
    # Previously this branch fell through and returned None because the
    # `return` statement was nested inside the `else` block.
    return np.zeros(vector_size)

  return np.sum(word_vectors, axis=0) / len(word_vectors)

In [6]:
def glove_w2v_embedding(df):
  """Train a logistic-regression classifier on averaged GloVe vectors.

  Relies on the module-level `glove_w2v_model`, which must be loaded before
  this function is called.

  Args:
    df: Training DataFrame with a `sentence` (text) and a `label` column.

  Returns:
    A `LogisticRegression` trained on all of `df` with the `C` value selected
    by grid search.
  """
  X_train = TextProcessing().fit_transform(df['sentence'])
  y_train = df['label']

  # Tokenise each cleaned sentence, then turn it into one averaged vector.
  tokenized_list = [simple_preprocess(sentence) for sentence in X_train.values.tolist()]
  X_train_w2v = pd.DataFrame(
      [compute_avg_vector(glove_w2v_model, tokens) for tokens in tokenized_list],
      index=X_train.index)

  # Grid search over the regularisation strength.
  param_grid_LR = {'C': [0.01, 0.05, 0.25, 0.5, 0.1]}
  clf_LR = GridSearchCV(LogisticRegression(max_iter=500, class_weight='balanced'),
                        param_grid=param_grid_LR)
  clf_LR.fit(X_train_w2v, y_train)

  # GridSearchCV refits the best configuration on the whole training set by
  # default (refit=True), so training a second identical model is unnecessary.
  model = clf_LR.best_estimator_

  return model

In [7]:
def universal_sentence_encoder(df, classification='lr', plot_training=False):
  """Train a classifier on Universal Sentence Encoder embeddings.

  Relies on the module-level `embed` callable, which must be loaded before
  this function is called. Sentences are embedded as-is (no text cleaning).

  Args:
    df: Training DataFrame with a `sentence` (text) and a `label` column.
    classification: `'lr'` for grid-searched logistic regression, `'nn'` for
      a small feed-forward Keras network.
    plot_training: When True and `classification == 'nn'`, plot the training
      and validation loss/accuracy curves.

  Returns:
    The trained model: a `LogisticRegression` for `'lr'`, a compiled and
    fitted `keras.Sequential` for `'nn'`.

  Raises:
    ValueError: If `classification` is neither `'lr'` nor `'nn'`.
  """
  # Fail before the expensive embedding pass instead of hitting an unbound
  # `model` at the final return.
  if classification not in ('lr', 'nn'):
    raise ValueError(
        f"classification must be 'lr' or 'nn', got {classification!r}")

  X_train, y_train = df['sentence'], df['label']

  # Embed one sentence at a time and flatten each result to a 1-D vector.
  X_train_vectors = np.array(
      [tf.reshape(embed([r]), [-1]).numpy() for r in X_train])

  if classification == 'lr':
    # Grid search over the regularisation strength.
    param_grid_LR = {'C': [0.01, 0.05, 0.25, 0.5, 0.1]}
    clf_LR = GridSearchCV(LogisticRegression(max_iter=500, class_weight='balanced'),
                          param_grid=param_grid_LR)
    clf_LR.fit(X_train_vectors, y_train)

    # GridSearchCV refits the best configuration on the whole training set by
    # default (refit=True), so training a second identical model is unnecessary.
    model = clf_LR.best_estimator_

  else:
    # Two hidden layers with dropout and a 3-way softmax output.
    # NOTE(review): the 3 output units assume labels are the integers 0, 1, 2
    # — confirm against the dataset.
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=64, input_shape=(X_train_vectors.shape[1],),
                                 activation='relu'))
    model.add(keras.layers.Dropout(rate=0.5))
    model.add(keras.layers.Dense(units=32, activation='relu'))
    model.add(keras.layers.Dropout(rate=0.5))
    model.add(keras.layers.Dense(3, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(0.001),
                  metrics=['accuracy'])

    # 10% of the training data is held out for validation.
    history = model.fit(
        X_train_vectors, y_train, epochs=7, batch_size=32,
        verbose=0, validation_split=0.1, shuffle=True)

    if plot_training:
      plt.plot(history.history['loss'], label='training data')
      plt.plot(history.history['val_loss'], label='validation data')
      plt.legend(loc="upper right")
      plt.show()

      plt.plot(history.history['accuracy'], label='training data')
      plt.plot(history.history['val_accuracy'], label='validation data')
      plt.legend(loc="upper left")
      plt.show()

  return model

In [8]:
def evaluate_model(df_test, model, embedding='tfidf', classification_method='lr',
                   vectorizer=None):
  """Score a trained model on the test set and return its predictions.

  Prints accuracy and weighted precision/recall/F1, followed by the output
  frame. `df_test` itself is not modified.

  Args:
    df_test: Test DataFrame with a `sentence` (text) and a `label` column.
    model: Trained classifier matching `embedding`/`classification_method`.
    embedding: `'tfidf'`, `'glove'` or `'use'`. `'glove'` relies on the
      module-level `glove_w2v_model`, `'use'` on the module-level `embed`.
    classification_method: `'lr'` or `'nn'`; only consulted when
      `embedding == 'use'`.
    vectorizer: Fitted TF-IDF vectorizer; required when `embedding == 'tfidf'`.

  Returns:
    A copy of `df_test` with `label` renamed to `gold_label` and a new
    `predicted_label` column.

  Raises:
    ValueError: On an unknown `embedding`, a missing `vectorizer` for TF-IDF,
      or an unknown `classification_method` for the USE embedding.
  """
  word_emb = {'tfidf':'TF-IDF', 'glove':'GLOVE WORD2VEC', 'use':'UNIVERSAL SENTENCE ENCODER'}

  # Validate arguments up front so mistakes surface as clear errors rather
  # than as an unbound `pred_classes` further down.
  if embedding not in word_emb:
    raise ValueError(
        f"embedding must be one of {sorted(word_emb)}, got {embedding!r}")
  if embedding == 'tfidf' and vectorizer is None:
    raise ValueError("vectorizer is required when embedding='tfidf'")
  if embedding == 'use' and classification_method not in ('lr', 'nn'):
    raise ValueError(
        f"classification_method must be 'lr' or 'nn', got {classification_method!r}")

  y_test = df_test['label']
  if embedding == 'use':
    # The sentence encoder is fed the raw sentences.
    X_test = df_test['sentence']
  else:
    X_test = TextProcessing().fit_transform(df_test['sentence'])

  if embedding == 'tfidf':
    pred_classes = model.predict(vectorizer.transform(X_test))

  elif embedding == 'glove':
    tokenized_testlist = [simple_preprocess(sentence)
                          for sentence in X_test.values.tolist()]
    X_test_w2v = pd.DataFrame(
        [compute_avg_vector(glove_w2v_model, tokens) for tokens in tokenized_testlist],
        index=X_test.index)
    pred_classes = model.predict(X_test_w2v)

  else:
    # Embed one sentence at a time and flatten each result to a 1-D vector.
    X_test_vectors = np.array(
        [tf.reshape(embed([r]), [-1]).numpy() for r in X_test])

    if classification_method == 'lr':
      pred_classes = model.predict(X_test_vectors)
    else:
      # The network outputs class probabilities; take the most likely class.
      pred_classes = np.argmax(model.predict(X_test_vectors), axis=-1)

  print(f'Model: {word_emb[embedding]}')
  accuracy = accuracy_score(y_test, pred_classes)
  precision = precision_score(y_test, pred_classes, average='weighted')
  recall = recall_score(y_test, pred_classes, average='weighted')
  f1 = f1_score(y_test, pred_classes, average='weighted')
  print(f"Accuracy: {accuracy}")
  print('Precision: %f' %precision)
  print('Recall: %f' %recall)
  print('F1 score: %f' %f1)

  # rename() returns a new frame, so the caller's df_test keeps its `label`
  # column and can be evaluated again without reloading the data.
  output_df = df_test.rename(columns={'label': 'gold_label'})
  output_df['predicted_label'] = pred_classes
  print('\nOUTPUT:')
  print(output_df)
  return output_df

In [9]:
# Prediction CSVs written after evaluation, one per embedding/classifier pair:
# TF-IDF, GloVe, USE + logistic regression, USE + neural network.
(file_path1, file_path2, file_path3, file_path4) = (
    '/content/drive/My Drive/nlp_dataset/testing_output_tfidf.csv',
    '/content/drive/My Drive/nlp_dataset/testing_output_gloveW2V.csv',
    '/content/drive/My Drive/nlp_dataset/testing_output_USE_lr.csv',
    '/content/drive/My Drive/nlp_dataset/testing_output_USE_nn.csv',
)

# Pickle files for the three logistic-regression models.
(tfidf_model_file, glove_model_file, use_lr_model_file) = (
    '/content/drive/My Drive/nlp_dataset/tfidf_model.sav',
    '/content/drive/My Drive/nlp_dataset/glove_model.sav',
    '/content/drive/My Drive/nlp_dataset/use_lr_model.sav',
)

# The Keras model is saved to a directory rather than a single file.
# NOTE(review): this is the dataset folder itself, so the saved-model files
# land next to the data — consider a dedicated sub-directory.
use_nn_model_file = '/content/drive/My Drive/nlp_dataset/'

In [10]:
df, df_test = load_data()
tfidf_vectorizer, tfidf_model = tfidf_word_embedding(df)

# Persist the classifier; the context manager closes the file handle.
# Only the model is saved — the fitted vectorizer is not, so
# tfidf_word_embedding must still be run to obtain it before evaluating.
with open(tfidf_model_file, 'wb') as model_fh:
  pickle.dump(tfidf_model, model_fh)

# To reuse a previously saved model instead of retraining:
#   with open(tfidf_model_file, 'rb') as model_fh:
#     tfidf_model = pickle.load(model_fh)
tfidf_output = evaluate_model(df_test, tfidf_model, embedding='tfidf', classification_method='lr', vectorizer=tfidf_vectorizer)
tfidf_output.to_csv(file_path1, index=False)

Model: TF-IDF
Accuracy: 0.5563689604685212
Precision: 0.570978
Recall: 0.556369
F1 score: 0.563128

OUTPUT:
                                              sentence  gold_label  predicted_label
0    even if the whole thing proves to be a creativ...           2                0
1    , but isn't quite sure how to handle " sam dee...           1                1
2    ruby's close friend gretchen ( cuz ya can't ha...           2                2
3    happy accidents is a romantic comedy filtered ...           2                2
4    the film stars thandie newton , who was robbed...           2                1
..                                                 ...         ...              ...
678  somehow , with a considerable suspension of di...           1                2
679  occasionally , the violence is slightly uncomf...           0                2
680  what is perhaps most sensational about gods an...           2                2
681  he earned that nomination with his touching pe.

In [11]:
# GloVe: fetch the 'glove-wiki-gigaword-300' vectors through the gensim
# downloader. The result is read as a module-level global by
# glove_w2v_embedding and evaluate_model.
glove_w2v_model = api.load(name='glove-wiki-gigaword-300')



In [12]:
df, df_test = load_data()

glove_model = glove_w2v_embedding(df)

# Persist the classifier; the context manager closes the file handle.
with open(glove_model_file, 'wb') as model_fh:
  pickle.dump(glove_model, model_fh)

# To reuse a previously saved model instead of retraining:
#   with open(glove_model_file, 'rb') as model_fh:
#     glove_model = pickle.load(model_fh)
glove_output = evaluate_model(df_test, glove_model, 'glove', 'lr')
glove_output.to_csv(file_path2, index=False)

Model: GLOVE WORD2VEC
Accuracy: 0.5592972181551976
Precision: 0.634280
Recall: 0.559297
F1 score: 0.585576

OUTPUT:
                                              sentence  gold_label  predicted_label
0    even if the whole thing proves to be a creativ...           2                2
1    , but isn't quite sure how to handle " sam dee...           1                1
2    ruby's close friend gretchen ( cuz ya can't ha...           2                2
3    happy accidents is a romantic comedy filtered ...           2                2
4    the film stars thandie newton , who was robbed...           2                2
..                                                 ...         ...              ...
678  somehow , with a considerable suspension of di...           1                2
679  occasionally , the violence is slightly uncomf...           0                0
680  what is perhaps most sensational about gods an...           2                2
681  he earned that nomination with his touc

In [13]:
# Universal Sentence Encoder (large, v5) loaded from TensorFlow Hub.
# `embed` is read as a module-level global by universal_sentence_encoder
# and evaluate_model.
embed = hub.load(handle="https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [14]:
df, df_test = load_data()
use_lr_model = universal_sentence_encoder(df, classification='lr')

# Persist the classifier; the context manager closes the file handle.
with open(use_lr_model_file, 'wb') as model_fh:
  pickle.dump(use_lr_model, model_fh)

# To reuse a previously saved model instead of retraining:
#   with open(use_lr_model_file, 'rb') as model_fh:
#     use_lr_model = pickle.load(model_fh)
use_lr_output = evaluate_model(df_test, use_lr_model, 'use', 'lr')
use_lr_output.to_csv(file_path3,index=False)

Model: UNIVERSAL SENTENCE ENCODER
Accuracy: 0.6076134699853587
Precision: 0.660863
Recall: 0.607613
F1 score: 0.625554

OUTPUT:
                                              sentence  gold_label  predicted_label
0    even if the whole thing proves to be a creativ...           2                2
1    , but isn't quite sure how to handle " sam dee...           1                0
2    ruby's close friend gretchen ( cuz ya can't ha...           2                1
3    happy accidents is a romantic comedy filtered ...           2                2
4    the film stars thandie newton , who was robbed...           2                2
..                                                 ...         ...              ...
678  somehow , with a considerable suspension of di...           1                0
679  occasionally , the violence is slightly uncomf...           0                0
680  what is perhaps most sensational about gods an...           2                2
681  he earned that nomination w

In [15]:
# Train the feed-forward network on Universal Sentence Encoder embeddings.
df, df_test = load_data()
use_nn_model = universal_sentence_encoder(
    df, classification='nn', plot_training=False)

# Save the trained Keras model under use_nn_model_file.
save_model(use_nn_model, use_nn_model_file)

# To reuse a previously saved model instead of retraining:
#   use_nn_model = load_model(use_nn_model_file)
use_nn_output = evaluate_model(
    df_test, use_nn_model, embedding='use', classification_method='nn')
use_nn_output.to_csv(file_path4, index=False)

INFO:tensorflow:Assets written to: /content/drive/My Drive/nlp_dataset/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/nlp_dataset/assets


Model: UNIVERSAL SENTENCE ENCODER
Accuracy: 0.6515373352855052
Precision: 0.628719
Recall: 0.651537
F1 score: 0.630304

OUTPUT:
                                              sentence  gold_label  predicted_label
0    even if the whole thing proves to be a creativ...           2                2
1    , but isn't quite sure how to handle " sam dee...           1                1
2    ruby's close friend gretchen ( cuz ya can't ha...           2                1
3    happy accidents is a romantic comedy filtered ...           2                2
4    the film stars thandie newton , who was robbed...           2                2
..                                                 ...         ...              ...
678  somehow , with a considerable suspension of di...           1                0
679  occasionally , the violence is slightly uncomf...           0                1
680  what is perhaps most sensational about gods an...           2                2
681  he earned that nomination w