In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import random
import tensorflow as tf
from pickle import dump
from pickle import load
%matplotlib inline

In [2]:
# Setting seed
global_seed = 42
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)


In [3]:
train = pd.read_csv("train_preprocessed", index_col = 0)
test = pd.read_csv("test_preprocessed", index_col = 0)

In [4]:
train.dropna(subset = ["preprocessed tweets"], inplace = True)
test.dropna(subset = ["preprocessed tweets"], inplace = True)

# TF-IDF vectorization of tweets

In [5]:
train_tweets = train["preprocessed tweets"]
test_tweets = test['preprocessed tweets']

In [6]:
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
train_tfidf = tfidfvectorizer.fit_transform(train_tweets)
test_tfidf = tfidfvectorizer.transform(test_tweets)

In [7]:
tfidf_tokens = tfidfvectorizer.get_feature_names()
idf_scores = tfidfvectorizer.idf_

In [8]:
len(tfidf_tokens)

54649

In [9]:
len(idf_scores)

54649

Too many features. Let us try to give our own vocab

In [10]:
token_tfidf = dict(zip(tfidf_tokens, idf_scores))

In [11]:
token_tfidf_sorted = dict(sorted(token_tfidf.items(), key = lambda item :item[1]))

In [12]:
# After experimenting, came up with value 10 for idf. all the tokens who ahve less than 10 idf have been considered
vocab = [k for k,v in token_tfidf.items() if v<10]

In [13]:
len(vocab)

10773

This reduces vocab size from 54649 to 10773. Let us pass this vacab and train tfidf vectorizer again

In [14]:
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english', vocabulary = vocab)
train_tfidf = tfidfvectorizer.fit_transform(train_tweets)
test_tfidf = tfidfvectorizer.transform(test_tweets)

In [15]:
tfidf_tokens = tfidfvectorizer.get_feature_names()
idf_scores = tfidfvectorizer.idf_

In [16]:
len(tfidf_tokens)

10773

In [17]:
train_tfidf_df = pd.DataFrame(data = train_tfidf.toarray(),columns = tfidf_tokens)
test_tfidf_df = pd.DataFrame(data = test_tfidf.toarray(),columns = tfidf_tokens)

In [18]:
train_tfidf_df.shape

(41147, 10773)

In [19]:
test_tfidf_df.shape

(3798, 10773)

# Training and Evaluation


## Encode target variable

In [20]:
train_labels = train["Sentiment"]
test_labels = test["Sentiment"]

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_labels)
y_train = le.transform(train_labels)
y_test = le.transform(test_labels)
class_label_mapping = dict(set(zip(y_train, train_labels.values)))
print(class_label_mapping)
dump(le, open('le.pkl', 'wb'))

In [34]:
le = load(open('le.pkl', 'rb'))
y_train = le.transform(train_labels)
y_test = le.transform(test_labels)
class_label_mapping = dict(set(zip(y_train, train_labels.values)))
class_label_mapping

{0: 'Extremely Negative',
 2: 'Negative',
 3: 'Neutral',
 1: 'Extremely Positive',
 4: 'Positive'}

# Grid search on MultiNB

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
param={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]}
clf=GridSearchCV(model,param,scoring="f1_weighted",cv=3,return_train_score=True, refit = False)  

In [53]:
clf.fit(train_tfidf_df, y_train)

GridSearchCV(cv=3, estimator=MultinomialNB(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.1, 1, 10, 100,
                                   1000]},
             refit=False, return_train_score=True, scoring='f1_weighted')

In [58]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,2.811205,0.752775,0.472056,0.01736,1e-05,{'alpha': 1e-05},0.400217,0.398891,0.391987,0.397032,0.003608,4,0.749449,0.747292,0.744094,0.746945,0.0022
1,2.39351,0.154933,0.505617,0.011847,0.0001,{'alpha': 0.0001},0.401845,0.399718,0.393197,0.398254,0.003679,3,0.749486,0.747212,0.744132,0.746943,0.002194
2,2.238762,0.089779,0.487143,0.022339,0.001,{'alpha': 0.001},0.403935,0.400957,0.39592,0.400271,0.003308,2,0.749066,0.74683,0.743529,0.746475,0.002274
3,2.145165,0.023822,0.44475,0.00529,0.1,{'alpha': 0.1},0.4249,0.425449,0.418157,0.422835,0.003316,1,0.730962,0.727843,0.725528,0.728111,0.002227
4,2.100548,0.017256,0.446707,0.016521,1.0,{'alpha': 1},0.38665,0.38458,0.383024,0.384751,0.001485,5,0.593572,0.589242,0.584297,0.589037,0.003789
5,2.123218,0.012325,0.411799,0.003411,10.0,{'alpha': 10},0.206175,0.206468,0.195287,0.202643,0.005203,6,0.249958,0.243709,0.24163,0.245099,0.003539
6,2.105633,0.022406,0.434213,0.010733,100.0,{'alpha': 100},0.131261,0.129323,0.126809,0.129131,0.001823,7,0.139896,0.130742,0.137241,0.13596,0.003846
7,2.132728,0.007227,0.451078,0.014092,1000.0,{'alpha': 1000},0.120603,0.120603,0.120676,0.120628,3.4e-05,8,0.12064,0.12064,0.120603,0.120628,1.7e-05


In [61]:
clf.best_params_

{'alpha': 0.1}

In [62]:
model = MultinomialNB(alpha = 0.1)
model.fit(train_tfidf_df, y_train)

MultinomialNB(alpha=0.1)

In [66]:
train_pred_label = model.predict(train_tfidf_df)

In [67]:
train_pred_prob = model.predict_proba(train_tfidf_df)

In [70]:
test_pred_label = model.predict(test_tfidf_df)
test_predict_prob = model.predict_proba(test_tfidf_df)

In [78]:
class_label_mapping

{4: 'Positive',
 2: 'Negative',
 1: 'Extremely Positive',
 0: 'Extremely Negative',
 3: 'Neutral'}

In [42]:
def get_classification_report(y_test, y_pred):
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    #df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report


In [85]:
print("Performance on training data:")
get_classification_report(y_train, train_pred_label)

Performance on training data:


Unnamed: 0,precision,recall,f1-score,support
0,0.839607,0.561576,0.673008,5481.0
1,0.811025,0.555254,0.659199,6624.0
2,0.625857,0.708783,0.664744,9917.0
3,0.801297,0.641828,0.712751,7703.0
4,0.585526,0.796883,0.675047,11422.0
accuracy,0.67638,0.67638,0.67638,0.67638
macro avg,0.732662,0.652865,0.67695,41147.0
weighted avg,0.705787,0.67638,0.676799,41147.0


In [93]:
print("Performance on testing data:")
get_classification_report(y_test, test_pred_label)

Performance on testing data:


Unnamed: 0,precision,recall,f1-score,support
0,0.593361,0.241554,0.343337,592.0
1,0.62963,0.255426,0.36342,599.0
2,0.413425,0.520653,0.460884,1041.0
3,0.553398,0.368336,0.442289,619.0
4,0.367065,0.616684,0.460205,947.0
accuracy,0.434439,0.434439,0.434439,0.434439
macro avg,0.511376,0.400531,0.414027,3798.0
weighted avg,0.486824,0.434439,0.423991,3798.0


We are getting better precision than recall. We are getting more correct positive predictions out of total positive predictions compared to actual positives.

# Gris search on linear SVM

In [94]:
from sklearn.linear_model import SGDClassifier
model=SGDClassifier()
param={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000],
      'penalty' : ['l1', 'l2']}
clf=GridSearchCV(model,param,scoring="f1_weighted",cv=3,return_train_score=True, refit = False)  

In [95]:
clf.fit(train_tfidf_df, y_train)

GridSearchCV(cv=3, estimator=SGDClassifier(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2']},
             refit=False, return_train_score=True, scoring='f1_weighted')

In [96]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,330.157895,4.948763,0.288773,0.039736,1e-05,l1,"{'alpha': 1e-05, 'penalty': 'l1'}",0.532542,0.543168,0.547714,0.541141,0.006357,1,0.873767,0.869968,0.863824,0.869186,0.004097
1,66.167849,1.357274,0.252922,0.006545,1e-05,l2,"{'alpha': 1e-05, 'penalty': 'l2'}",0.503993,0.509509,0.517103,0.510201,0.005374,2,0.849071,0.843117,0.845226,0.845805,0.002465
2,52.651404,1.503762,0.249021,0.007806,0.0001,l1,"{'alpha': 0.0001, 'penalty': 'l1'}",0.442518,0.410244,0.469019,0.440593,0.024033,7,0.505748,0.443861,0.490732,0.480114,0.026357
3,26.226131,0.126655,0.268014,0.026515,0.0001,l2,"{'alpha': 0.0001, 'penalty': 'l2'}",0.486683,0.490249,0.523108,0.500013,0.016395,3,0.738269,0.733148,0.752305,0.741241,0.008099
4,31.28163,0.083025,0.286656,0.003494,0.001,l1,"{'alpha': 0.001, 'penalty': 'l1'}",0.138867,0.194991,0.164798,0.166218,0.022935,9,0.145654,0.192695,0.170808,0.169719,0.01922
5,14.882375,0.137327,0.26082,0.01473,0.001,l2,"{'alpha': 0.001, 'penalty': 'l2'}",0.434879,0.433854,0.48321,0.450648,0.023029,6,0.651866,0.625992,0.676898,0.651585,0.020783
6,22.474956,0.079038,0.248351,0.006046,0.1,l1,"{'alpha': 0.1, 'penalty': 'l1'}",0.120603,0.120603,0.059017,0.100075,0.029032,14,0.12064,0.12064,0.059052,0.10011,0.029033
7,11.058739,0.190379,0.257671,0.006656,0.1,l2,"{'alpha': 0.1, 'penalty': 'l2'}",0.463253,0.442841,0.49851,0.468201,0.022995,5,0.682521,0.639947,0.681193,0.667887,0.019764
8,23.621481,0.4539,0.253434,0.005902,1.0,l1,"{'alpha': 1, 'penalty': 'l1'}",0.031314,0.031314,0.044649,0.035759,0.006286,16,0.031317,0.031317,0.044642,0.035759,0.006282
9,11.364585,0.105877,0.264399,0.015683,1.0,l2,"{'alpha': 1, 'penalty': 'l2'}",0.463019,0.498457,0.496237,0.485904,0.016208,4,0.69689,0.713306,0.691258,0.700485,0.009353


In [99]:
#{'alpha': 1e-05, 'penalty': 'l1'}
clf.best_params_

{'alpha': 1e-05, 'penalty': 'l1'}

In [100]:
model = SGDClassifier(loss = 'hinge', penalty = 'l1', alpha = 1e-5)
model.fit(train_tfidf_df, y_train)

SGDClassifier(alpha=1e-05, penalty='l1')

In [102]:
train_pred_label = model.predict(train_tfidf_df)
test_pred_label = model.predict(test_tfidf_df)


In [103]:
print("Performance on training data:")
get_classification_report(y_train, train_pred_label)

Performance on training data:


Unnamed: 0,precision,recall,f1-score,support
0,0.798236,0.891443,0.842269,5481.0
1,0.846747,0.844958,0.845852,6624.0
2,0.831199,0.687204,0.752374,9917.0
3,0.837015,0.885369,0.860514,7703.0
4,0.767172,0.810629,0.788302,11422.0
accuracy,0.811165,0.811165,0.811165,0.811165
macro avg,0.816074,0.823921,0.817862,41147.0
weighted avg,0.812627,0.811165,0.809614,41147.0


In [104]:
print("Performance on testing data:")
get_classification_report(y_test, test_pred_label)

Performance on testing data:


Unnamed: 0,precision,recall,f1-score,support
0,0.609756,0.633446,0.621375,592.0
1,0.663732,0.629382,0.646101,599.0
2,0.523227,0.411143,0.460463,1041.0
3,0.629055,0.720517,0.671687,619.0
4,0.480699,0.55227,0.514005,947.0
accuracy,0.565824,0.565824,0.565824,0.565824
macro avg,0.581294,0.589352,0.582726,3798.0
weighted avg,0.565518,0.565824,0.562598,3798.0


# **Getting 300 dimensional GLoVe embedding vectors for words**

In [42]:
embeddings_index = dict()
f = open('glove.6B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [46]:
embeddings_index['the'].shape

(300,)

In [51]:
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english', vocabulary = vocab)
train_tfidf = tfidfvectorizer.fit_transform(train_tweets)
test_tfidf = tfidfvectorizer.transform(test_tweets)

tfidf_tokens = tfidfvectorizer.get_feature_names()
idf_scores = tfidfvectorizer.idf_
# creating a dic with keys as tokens and values and idf values
idf_dict = dict(zip(tfidfvectorizer.get_feature_names(), tfidfvectorizer.idf_))

In [53]:
len(idf_dict)

10773

In [54]:
def tfidf_weighted_GloVe(sentences, embeddings_dict, idf_dict):
    
    """
    Given a list of sentences, the function returns
    a 2d array where each row is tf-idf weighted GLoVe
    vector of each sentence.
    args:
    sentences: list of sentences
    ft: a fasttext model
    idf_dict: a dictionary with tokens as keys and idf values and values
    
    """
    
    from collections import Counter
    sentences_tfidf_GLoVe = list()
    vocab = list(idf_dict.keys())
    for sent in sentences:
        words = sent.split(' ')
        tf_dict = dict(Counter(words))
        tf_idf_sum = 0
        doc_vector = np.zeros(300)
        for word in words:
            if word in vocab:
                glove = embeddings_dict.get(word,np.zeros(300))
                idf = idf_dict.get(word,0)
                tf = tf_dict.get(word)
                tf_idf = tf*idf
                weighted_word_vec = glove*tf_idf
                tf_idf_sum = tf_idf_sum + tf_idf
                doc_vector = doc_vector + weighted_word_vec
    
        if tf_idf_sum != 0:
            doc_vector = doc_vector/tf_idf_sum
        
        sentences_tfidf_GLoVe.append(doc_vector)
    
    return np.vstack(sentences_tfidf_GLoVe)

    

In [59]:
train_tfidf_glove = tfidf_weighted_GloVe(list(train_tweets), embeddings_index, idf_dict)

In [62]:
test_tfidf_glove = tfidf_weighted_GloVe(list(test_tweets), embeddings_index, idf_dict)

In [63]:
np.save("train_tfidf_glove.npy", train_tfidf_glove)
np.save("test_tfidf_glove.npy", test_tfidf_glove)

In [76]:
train_tfidf_glove_df = pd.DataFrame(train_tfidf_glove)
test_tfidf_glove_df = pd.DataFrame(test_tfidf_glove)

In [74]:
y_train.shape

(41147,)

In [75]:
train_tfidf_glove.shape

(41147, 300)

## Hyperparameter tune MultiNB with tfidf-glove

In [78]:
from sklearn.linear_model import SGDClassifier
model=SGDClassifier()
param={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000],
      'penalty' : ['l1', 'l2']}
clf=GridSearchCV(model,param,scoring="f1_weighted",cv=3,return_train_score=True, refit = False)  

clf.fit(train_tfidf_glove_df, y_train)

GridSearchCV(cv=3, estimator=SGDClassifier(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2']},
             refit=False, return_train_score=True, scoring='f1_weighted')

In [79]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,13.82565,1.846324,0.042999,0.042971,1e-05,l1,"{'alpha': 1e-05, 'penalty': 'l1'}",0.37639,0.36172,0.370433,0.369514,0.006024,4,0.406212,0.391137,0.38413,0.393827,0.009213
1,6.895574,0.222793,0.011673,0.000479,1e-05,l2,"{'alpha': 1e-05, 'penalty': 'l2'}",0.403707,0.395707,0.341041,0.380152,0.027847,3,0.435282,0.428028,0.371362,0.411557,0.028576
2,6.223527,0.044623,0.012788,0.000559,0.0001,l1,"{'alpha': 0.0001, 'penalty': 'l1'}",0.397895,0.355508,0.398216,0.383873,0.020058,2,0.403314,0.377691,0.422863,0.401289,0.018497
3,2.481956,0.087427,0.011996,2e-06,0.0001,l2,"{'alpha': 0.0001, 'penalty': 'l2'}",0.42917,0.369878,0.370144,0.389731,0.027888,1,0.45039,0.400126,0.398719,0.416412,0.024033
4,2.389974,0.046561,0.012346,0.000464,0.001,l1,"{'alpha': 0.001, 'penalty': 'l1'}",0.339824,0.254035,0.256388,0.283416,0.039898,8,0.351778,0.267959,0.240128,0.286622,0.047453
5,0.968174,0.010895,0.01236,0.000607,0.001,l2,"{'alpha': 0.001, 'penalty': 'l2'}",0.327041,0.3321,0.375663,0.344935,0.021826,7,0.345787,0.3498,0.397295,0.364294,0.023393
6,1.465646,0.001642,0.011539,1.2e-05,0.1,l1,"{'alpha': 0.1, 'penalty': 'l1'}",0.044642,0.093626,0.044649,0.060972,0.02309,14,0.044646,0.093607,0.044642,0.060965,0.023081
7,0.524729,0.001691,0.012338,0.000465,0.1,l2,"{'alpha': 0.1, 'penalty': 'l2'}",0.388702,0.333778,0.362715,0.361732,0.022433,6,0.405259,0.360959,0.386166,0.384128,0.018143
8,1.408193,0.003459,0.01152,0.000411,1.0,l1,"{'alpha': 1, 'penalty': 'l1'}",0.031314,0.044642,0.031319,0.035759,0.006282,16,0.031317,0.044646,0.031314,0.035759,0.006284
9,0.486449,0.02196,0.012342,0.000467,1.0,l2,"{'alpha': 1, 'penalty': 'l2'}",0.355557,0.366128,0.376892,0.366192,0.00871,5,0.365854,0.385831,0.403215,0.384967,0.015265


# Deep learning on tfidf vectors

In [35]:
inp_shape = (train_tfidf_df.shape[1],)

In [36]:
import os
import pickle
import tensorflow as tf
import random
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, concatenate, MaxPool1D, Dropout, GlobalMaxPool1D, BatchNormalization, Input, Flatten, Conv1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
#from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l2, l1, l1_l2
#from tensorflow.keras.metrics import F1Score
from tensorflow_addons.metrics import F1Score

In [46]:
#one hot encoding target
y_train_enc = to_categorical(y_train)
y_test_enc = to_categorical(y_test)
metric = F1Score(num_classes = 5, average = "weighted")

input_ = Input(shape=inp_shape) 
dense1 = Dense(units = 50, activation = 'relu', kernel_regularizer = l1(0.001))(input_)
#drop1 = Dropout(0.5)(dense1)
dense2 = Dense(units = 25, activation = 'relu', kernel_regularizer = l1(0.001))(dense1)
#drop2 = Dropout(0.5)(dense2)
dense3 = Dense(units = 15, activation = 'relu', kernel_regularizer = l1(0.001))(dense2)
norm1 = BatchNormalization()(dense3)
dense4 = Dense(units = 10, activation = 'relu', kernel_regularizer = l1(0.001))(norm1)
output = Dense(units = 5, activation = 'softmax')(dense4)
model0 = Model(inputs = input_, outputs = output)

In [47]:
model0.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = metric)

In [48]:
model0.fit(train_tfidf_df, y_train_enc, epochs = 10, batch_size = 128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d019ccd6d8>

In [49]:
pred = model0.predict(test_tfidf_df)
label_pred = np.array(tf.argmax(pred, axis = 1))
print("Performance on testing data:")
get_classification_report(y_test, label_pred)

Performance on testing data:


Unnamed: 0,precision,recall,f1-score,support
0,0.711504,0.679054,0.694901,592.0
1,0.886628,0.509182,0.646872,599.0
2,0.567424,0.7195,0.634477,1041.0
3,0.704202,0.676898,0.69028,619.0
4,0.616016,0.63358,0.624675,947.0
accuracy,0.651659,0.651659,0.651659,0.651659
macro avg,0.697155,0.643643,0.658241,3798.0
weighted avg,0.674633,0.651659,0.652501,3798.0


In [50]:
train_pred = model0.predict(train_tfidf_df)
train_label_pred = np.array(tf.argmax(train_pred, axis = 1))
print("Performance on training data:")
get_classification_report(y_train, train_label_pred)

Performance on training data:


Unnamed: 0,precision,recall,f1-score,support
0,0.820725,0.851122,0.835647,5481.0
1,0.912733,0.656854,0.763936,6624.0
2,0.686184,0.837854,0.754472,9917.0
3,0.874022,0.812411,0.842091,7703.0
4,0.750853,0.751182,0.751018,11422.0
accuracy,0.781661,0.781661,0.781661,0.781661
macro avg,0.808904,0.781885,0.789433,41147.0
weighted avg,0.793692,0.781661,0.782253,41147.0


In [51]:
from prettytable import PrettyTable
myTable = PrettyTable(["Feature set", "Model", "train weighted f1-score", "test weighted f1-score"])
myTable.add_row(["tf-idf", "MultiNB", "0.67", "0.42"])
myTable.add_row(["tf-idf", "SVC", "0.80", "0.56"])
myTable.add_row(["tf-idf", "MLP", "0.78", "0.65"])
print('Summary of model performances on tf-idf vecotrs:')
print(myTable)

Summary of model performances on tf-idf vecotrs:
+-------------+---------+-------------------------+------------------------+
| Feature set |  Model  | train weighted f1-score | test weighted f1-score |
+-------------+---------+-------------------------+------------------------+
|    tf-idf   | MultiNB |           0.67          |          0.42          |
|    tf-idf   |   SVC   |           0.80          |          0.56          |
|    tf-idf   |   MLP   |           0.78          |          0.65          |
+-------------+---------+-------------------------+------------------------+
