In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Reading data

In [2]:
data = pd.read_csv("fb_sentiment.csv")
data.drop(data.columns[0], axis=1, inplace=True)
data.head()

Unnamed: 0,FBPost,Label
0,Drug Runners and a U.S. Senator have somethin...,O
1,"Heres a single, to add, to Kindle. Just read t...",O
2,If you tire of Non-Fiction.. Check out http://...,O
3,Ghost of Round Island is supposedly nonfiction.,O
4,Why is Barnes and Nobles version of the Kindle...,N


# Reading precomputed word2vec

(I am using https://nlp.stanford.edu/data/glove.6B.zip word2vec)

In [3]:
dim_to_word2vec_dict = {}

dims_lst = [50, 100, 200, 300]
for d in dims_lst:
    word2vec = {}
    with open(f"glove.6B/glove.6B.{d}d.txt") as file:
        for line in tqdm(file):
            lst = line.split()
            word = lst[0].lower()
            vec = np.array(lst[1:], dtype=np.float64)
            word2vec[word] = vec
    dim_to_word2vec_dict[d] = word2vec

400000it [00:04, 98629.50it/s] 
400000it [00:06, 59025.18it/s]
400000it [00:13, 30356.33it/s]
400000it [00:19, 20254.96it/s]


# Converting messages into vectors

In [4]:
y = np.array(list(data["Label"].apply(dict(zip(np.unique(data["Label"]), [[1, 0, 0], [0, 1, 0], [0, 0, 1]])).get)))
dim_to_X_dict = {}
for d in dims_lst:
    def text_to_vec(text):
        word2vec = dim_to_word2vec_dict[d]
        keys = word2vec.keys()
        vec_lst = [word2vec[word.lower()] if word in keys else np.array([np.nan] * d) for word in text.split()]
        return np.array(np.zeros(d)) if np.all(np.isnan(vec_lst)) else np.nanmean(vec_lst, axis=0)
    
    dim_to_X_dict[d] = np.array(list(data["FBPost"].apply(text_to_vec)))

# Making the train, validation, test split (0.6, 0.2, 0.2)

In [5]:
from sklearn.model_selection import train_test_split

ind_train, ind_test = train_test_split(np.arange(data.shape[0]), test_size=0.2)
ind_train, ind_valid = train_test_split(ind_train, test_size=0.25)

# Validating hyperparameters

Trying to variate layer sizes, activation functions and batch size

In [6]:
from itertools import product

fst_layer_size_lst = [64, 128, 256, 512]
snd_layer_size_lst = [16, 32, 64]
hidden_activation_function_lst = output_activation_function_lst = ["relu", "sigmoid", "softmax"]
batch_size_lst = [16, 32, 64]

parameters_lst = list(product(dims_lst,
                              fst_layer_size_lst,
                              snd_layer_size_lst,
                              hidden_activation_function_lst,
                              output_activation_function_lst,
                              batch_size_lst))

I am using AUC metric as a standard most popular (I believe) metric for classification problems

In [7]:
from tensorflow.keras import models, layers

results = {"word2vec_dim": [],
           "fst_layer_size": [],
           "snd_layer_size": [],
           "hidden_activation_function": [],
           "output_activation_functions": [],
           "batch_size": [],
           "AUC": [],
          }

for params in tqdm(parameters_lst):
    dim, fst_layer_size, snd_layer_size, hidden_activation_function, output_activation_function, batch_size = params
    X = dim_to_X_dict[dim]
    model = models.Sequential()
    model.add(layers.Dense(fst_layer_size, activation=hidden_activation_function, input_shape=(dim,)))
    model.add(layers.Dense(snd_layer_size, activation=hidden_activation_function))
    model.add(layers.Dense(3, activation=output_activation_function))

    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["AUC"])
    model.fit(X[ind_train], y[ind_train], 
              epochs=20, 
              batch_size=batch_size,
              validation_data=(X[ind_valid], y[ind_valid]),
              verbose=0)
    
    for key, param in zip(results.keys(), params):
        results[key].append(param)
    
    _, valid_AUC = model.evaluate(X[ind_valid], y[ind_valid], verbose=0)
    results["AUC"].append(valid_AUC)

100%|██████████| 1296/1296 [48:46<00:00,  2.26s/it]


In [8]:
results_df = pd.DataFrame(results)

In [9]:
results_df.sort_values("AUC", ascending=False).head()

Unnamed: 0,word2vec_dim,fst_layer_size,snd_layer_size,hidden_activation_function,output_activation_functions,batch_size,AUC
519,100,256,32,relu,softmax,16,0.864237
465,100,128,64,relu,softmax,16,0.863706
547,100,256,64,relu,softmax,32,0.862844
439,100,128,32,relu,softmax,32,0.862162
548,100,256,64,relu,softmax,64,0.860906


# Fitting the model with the best parameters on train+validation and evaluating on the test

In [10]:
best_params = results_df.sort_values("AUC", ascending=False, ignore_index=True).iloc[0][:-1]
best_params

word2vec_dim                       100
fst_layer_size                     256
snd_layer_size                      32
hidden_activation_function        relu
output_activation_functions    softmax
batch_size                          16
Name: 0, dtype: object

In [11]:
dim, fst_layer_size, snd_layer_size, hidden_activation_function, output_activation_function, batch_size = best_params
X = dim_to_X_dict[dim]
model = models.Sequential()
model.add(layers.Dense(fst_layer_size, activation=hidden_activation_function, input_shape=(dim,)))
model.add(layers.Dense(snd_layer_size, activation=hidden_activation_function))
model.add(layers.Dense(3, activation=output_activation_function))

model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["AUC"])
ind = np.concatenate((ind_train, ind_valid))
model.fit(X[ind], y[ind], 
          epochs=20, 
          batch_size=batch_size,
          verbose=0)

_, test_AUC = model.evaluate(X[ind_test], y[ind_test], verbose=0)

# Final result:

In [12]:
print("Test AUC:", test_AUC)

Test AUC: 0.8507624864578247
