In [1]:
import sys
import os

PACKAGE_PARENT = '..'
# A notebook kernel has no __file__, so the project location is derived from
# the kernel's working directory (symlinks resolved).
SCRIPT_DIR = os.path.realpath(os.getcwd())
_PACKAGE_ROOT = os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))
# Guarded so that re-running this cell does not pile up duplicate entries.
if _PACKAGE_ROOT not in sys.path:
    sys.path.append(_PACKAGE_ROOT)

In [10]:
from models import MODELS_PATH
from data import DATA_ROOT

In [12]:
from gensim.models import KeyedVectors


def _load_word_embedding_model(file='../models/fasttext/crawl-300d-2M-subword.vec', word_embedding_type="fasttext"):
    """Load a pre-trained word-embedding table from disk.

    Parameters
    ----------
    file : str or None
        Path of the embedding file. When ``None``, the path is taken from the
        ``"GLOVE_6B_300D"`` entry of the ``embeddings`` registry.
    word_embedding_type : str
        One of ``"glove"`` (space-separated text, one word per line),
        ``"word2vec"`` (binary word2vec format) or ``"fasttext"``
        (text word2vec format).

    Returns
    -------
    dict or KeyedVectors
        For ``"glove"`` a ``{word: numpy array}`` dict; otherwise the object
        returned by ``KeyedVectors.load_word2vec_format``.

    Raises
    ------
    ValueError
        If ``word_embedding_type`` is not one of the supported values.
    """
    supported = ("glove", "word2vec", "fasttext")
    if word_embedding_type not in supported:
        # Previously an unknown type fell through and returned an empty dict,
        # which made every later lookup look like an out-of-vocabulary word.
        raise ValueError(
            "Unsupported word_embedding_type %r; expected one of %s"
            % (word_embedding_type, ", ".join(supported))
        )
    if file is None:
        # NOTE(review): `embeddings` is not defined in any cell visible here;
        # confirm it is provided before relying on file=None.
        file, *_ = embeddings.get("GLOVE_6B_300D")
    print("Loading Model")
    if word_embedding_type == "glove":
        # Imported locally: the notebook only imports pandas in a later cell.
        import pandas as pd

        # keep_default_na=False keeps vocabulary tokens such as "null" or "NA"
        # as strings instead of letting pandas turn them into NaN keys.
        table = pd.read_csv(
            file, sep=" ", quoting=3, header=None, index_col=0, keep_default_na=False
        )
        model = {key: val.values for key, val in table.T.items()}
        print(len(model), " words loaded!")
        return model
    if word_embedding_type == "word2vec":
        return KeyedVectors.load_word2vec_format(file, binary=True)
    # Only "fasttext" remains: .vec files use the text word2vec layout.
    return KeyedVectors.load_word2vec_format(file, binary=False)
# Load the embeddings once with the function's defaults (the fastText .vec
# file) and keep them in MODEL, which the lookup helper in a later cell reads.
MODEL = _load_word_embedding_model()

Loading Model


In [144]:
# Path of the saved SingleLayeredNN state dict under MODELS_PATH; it is passed
# to torch.load in a later cell.
fasttext_nn_model = f"{MODELS_PATH}/4m-oYYqjRvqVvyu2ryjmSw_trained_models/give_FASTTEXT_CRAWL_SUB_SingleLayeredNN.pth"

In [145]:
# Path of the saved LogisticRegression state dict under MODELS_PATH; it is
# passed to torch.load in a later cell.
fasttext_lr_model = f"{MODELS_PATH}/4m-oYYqjRvqVvyu2ryjmSw_trained_models/give_FASTTEXT_CRAWL_SUB_LogisticRegression.pth"

In [114]:
from src.models import LogisticRegression, SingleLayeredNN

In [115]:
import torch

In [146]:
# Rebuild the network with the layer sizes shown in this cell's output
# (300 -> 300 -> 1) so the saved weights fit.
nn_model = SingleLayeredNN(300, 300, 1)
# map_location="cpu" lets a checkpoint that was saved from a GPU deserialize on
# a CPU-only machine; the model is moved to the selected device in a later cell.
nn_model.load_state_dict(torch.load(fasttext_nn_model, map_location="cpu"))
# Switch to evaluation mode; as the last expression this also displays the model.
nn_model.eval()

SingleLayeredNN(
  (fc1): Linear(in_features=300, out_features=300, bias=True)
  (sigmoid1): Sigmoid()
  (fc2): Linear(in_features=300, out_features=1, bias=True)
  (sigmoid2): Sigmoid()
)

In [147]:
# Rebuild the classifier with the sizes shown in this cell's output (300 -> 1)
# so the saved weights fit.
lr_model = LogisticRegression(300, 1)
# map_location="cpu" lets a checkpoint that was saved from a GPU deserialize on
# a CPU-only machine; the model is moved to the selected device in a later cell.
lr_model.load_state_dict(torch.load(fasttext_lr_model, map_location="cpu"))
# Switch to evaluation mode; as the last expression this also displays the model.
lr_model.eval()

LogisticRegression(
  (linear): Linear(in_features=300, out_features=1, bias=True)
)

In [148]:
import pandas as pd


# Read the evaluation CSV from DATA_ROOT; later cells use its "actual_words"
# and "give" columns.
df = pd.read_csv(
    filepath_or_buffer=f'{DATA_ROOT}/4m-oYYqjRvqVvyu2ryjmSw_common_words_100_final_data.csv',
)

In [149]:
def _get_word_embeddings(word):
    """Return the embedding stored in MODEL for ``word``, or None if unavailable.

    Parameters
    ----------
    word : object
        Key to look up in MODEL (normally a string).

    Returns
    -------
    The value ``MODEL[word]``, or ``None`` when the lookup raises ``KeyError``
    (word not present) or ``TypeError`` (key of an unusable type, e.g. a
    missing value read from a CSV).
    """
    try:
        return MODEL[word]
    except (KeyError, TypeError):
        # Only "no vector for this key" is treated as a miss. Any other error
        # (for example MODEL not being loaded) propagates instead of quietly
        # turning every row into None.
        return None

In [150]:
# Look up an embedding for every entry of "actual_words"; entries for which the
# lookup helper returns None are left as None in the new "Embedding" column.
df['Embedding'] = df["actual_words"].apply(_get_word_embeddings)

In [151]:
# Drop, in place, every row that has a missing value in ANY column; this
# includes rows whose "Embedding" lookup returned None.
# NOTE(review): if only missing embeddings should be removed, a column subset
# would be needed — confirm the intent.
df.dropna(inplace=True)

In [152]:
# Single-column frames (not Series): the embedding column as model input and
# the 'give' column as target. Columns are picked with a boolean mask.
x_data = df.loc[:, [column == 'Embedding' for column in df.columns]]
y_data = df.loc[:, [column == 'give' for column in df.columns]]

In [153]:
from torch.autograd import Variable


def get_device():
    """Return the torch device to use: 'cuda:0' if CUDA is available, else 'cpu'."""
    backend = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

def df_to_tensor(s_df):
    """Turn the values of a DataFrame into a torch tensor on the device from get_device()."""
    target = get_device()
    wrapped = Variable(torch.Tensor(s_df.values))
    return wrapped.to(target)

def complex_df_to_tensor(_df):
    """Convert a DataFrame that may hold an array-valued 'Embedding' column into a 2-D tensor.

    For every row, the values of all columns other than 'Embedding' are
    collected in column order, and the row's 'Embedding' vector (if the column
    exists) is appended after them. The per-row vectors are stacked along
    dimension 0 and moved to the device returned by get_device().

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with numeric columns and, optionally, an 'Embedding' column whose
        cells are array-like.

    Returns
    -------
    torch.Tensor
        Tensor with one row per DataFrame row.

    Raises
    ------
    ValueError
        If ``_df`` has no rows (there is nothing to stack).
    """
    if len(_df) == 0:
        raise ValueError("complex_df_to_tensor needs at least one row to build a tensor")

    stacked_rows = []
    for _, row in _df.iterrows():
        plain_values = []
        # Start from an empty tensor (not a list) so torch.cat also works for
        # frames that have no 'Embedding' column.
        embedding = torch.Tensor([])
        for key, value in row.to_dict().items():
            if key == 'Embedding':
                embedding = torch.Tensor(value)
            else:
                plain_values.append(value)
        stacked_rows.append(torch.cat([torch.Tensor(plain_values), embedding], dim=0))

    # Build on the CPU and transfer once instead of once per row.
    return torch.stack(stacked_rows, 0).to(get_device())

In [154]:
# Convert the label frame and the embedding frame into tensors.
# NOTE(review): both names are rebound from DataFrames to tensors, so this cell
# is not re-runnable on its own — re-run the cell that slices `df` first.
y_data = df_to_tensor(y_data)
x_data = complex_df_to_tensor(x_data)

In [155]:
# Pick the first CUDA device when one is available, otherwise the CPU, and
# move both classifiers there.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
lr_model.to(device)
# Kept as the last expression so the cell displays the network.
nn_model.to(device)

SingleLayeredNN(
  (fc1): Linear(in_features=300, out_features=300, bias=True)
  (sigmoid1): Sigmoid()
  (fc2): Linear(in_features=300, out_features=1, bias=True)
  (sigmoid2): Sigmoid()
)

In [156]:
from src.utils import evaluate


# Score the logistic-regression model on x_data / y_data.
# NOTE(review): the next cell repeats this exact call before printing, so this
# cell looks redundant — confirm before removing.
scores = evaluate(lr_model, x_data, y_data)

In [157]:
# Evaluate the logistic-regression model and print each metric followed by the
# model repr. The "Accuray: " label is kept verbatim (sic) so the printed text
# matches the output recorded below this cell.
scores = evaluate(lr_model, x_data, y_data)
for _label, _metric in (("Accuray: ", "_accuracy"), ("AUC: ", "_auc")):
    print(_label, str(scores[_metric]), lr_model)

Accuray:  0.9798568778160615 LogisticRegression(
  (linear): Linear(in_features=300, out_features=1, bias=True)
)
AUC:  0.9741754235689413 LogisticRegression(
  (linear): Linear(in_features=300, out_features=1, bias=True)
)


In [158]:
# Evaluate the single-layer network and print each metric followed by the
# model repr. The "Accuray: " label is kept verbatim (sic) so the printed text
# matches the output recorded below this cell.
scores = evaluate(nn_model, x_data, y_data)
for _label, _metric in (("Accuray: ", "_accuracy"), ("AUC: ", "_auc")):
    print(_label, str(scores[_metric]), nn_model)

Accuray:  0.9954943016167506 SingleLayeredNN(
  (fc1): Linear(in_features=300, out_features=300, bias=True)
  (sigmoid1): Sigmoid()
  (fc2): Linear(in_features=300, out_features=1, bias=True)
  (sigmoid2): Sigmoid()
)
AUC:  0.9993244311200168 SingleLayeredNN(
  (fc1): Linear(in_features=300, out_features=300, bias=True)
  (sigmoid1): Sigmoid()
  (fc2): Linear(in_features=300, out_features=1, bias=True)
  (sigmoid2): Sigmoid()
)
