In [1]:
import pickle
import collections as col
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
%matplotlib inline
target_cols = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 
    'insult', 'identity_hate'
]

import torch
import torch.nn
from torch.autograd import Variable
import torch.optim

In [2]:
import sys
sys.path += ["/home/zphang/nyu/ML/project/ml_project/"]

In [3]:
import imp
import src.deep
imp.reload(src.deep)

<module 'src.deep' from '/home/zphang/nyu/ML/project/ml_project/src/deep.py'>

# Data

In [4]:
with open("/home/zphang/data/ml_proj/data2/X_train_os.pkl", "rb") as f:
    X_train = pickle.load(f)
with open("/home/zphang/data/ml_proj/data2/y_train_os.pkl", "rb") as f:
    y_train = pickle.load(f)
with open("/home/zphang/data/ml_proj/data2/X_val.pkl", "rb") as f:
    X_val = pickle.load(f)
with open("/home/zphang/data/ml_proj/data2/y_val.pkl", "rb") as f:
    y_val = pickle.load(f)
with open("/home/zphang/data/ml_proj/data2/X_test.pkl", "rb") as f:
    X_test = pickle.load(f)
with open("/home/zphang/data/ml_proj/data2/y_test.pkl", "rb") as f:
    y_test = pickle.load(f)

In [5]:
y_train = y_train[y_val.columns].fillna(0)

# Code

In [6]:
train_corpus = src.deep.CorpusReader(
    x_df=X_train,
    y_df=y_train,
)
val_corpus = src.deep.CorpusReader(
    x_df=X_val,
    y_df=y_val,
)
test_corpus = src.deep.CorpusReader(
    x_df=X_test,
    y_df=y_test,
)

In [7]:
full_word_srs = src.deep.get_word_srs(X_train["comment_text"])

In [8]:
LOG_STEP = -1
BATCH_SIZE = 256
device = src.deep.get_device_func()

param_dict_ls = [
    {
        "top_k_words": 50000,
        "hidden_size": 128,
        "dropout_prob": 0.2,
        "learning_rate": 0.01,
        "n_epochs": 3,
        "glove_path": "/home/zphang/data/vector_cache/glove.6B.100d.txt",
    },
    {
        "top_k_words": 50000,
        "hidden_size": 256,
        "dropout_prob": 0.2,
        "learning_rate": 0.01,
        "n_epochs": 3,
        "glove_path": "/home/zphang/data/vector_cache/glove.twitter.27B.100d.txt",
    },
    {
        "top_k_words": 50000,
        "hidden_size": 512,
        "dropout_prob": 0.2,
        "learning_rate": 0.01,
        "n_epochs": 3,
        "glove_path": "/home/zphang/data/vector_cache/glove.twitter.27B.100d.txt",
    },
    {
        "top_k_words": 50000,
        "hidden_size": 128,
        "dropout_prob": 0.2,
        "learning_rate": 0.01,
        "n_epochs": 3,
        "glove_path": "/home/zphang/data/vector_cache/glove.6B.100d.txt",
    },
    {
        "top_k_words": 50000,
        "hidden_size": 256,
        "dropout_prob": 0.2,
        "learning_rate": 0.01,
        "n_epochs": 3,
        "glove_path": "/home/zphang/data/vector_cache/glove.twitter.27B.100d.txt",
    },
    {
        "top_k_words": 50000,
        "hidden_size": 512,
        "dropout_prob": 0.2,
        "learning_rate": 0.01,
        "n_epochs": 3,
        "glove_path": "/home/zphang/data/vector_cache/glove.twitter.27B.100d.txt",
    },
]

In [9]:
model_store = []
val_auc_ls = []

In [10]:
imp.reload(src.deep)

<module 'src.deep' from '/home/zphang/nyu/ML/project/ml_project/src/deep.py'>

In [11]:
for i, param_dict in enumerate(param_dict_ls):
    print(f"Model {i}")
    model, word_embeddings, dictionary, val_loss_log = \
        src.deep.train_model(
            param_dict=param_dict, 
            device=device, 
            full_word_srs=full_word_srs,
            train_corpus=train_corpus, val_corpus=val_corpus,
            batch_size=BATCH_SIZE, log_step=LOG_STEP,
        )
    model_store.append(
        (model, word_embeddings, dictionary, val_loss_log)
    )
    val_prob = src.deep.inference(
        corpus=val_corpus,
        model=model,
        word_embeddings=word_embeddings,
        max_batch_size=BATCH_SIZE,
        dictionary=dictionary,
        device=device,
    )
    val_auc = np.mean(src.deep.get_auc(y_val, val_prob))
    print(f"Val AUC: {val_auc}")
    val_auc_ls.append(val_auc)
    print("")

Model 0
EPOCH 0: 0.09290471070993475, 2018-05-02 03:46:24.225638
EPOCH 1: 0.0842719744705796, 2018-05-02 03:48:04.985622
EPOCH 2: 0.08480384096350577, 2018-05-02 03:49:36.172968
Val AUC: 0.9425365676687183

Model 1
EPOCH 0: 0.08722765982419975, 2018-05-02 03:51:27.130310
EPOCH 1: 0.0930436189365378, 2018-05-02 03:53:09.187963
EPOCH 2: 0.08825137105902255, 2018-05-02 03:54:41.922794
Val AUC: 0.9346616089825189

Model 2
EPOCH 0: 0.08789740821469556, 2018-05-02 03:56:43.969644
EPOCH 1: 0.0915111405061153, 2018-05-02 03:58:37.818996
EPOCH 2: 0.09746758881143905, 2018-05-02 04:00:22.478743
Val AUC: 0.9314097399574833

Model 3
EPOCH 0: 0.09025667597663864, 2018-05-02 04:02:10.998766
EPOCH 1: 0.08597316993288978, 2018-05-02 04:03:51.558688
EPOCH 2: 0.09338220154061497, 2018-05-02 04:05:24.983167
Val AUC: 0.9403123525785649

Model 4
EPOCH 0: 0.08967170825402838, 2018-05-02 04:07:17.041671
EPOCH 1: 0.09491756246705069, 2018-05-02 04:08:59.833029
EPOCH 2: 0.09801841695348411, 2018-05-02 04:10:32

In [12]:
val_prob = src.deep.inference(
    corpus=val_corpus,
    encoder=model,
    word_embeddings=word_embeddings,
    max_batch_size=BATCH_SIZE,
    dictionary=dictionary,
    device=device,
)

TypeError: inference() got an unexpected keyword argument 'encoder'

In [None]:
src.deep.plot_roc(y_val, val_prob, target_cols)