In [1]:
# Increase cell width: inject a CSS rule so the notebook container spans the
# full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Inspiration for code:

- https://developers.google.com/machine-learning/guides/text-classification/
- https://github.com/google/eng-edu/blob/master/ml/guides/text_classification/load_data.py

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import time
import pickle

import numpy as np
import pandas as pd

from importlib import reload

import scipy.sparse

In [3]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


## Load the data

In [6]:
# Root directory of the project checkout. All data, log and model paths in
# this notebook are derived from it.
# The location can be overridden with the W266_PROJECT_DIR environment
# variable so the notebook runs on other machines; the original absolute path
# is kept as the default. Joining with '' guarantees a trailing separator,
# which the path concatenations elsewhere in this notebook rely on.
main_dir = os.path.join(
    os.environ.get('W266_PROJECT_DIR', '/home/yulia/W266-Final-Project/'),
    ''
)

In [7]:
# Directory holding the vectorized train/val/test matrices (.npz files).
# os.path.join is used instead of string concatenation so the result is
# correct whether or not main_dir ends with a path separator.
vecdata_path = os.path.join(main_dir, 'Classification/data/vectorized/party')

In [8]:
def _load_sparse(npz_name):
    """Load the sparse matrix stored in the .npz file `npz_name` under `vecdata_path`."""
    return scipy.sparse.load_npz(os.path.join(vecdata_path, npz_name))


# One sparse feature matrix per split, loaded in train / val / test order.
train_vec = _load_sparse('train_vec.npz')
val_vec = _load_sparse('val_vec.npz')
test_vec = _load_sparse('test_vec.npz')

In [9]:
# Show the shape of each split's matrix as a (train, val, test) tuple.
tuple(matrix.shape for matrix in (train_vec, val_vec, test_vec))

((303459, 10000), (101153, 10000), (101154, 10000))

In [10]:
# Directory holding the pickled train/val/test split files.
# os.path.join is used instead of string concatenation so the result is
# correct whether or not main_dir ends with a path separator.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/party')

In [11]:
def _load_pickle(file_name):
    """Unpickle and return the object stored in `file_name` under `outdata_path`.

    NOTE(review): `pickle.load` can execute arbitrary code while loading, so
    this must only be pointed at split files produced by this project.
    """
    with open(os.path.join(outdata_path, file_name), 'rb') as fp:
        return pickle.load(fp)


# Every split is stored as three pickles: <split>_list, <split>_ids and
# <split>_target. A single helper replaces nine copy-pasted open/load stanzas.
train = _load_pickle('train_list')
train_ids = _load_pickle('train_ids')
train_target = _load_pickle('train_target')

val = _load_pickle('val_list')
val_ids = _load_pickle('val_ids')
val_target = _load_pickle('val_target')

test = _load_pickle('test_list')
test_ids = _load_pickle('test_ids')
test_target = _load_pickle('test_target')

In [12]:
# Sum of the target values in each split, shown as a (train, val, test) tuple.
# NOTE(review): presumably the number of positive labels, assuming targets
# are 0/1 -- confirm against the split-generation code.
tuple(map(sum, (train_target, val_target, test_target)))

(161472, 53824, 53825)

### Model: Multi-Layer Perceptron (Vanilla NN)

In [14]:
# Base directory passed to train_model.train_model for its logs.
# os.path.join is used instead of string concatenation so the result is
# correct whether or not main_dir ends with a path separator.
logs_base_dir = os.path.join(main_dir, 'Classification/logs/Party')

In [15]:
# Hyperparameters that are unpacked as keyword arguments into
# train_model.train_model when the model is trained.
# NOTE(review): the recorded run stopped after epoch 7, so `epochs`
# presumably acts as an upper bound with early stopping inside the helper --
# confirm in helpers.train_model.
ngram_model_params = dict(
    model_type='mlp',
    learning_rate=0.001,
    epochs=1000,
    batch_size=128,
    layers=2,
    units=64,
    dropout_rate=0.2,
)

In [16]:
# Re-import the train_model helper module so that edits made to it since the
# first import are picked up without restarting the kernel.
reload(train_model)

# Pair each feature matrix with its targets, then train on (train, val).
train_split = (train_vec, train_target)
val_split = (val_vec, val_target)
history, model, train_pred_probs, val_pred_probs = train_model.train_model(
    (train_split, val_split),
    logs_base_dir,
    **ngram_model_params
)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Colocations handled automatically by placer.
Train on 303459 samples, validate on 101153 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/1000
 - 66s - loss: 0.6007 - acc: 0.6563 - val_loss: 0.5788 - val_acc: 0.6717
Epoch 2/1000
 - 61s - loss: 0.5644 - acc: 0.6831 - val_loss: 0.5707 - val_acc: 0.6779
Epoch 3/1000
 - 61s - loss: 0.5527 - acc: 0.6948 - val_loss: 0.5687 - val_acc: 0.6808
Epoch 4/1000
 - 62s - loss: 0.5424 - acc: 0.7034 - val_loss: 0.5668 - val_acc: 0.6830
Epoch 5/1000
 - 62s - loss: 0.5330 - acc: 0.7128 - val_loss: 0.5644 - val_acc: 0.6856
Epoch 6/1000
 - 62s - loss: 0.5219 - acc: 0.7225 - val_loss: 0.5645 - val_acc: 0.6880
Epoch 7/1000
 - 63s - loss: 0.5113 - acc: 0.7323 - val_loss: 0.5644 - val_acc: 0.6895
Validation accuracy: 0.689509928226471, loss: 0.5644221771604024


### Score the test set and save the model and predictions

In [17]:
# Save the trained model to <model_dir>/mlp_ngram_model.h5.
# The trailing slash in the joined path is kept so that any later
# `model_dir + name` concatenation still works.
model_dir = os.path.join(main_dir, 'Classification/model/party/')
# Create the output directory if needed, so a missing directory does not
# make the save fail after a long training run.
os.makedirs(model_dir, exist_ok=True)
model.save(os.path.join(model_dir, 'mlp_ngram_model.h5'))

In [18]:
# Model predictions for the test split; these are written to disk in a later cell.
test_pred_probs = model.predict(test_vec)

# Evaluate on the test split. The bare name on the last line makes the
# notebook display the returned metrics.
# NOTE(review): presumably [loss, accuracy], going by the recorded output --
# confirm against the metrics compiled in helpers.train_model.
test_scores = model.evaluate(test_vec, test_target, batch_size=128)
test_scores



[0.5577042196008953, 0.69091684]

In [19]:
# Directory where the predicted probabilities are written.
# os.path.join is used instead of string concatenation so the result is
# correct whether or not main_dir ends with a path separator. The trailing
# slash is kept because the save cell appends file names to this string.
probs_path = os.path.join(main_dir, 'Classification/data/probs/party/')

In [20]:
# Write the validation and test predictions to disk. np.save appends the
# ".npy" extension to the given file names.
# The output directory is created first so a missing directory does not
# raise. The `fix_imports` argument was dropped: it has no effect on
# Python 3 and is deprecated in recent NumPy releases.
os.makedirs(probs_path, exist_ok=True)
np.save(os.path.join(probs_path, 'val_pred_probs_mlp_ngram'), val_pred_probs, allow_pickle=True)
np.save(os.path.join(probs_path, 'test_pred_probs_mlp_ngram'), test_pred_probs, allow_pickle=True)