In [5]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Inspiration for code:

- https://developers.google.com/machine-learning/guides/text-classification/
- https://github.com/google/eng-edu/blob/master/ml/guides/text_classification/load_data.py

In [6]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import time
import pickle

import numpy as np
import pandas as pd

from importlib import reload

import scipy.sparse

In [7]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


## Load the data

In [2]:
# Project root directory. Override it by setting the W266_MAIN_DIR environment
# variable; otherwise the original default location is used.
# Joining with '' guarantees a trailing path separator, which the
# `main_dir + '...'` concatenations elsewhere in this notebook depend on.
main_dir = os.path.join(
    os.environ.get('W266_MAIN_DIR', '/home/yulia/W266-Final-Project/'), '')

In [3]:
# Directory holding the vectorized train/val/test matrices (.npz files).
# os.path.join is used (as in the loading cell) so the result does not depend
# on whether `main_dir` ends with a path separator.
vecdata_path = os.path.join(main_dir, 'Classification/data/vectorized/ethnicity')

In [8]:
# Load the three sparse matrices (train, validation, test) from `vecdata_path`,
# in that order, and bind them to their respective names.
train_vec, val_vec, test_vec = (
    scipy.sparse.load_npz(os.path.join(vecdata_path, npz_name))
    for npz_name in ('train_vec.npz', 'val_vec.npz', 'test_vec.npz')
)

In [9]:
# Sanity check: shapes of the train / validation / test matrices, in that order.
tuple(matrix.shape for matrix in (train_vec, val_vec, test_vec))

((188520, 10000), (62840, 10000), (62840, 10000))

In [11]:
# Directory holding the pickled train/val/test splits.
# os.path.join keeps the path valid whether or not `main_dir` ends with a
# path separator.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/ethnicity')

In [13]:
def _load_pickle(filename):
    """Unpickle and return the object stored in ``outdata_path/filename``.

    Args:
        filename: name of a pickle file located inside ``outdata_path``.

    Returns:
        The object that was pickled into that file.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only use it on files produced by this project.
    with open(os.path.join(outdata_path, filename), 'rb') as fp:
        return pickle.load(fp)


# Each split is stored as three pickles: <split>_list, <split>_ids and
# <split>_target. Files are read in the same order as before
# (train, then val, then test).
train, train_ids, train_target = map(
    _load_pickle, ('train_list', 'train_ids', 'train_target'))
val, val_ids, val_target = map(
    _load_pickle, ('val_list', 'val_ids', 'val_target'))
test, test_ids, test_target = map(
    _load_pickle, ('test_list', 'test_ids', 'test_target'))

In [14]:
# Sum of the target values in each split (train, validation, test).
tuple(map(sum, (train_target, val_target, test_target)))

(94260, 31420, 31420)

### Model: Multi-Layer Perceptron (Vanilla NN)

In [15]:
# Base directory passed to the training helper for its logs.
# os.path.join keeps the path valid whether or not `main_dir` ends with a
# path separator.
logs_base_dir = os.path.join(main_dir, 'Classification/logs/Ethnicity')

In [16]:
# Keyword arguments that are forwarded to `train_model.train_model(...)`
# when the MLP is trained.
ngram_model_params = dict(
    model_type='mlp',
    learning_rate=0.001,
    epochs=1000,
    batch_size=128,
    layers=2,
    units=64,
    dropout_rate=0.2,
)

In [17]:
# Re-import the helper module so that edits to it are picked up without
# restarting the kernel.
reload(train_model)

# Train on the (features, targets) pairs for the training and validation
# splits; the hyper-parameters are supplied through `ngram_model_params`.
history, model, train_pred_probs, val_pred_probs = train_model.train_model(
    ((train_vec, train_target), (val_vec, val_target)),
    logs_base_dir,
    **ngram_model_params
)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Colocations handled automatically by placer.
Train on 188520 samples, validate on 62840 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/1000
 - 29s - loss: 0.5523 - acc: 0.7160 - val_loss: 0.5168 - val_acc: 0.7404
Epoch 2/1000
 - 28s - loss: 0.5099 - acc: 0.7431 - val_loss: 0.5090 - val_acc: 0.7438
Epoch 3/1000
 - 29s - loss: 0.4955 - acc: 0.7519 - val_loss: 0.5051 - val_acc: 0.7471
Epoch 4/1000
 - 28s - loss: 0.4849 - acc: 0.7591 - val_loss: 0.5030 - val_acc: 0.7476
Epoch 5/1000
 - 28s - loss: 0.4738 - acc: 0.7673 - val_loss: 0.5004 - val_acc: 0.7519
Epoch 6/1000
 - 28s - loss: 0.4642 - acc: 0.7745 - val_loss: 0.4983 - val_acc: 0.7549
Epoch 7/1000
 - 29s - loss: 0.4521 - acc: 0.7823 - val_loss: 0.4996 - val_acc: 0.7549
Epoch 8/1000
 - 25s - loss: 0.4394 - acc: 0.7897 - val_loss: 0.4981 - val_acc: 0.7561
Epoch 9/1000
 - 28s - loss

### Save the model, score the test set, and save validation/test predictions

In [18]:
# Directory for saved models. It is created on demand so that this cell also
# works on a fresh checkout where the directory does not exist yet.
model_dir = os.path.join(main_dir, 'Classification/model/ethnicity/')
os.makedirs(model_dir, exist_ok=True)

# Persist the trained model to an .h5 file inside `model_dir`.
model.save(os.path.join(model_dir, 'mlp_ngram_model.h5'))

In [20]:
# Predictions of the trained model on the held-out test matrix.
test_pred_probs = model.predict(x=test_vec)

# Evaluate on the test split; the returned value is displayed as the cell output.
model.evaluate(
    x=test_vec,
    y=test_target,
    batch_size=128
)



[0.5040547801536182, 0.7515913]

In [22]:
# Directory where predicted probabilities are written.
# The trailing slash in the literal is kept on purpose: the saving cell builds
# file names by appending to this string.
probs_path = os.path.join(main_dir, 'Classification/data/probs/ethnicity/')

In [23]:
# Make sure the output directory exists before writing into it.
os.makedirs(probs_path, exist_ok=True)

# Save the validation and test predictions; np.save appends the '.npy'
# extension to the given file name.
# `allow_pickle=True` is already np.save's default and `fix_imports` is a
# deprecated Python 2 compatibility flag, so neither is passed explicitly.
np.save(os.path.join(probs_path, 'val_pred_probs_mlp_ngram'), val_pred_probs)
np.save(os.path.join(probs_path, 'test_pred_probs_mlp_ngram'), test_pred_probs)