In [2]:
# Widen the notebook container to the full browser width so long outputs are easier to read.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Inspiration for code:

- https://developers.google.com/machine-learning/guides/text-classification/
- https://github.com/google/eng-edu/blob/master/ml/guides/text_classification/load_data.py

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import time
import pickle
import gc
import re

import numpy as np
import pandas as pd

from importlib import reload

In [4]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


In [5]:
# Import TensorFlow and echo its version so the environment used for this run
# is recorded together with the outputs.
import tensorflow as tf
tf.__version__

'1.13.0-rc2'

## Load the data

In [6]:
# Project root; the data, log and model paths in this notebook are all built from it.
# It can be overridden through the W266_MAIN_DIR environment variable so the notebook
# is not tied to a single machine; the original location remains the default.
# Joining with '' guarantees a trailing separator, which the later string
# concatenations of the form main_dir + 'Classification/...' rely on.
main_dir = os.path.join(
    os.environ.get('W266_MAIN_DIR') or '/home/yulia/W266-Final-Project/',
    ''
)

In [7]:
# Directory holding the tokenized arrays and the word index for this task.
token_path = os.path.join(main_dir, 'Classification/data/tokenized/ethnicity')

In [8]:
# Load the full-speech arrays of the three splits from token_path
# (order: train, validation, test).
train_vec, val_vec, test_vec = (
    np.load(os.path.join(token_path, fname))
    for fname in ('train_vec.npy', 'val_vec.npy', 'test_vec.npy')
)

In [9]:
# Load the pickled word index stored in the same directory as the tokenized arrays.
# The context manager closes the file handle once the object is read.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join(token_path, 'word_index.p'), 'rb') as fp:
    word_index = pickle.load(fp)

In [10]:
# Number of examples in the train / validation / test arrays.
tuple(len(split) for split in (train_vec, val_vec, test_vec))

(188520, 62840, 62840)

In [14]:
# Directory holding the pickled split files (targets, ids, speech lists).
outdata_path = os.path.join(main_dir, 'Classification/data/splits/ethnicity')

In [15]:
def _unpickle(path):
    """Return the object stored in the pickle file at `path`, closing the file afterwards."""
    with open(path, 'rb') as fp:
        return pickle.load(fp)


# Targets of the three splits, read in the order train, validation, test.
train_target, val_target, test_target = (
    _unpickle(os.path.join(outdata_path, fname))
    for fname in ('train_target', 'val_target', 'test_target')
)

In [16]:
# Sum of the targets per split (the number of positive examples if labels are 0/1 — TODO confirm).
tuple(sum(labels) for labels in (train_target, val_target, test_target))

(94260, 31420, 31420)

## CNN model - Full Speech + GloVe Embeddings

In [11]:
# Base directory passed to train_model.train_model as its second positional argument.
logs_base_dir = os.path.join(main_dir, 'Classification/logs/ethnicity')

In [12]:
# Settings for the CNN runs. The dict is unpacked with ** into
# train_model.train_model by both training cells (full speech and chunks).
# max_num_words x embedding_dim matches the (20000, 200) embedding matrix
# reported in the training output.
cnn_model_params = dict(
    model_type='cnn',
    word_index=word_index,
    learning_rate=0.001,
    layers=2,
    epochs=1000,
    batch_size=128,
    filters=64,
    dropout_rate=0.2,
    embedding_dim=200,
    kernel_size=5,
    pool_size=1,
    max_num_words=20000,
    use_pretrained_embedding=True,
    is_embedding_trainable=True,
    glove_dir=main_dir + 'data',
)

In [17]:
# Re-import helpers.train_model so edits to the module are picked up without a kernel restart.
reload(train_model)

# Train on the full-speech arrays; the call returns the training history, the fitted
# model and the predicted probabilities for the train and validation data.
history, model, train_pred_probs, val_pred_probs = train_model.train_model(
    (
        (train_vec, train_target),
        (val_vec, val_target),
    ),
    logs_base_dir,
    **cnn_model_params
)

Found 400000 word vectors.
It took 21.8 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.1 seconds
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 188520 samples, validate on 62840 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1000
 - 1276s - loss: 0.6008 - acc: 0.6695 - val_loss: 0.5541 - val_acc: 0.7139
Epoch 2/1000
 - 1269s - loss: 0.5469 - acc: 0.7198 - val_loss: 0.5323 - val_acc: 0.7311
Epoch 3/1000
 - 1351s - loss: 0.5196 - acc: 0.7403 - val_loss: 0.5269 - val_acc: 0.7372
Epoch 4/1000
 - 1298s - loss: 0.4968 - acc: 0.7562 - val_loss: 0.5195 - val_acc: 0.7392
Epoch 5/1000
 - 1297s - loss: 0.4762 - acc: 0.7703 - val_loss: 0.5263 - val_acc: 0.7348
Epoch 6/1000
 - 1296s - loss: 0.4543 - a

### Save the model; score and save test and validation predictions

In [18]:
# Directory for saved models (trailing separator kept: a later cell concatenates file names onto it).
model_dir = os.path.join(main_dir, 'Classification/model/ethnicity/')
# Persist the full-speech CNN.
model.save(os.path.join(model_dir, 'cnn_model.h5'))

In [19]:
# Predicted probabilities for the test split, kept for saving further down.
test_pred_probs = model.predict(x=test_vec)
# Evaluate on the test split; the displayed result is a two-element list
# (presumably [loss, accuracy], matching the metrics in the training log).
model.evaluate(x=test_vec, y=test_target, batch_size=128)



[0.5273021834849704, 0.73658496]

In [20]:
# Directory for saved prediction probabilities (trailing separator kept: later cells append file names to it).
probs_path = os.path.join(main_dir, 'Classification/data/probs/ethnicity/')

In [21]:
# Save the validation and test probabilities of the full-speech CNN (np.save appends '.npy').
# `allow_pickle=True` / `fix_imports=True` were the defaults and are omitted;
# `fix_imports` is deprecated in recent NumPy releases.
np.save(os.path.join(probs_path, 'val_pred_probs_cnn'), val_pred_probs)
np.save(os.path.join(probs_path, 'test_pred_probs_cnn'), test_pred_probs)

## CNN - chunk

### Load preprocessed data

In [22]:
# Directory of the pickled split files; same value as the earlier outdata_path assignment,
# repeated so the chunk section can be run from here.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/ethnicity')

In [23]:
# Read the pickled speech lists and their ids for each split, in the order
# train, validation, test.
split_objects = []
for split_file in ('train_list', 'train_ids', 'val_list', 'val_ids', 'test_list', 'test_ids'):
    with open(os.path.join(outdata_path, split_file), 'rb') as fp:
        split_objects.append(pickle.load(fp))

train, train_ids, val, val_ids, test, test_ids = split_objects

In [24]:
# Value passed as `max_len` to preprocess_data.split_speech_to_chunks.
chunk_len = 50

In [25]:
# Re-import helpers.preprocess_data so edits to the module are picked up without a kernel restart.
reload(preprocess_data)
start_time = time.time()

# Split the speeches of each split into chunks (max_len=chunk_len). Each call returns
# the chunks plus chunk-level ids and targets; the helper prints its own sanity checks.
train_chunk, train_ids_chunk, train_target_chunk = preprocess_data.split_speech_to_chunks(
    train, train_ids, train_target, max_len=chunk_len
)
val_chunk, val_ids_chunk, val_target_chunk = preprocess_data.split_speech_to_chunks(
    val, val_ids, val_target, max_len=chunk_len
)
test_chunk, test_ids_chunk, test_target_chunk = preprocess_data.split_speech_to_chunks(
    test, test_ids, test_target, max_len=chunk_len
)

# The message names the work this cell actually does (chunking, not dictionary creation).
print("\nIt took {:.1f} seconds to split the speeches into chunks".format(time.time()-start_time))

Original data has 188520 speeches
It was split into 1507038 chunks
Checks on ids and target 1507038 1507038
Original target mean 0.5
New target mean 0.49381103860685666
Original data has 62840 speeches
It was split into 503395 chunks
Checks on ids and target 503395 503395
Original target mean 0.5
New target mean 0.4916020222687949
Original data has 62840 speeches
It was split into 507051 chunks
Checks on ids and target 507051 507051
Original target mean 0.5
New target mean 0.49672715367882125

It took 11.2 seconds to create the dictionary


In [None]:
# Persist the chunk-level ids of the validation and test splits in outdata_path.
# Each file receives the ids of its own split.
with open(os.path.join(outdata_path, 'val_ids_chunk'), 'wb') as fp:
    pickle.dump(val_ids_chunk, fp)
with open(os.path.join(outdata_path, 'test_ids_chunk'), 'wb') as fp:
    pickle.dump(test_ids_chunk, fp)

In [26]:
# Load the chunk-level arrays of the three splits.
# NOTE: this rebinds train_vec / val_vec / test_vec, replacing the full-speech
# arrays loaded earlier in the notebook.
train_vec, val_vec, test_vec = (
    np.load(os.path.join(token_path, fname))
    for fname in ('train_vec_chunk.npy', 'val_vec_chunk.npy', 'test_vec_chunk.npy')
)

### Run the model

In [27]:
# Train the same CNN configuration on the chunk-level data.
# NOTE: `model` is rebound here, so from this point on it refers to the chunk model.
history_chunk, model, train_pred_probs_chunk, val_pred_probs_chunk = train_model.train_model(
    (
        (train_vec, train_target_chunk),
        (val_vec, val_target_chunk),
    ),
    logs_base_dir,
    **cnn_model_params
)

Found 400000 word vectors.
It took 21.8 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.1 seconds
Train on 1507038 samples, validate on 503395 samples
Epoch 1/1000
 - 1012s - loss: 0.6161 - acc: 0.6540 - val_loss: 0.6037 - val_acc: 0.6689
Epoch 2/1000
 - 943s - loss: 0.5930 - acc: 0.6776 - val_loss: 0.5986 - val_acc: 0.6731
Epoch 3/1000
 - 1015s - loss: 0.5817 - acc: 0.6881 - val_loss: 0.5983 - val_acc: 0.6749
Epoch 4/1000
 - 1480s - loss: 0.5714 - acc: 0.6971 - val_loss: 0.5990 - val_acc: 0.6734
Epoch 5/1000
 - 1492s - loss: 0.5609 - acc: 0.7058 - val_loss: 0.5968 - val_acc: 0.6746
Epoch 6/1000
 - 1490s - loss: 0.5509 - acc: 0.7135 - val_loss: 0.5984 - val_acc: 0.6748
Epoch 7/1000
 - 1488s - loss: 0.5406 - acc: 0.7216 - val_loss: 0.6016 - val_acc: 0.6741
Validation accuracy: 0.674077033996582, loss: 0.6016489580007957


### Score test

In [28]:
# Persist the chunk-level CNN next to the full-speech model.
model.save(os.path.join(model_dir, 'cnn_chunk_model.h5'))

In [29]:
# Predicted probabilities for every test chunk, aggregated per speech further down.
test_pred_probs_chunk = model.predict(x=test_vec)
# Chunk-level evaluation on the test split; the displayed result is a two-element list
# (presumably [loss, accuracy], matching the metrics in the training log).
model.evaluate(x=test_vec, y=test_target_chunk, batch_size=128)



[0.6001986647931018, 0.67413926]

In [30]:
# Save the chunk-level validation and test probabilities (np.save appends '.npy').
# `allow_pickle=True` / `fix_imports=True` were the defaults and are omitted;
# `fix_imports` is deprecated in recent NumPy releases.
np.save(os.path.join(probs_path, 'val_pred_probs_cnn_chunk'), val_pred_probs_chunk)
np.save(os.path.join(probs_path, 'test_pred_probs_cnn_chunk'), test_pred_probs_chunk)

### Aggregate validation sample

In [32]:
# Collapse chunk-level validation predictions to one row per speech id.
val_preds_chunk = val_pred_probs_chunk > 0.5
pred_df = pd.DataFrame({
    'ids': val_ids_chunk,
    'target': val_target_chunk,
    'probs': val_pred_probs_chunk.flatten(),
    'preds': val_preds_chunk.flatten(),
})

# Min / max / mean chunk probability per speech. String names are used instead of the
# builtins min/max: they select the same groupby reductions without the warning that
# recent pandas versions emit for builtin callables.
pred_aggr_df = pred_df.groupby('ids')['probs'].agg(['min', 'max', 'mean']).reset_index()
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']

# Per-speech mean of the chunk targets and of the hard chunk predictions
# (the latter is the share of chunks predicted positive).
pred_aggr_df = pred_aggr_df.merge(pred_df[['target', 'ids']].groupby('ids').mean().reset_index(), on='ids')
pred_aggr_df = pred_aggr_df.merge(pred_df[['preds', 'ids']].groupby('ids').mean().reset_index(), on='ids')

# Two speech-level decisions: threshold the mean probability, or take a majority vote
# over the chunk predictions. A value of exactly 0.5 maps to 0 in both.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype(int)
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype(int)
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000470,0.191976,0.48997,0.340973,1,0.0,0,0
1,1000000543,0.103388,0.491666,0.261479,0,0.0,0,0
2,1000000888,0.330469,0.944494,0.616547,1,0.666667,1,1
3,1000000889,0.273495,0.946655,0.754324,1,0.75,1,1
4,1000000904,0.03257,0.751707,0.499733,1,0.6,0,1


In [33]:
# Bring the aggregated predictions back into the order of the original validation split
# (an inner merge keeps the order of the left keys).
original_df = pd.DataFrame({'target_orig': val_target, 'ids': val_ids})
original_df = original_df.merge(pred_aggr_df, on="ids")

# Later cells compare and save these rows positionally against the original split,
# so a speech dropped by the inner merge must not go unnoticed.
assert len(original_df) == len(val_ids), "merge changed the number of validation speeches"
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,1010243717,0.276729,0.518066,0.403256,1,0.25,0,0
1,1,1120097305,0.173403,0.886079,0.566222,1,0.5,1,0
2,1,1020183502,0.425271,0.425271,0.425271,1,0.0,0,0
3,1,1080187463,0.148529,0.803117,0.421413,1,0.444444,0,0
4,1,1090010764,0.166191,0.900182,0.547798,1,0.5,1,0


In [34]:
# Speech-level validation outputs, in the row order of original_df:
# the thresholded mean probability (hard prediction) and the mean probability itself.
val_pred_aggr_chunk = original_df['preds_probs_mean']
val_pred_probs_aggr_chunk = original_df['probs_mean']

In [35]:
# Save the per-speech mean validation probabilities (np.save appends '.npy').
# `allow_pickle=True` / `fix_imports=True` were the defaults and are omitted;
# `fix_imports` is deprecated in recent NumPy releases.
np.save(os.path.join(probs_path, 'val_pred_probs_cnn_aggr_chunk'), val_pred_probs_aggr_chunk)

In [43]:
# Speech-level validation accuracy: share of speeches whose thresholded mean chunk
# probability equals the original label (the comparison is positional).
np.mean(np.asarray(val_target) == np.asarray(val_pred_aggr_chunk))

0.74484404837683

### Aggregate test sample

In [36]:
# Collapse chunk-level test predictions to one row per speech id.
test_preds_chunk = test_pred_probs_chunk > 0.5
pred_df = pd.DataFrame({
    'ids': test_ids_chunk,
    'target': test_target_chunk,
    'probs': test_pred_probs_chunk.flatten(),
    'preds': test_preds_chunk.flatten(),
})

# Min / max / mean chunk probability per speech. String names are used instead of the
# builtins min/max: they select the same groupby reductions without the warning that
# recent pandas versions emit for builtin callables.
pred_aggr_df = pred_df.groupby('ids')['probs'].agg(['min', 'max', 'mean']).reset_index()
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']

# Per-speech mean of the chunk targets and of the hard chunk predictions
# (the latter is the share of chunks predicted positive).
pred_aggr_df = pred_aggr_df.merge(pred_df[['target', 'ids']].groupby('ids').mean().reset_index(), on='ids')
pred_aggr_df = pred_aggr_df.merge(pred_df[['preds', 'ids']].groupby('ids').mean().reset_index(), on='ids')

# Two speech-level decisions: threshold the mean probability, or take a majority vote
# over the chunk predictions. A value of exactly 0.5 maps to 0 in both.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype(int)
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype(int)
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000141,0.315687,0.798025,0.583022,1,0.736842,1,1
1,1000000591,0.07893,0.530047,0.248486,0,0.066667,0,0
2,1000000699,0.205669,0.292894,0.249282,0,0.0,0,0
3,1000000747,0.291679,0.433615,0.362647,0,0.0,0,0
4,1000000750,0.326837,0.986345,0.656591,0,0.5,1,0


In [37]:
# Bring the aggregated predictions back into the order of the original test split
# (an inner merge keeps the order of the left keys).
original_df = pd.DataFrame({'target_orig': test_target, 'ids': test_ids})
original_df = original_df.merge(pred_aggr_df, on="ids")

# The saved probabilities are meant to line up row-for-row with the original split,
# so a speech dropped by the inner merge must not go unnoticed.
assert len(original_df) == len(test_ids), "merge changed the number of test speeches"
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,980211229,0.511725,0.725628,0.652895,1,1.0,1,1
1,1,1120080577,0.274669,0.741413,0.546952,1,0.714286,1,1
2,1,970087805,0.175938,0.937863,0.622395,1,0.666667,1,1
3,1,1070071796,0.620812,0.620812,0.620812,1,1.0,1,1
4,1,1130075886,0.367619,0.922119,0.639309,1,0.647059,1,1


In [38]:
# Speech-level test outputs, in the row order of original_df:
# the thresholded mean probability (hard prediction) and the mean probability itself.
test_pred_aggr_chunk = original_df['preds_probs_mean']
test_pred_probs_aggr_chunk = original_df['probs_mean']

In [39]:
# Save the per-speech mean test probabilities (np.save appends '.npy').
# `allow_pickle=True` / `fix_imports=True` were the defaults and are omitted;
# `fix_imports` is deprecated in recent NumPy releases.
np.save(os.path.join(probs_path, 'test_pred_probs_cnn_aggr_chunk'), test_pred_probs_aggr_chunk)