In [4]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Inspiration for code:

- https://developers.google.com/machine-learning/guides/text-classification/
- https://github.com/google/eng-edu/blob/master/ml/guides/text_classification/load_data.py

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import time
import pickle
import gc
import re

import numpy as np
import pandas as pd

from importlib import reload

In [2]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


In [3]:
import tensorflow as tf
# Bare last expression: shows the installed TensorFlow version in the cell output.
tf.__version__

'1.13.0-rc2'

## Load the data

In [4]:
# Project root. Defaults to the original location, but can be overridden with
# the W266_MAIN_DIR environment variable so the notebook runs on other machines.
# Joining with '' guarantees a trailing separator, which later cells rely on
# when they build paths with `main_dir + '...'`.
main_dir = os.path.join(os.environ.get('W266_MAIN_DIR', '/home/yulia/W266-Final-Project/'), '')

In [5]:
# Directory holding the tokenized arrays and the word index.
# os.path.join keeps the path valid whether or not main_dir ends with a separator.
token_path = os.path.join(main_dir, 'Classification/data/tokenized/gender')

In [6]:
# Load the train / validation / test arrays from token_path, in that order.
train_vec, val_vec, test_vec = (
    np.load(os.path.join(token_path, vec_file))
    for vec_file in ('train_vec.npy', 'val_vec.npy', 'test_vec.npy')
)

In [7]:
# Load the pickled word index. The context manager closes the file handle,
# which the previous `pickle.load(open(...))` form left open.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(token_path, 'word_index.p'), 'rb') as fp:
    word_index = pickle.load(fp)

In [8]:
# Number of rows in each split, shown as (train, val, test).
tuple(map(len, (train_vec, val_vec, test_vec)))

(188148, 62716, 62716)

In [12]:
# Directory with the pickled data splits.
# os.path.join keeps the path valid whether or not main_dir ends with a separator.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/gender/')

In [13]:
# Read the pickled target lists for the three splits from outdata_path.
with open(os.path.join(outdata_path, 'train_target'), 'rb') as train_fp, \
        open(os.path.join(outdata_path, 'val_target'), 'rb') as val_fp, \
        open(os.path.join(outdata_path, 'test_target'), 'rb') as test_fp:
    train_target = pickle.load(train_fp)
    val_target = pickle.load(val_fp)
    test_target = pickle.load(test_fp)

In [14]:
# Sum of the target values in each split, shown as (train, val, test).
tuple(sum(labels) for labels in (train_target, val_target, test_target))

(94074, 31358, 31358)

## CNN model - Full Speech + Glove Embeddings

In [9]:
# Base directory handed to train_model.train_model for its logs.
# os.path.join keeps the path valid whether or not main_dir ends with a separator.
logs_base_dir = os.path.join(main_dir, 'Classification/logs/gender')

In [10]:
# Settings for the CNN runs; the training cells unpack this dict as keyword
# arguments to train_model.train_model.
cnn_model_params = dict(
    model_type='cnn',
    word_index=word_index,
    learning_rate=0.001,
    layers=2,
    epochs=1000,
    batch_size=128,
    filters=64,
    dropout_rate=0.2,
    embedding_dim=200,
    kernel_size=5,
    pool_size=1,
    max_num_words=20000,
    use_pretrained_embedding=True,
    is_embedding_trainable=True,
    glove_dir=main_dir + 'data',
)

In [15]:
# Pick up any edits to helpers/train_model.py, then train on the full speeches.
reload(train_model)
history, model, train_pred_probs, val_pred_probs = train_model.train_model(
    ((train_vec, train_target), (val_vec, val_target)),
    logs_base_dir,
    **cnn_model_params
)

Found 400000 word vectors.
It took 36.4 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.2 seconds
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 188148 samples, validate on 62716 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1000
 - 2018s - loss: 0.5992 - acc: 0.6750 - val_loss: 0.5543 - val_acc: 0.7175
Epoch 2/1000
 - 2007s - loss: 0.5468 - acc: 0.7220 - val_loss: 0.5385 - val_acc: 0.7333
Epoch 3/1000
 - 2004s - loss: 0.5216 - acc: 0.7402 - val_loss: 0.5256 - val_acc: 0.7375
Epoch 4/1000
 - 1482s - loss: 0.5015 - acc: 0.7546 - val_loss: 0.5251 - val_acc: 0.7386
Epoch 5/1000
 - 1370s - loss: 0.4832 - acc: 0.7675 - val_loss: 0.5242 - val_acc: 0.7389
Epoch 6/1000
 - 2126s - loss: 0.4657 - a

### Save the model; score and save test and validation predictions

In [17]:
# Directory for saved models. The trailing separator is kept on purpose because
# a later cell builds a file name with `model_dir + '...'`.
model_dir = os.path.join(main_dir, 'Classification/model/gender/')
# Create the directory up front so a missing folder cannot lose the trained model.
os.makedirs(model_dir, exist_ok=True)
model.save(os.path.join(model_dir, 'cnn_model.h5'))

In [18]:
# Model outputs for the test vectors; a later cell writes them to disk.
test_pred_probs = model.predict(test_vec)
# Bare last expression: the evaluation result is displayed as the cell output.
# NOTE(review): presumably [loss, accuracy] - confirm against the metrics set in train_model.
model.evaluate(test_vec, test_target, batch_size=128)



[0.541759143299013, 0.73182285]

In [19]:
# Directory for saved prediction arrays. The trailing separator is kept on
# purpose because later cells build file names with `probs_path + '...'`.
probs_path = os.path.join(main_dir, 'Classification/data/probs/gender/')
# Create the directory up front so the np.save calls below cannot fail on a missing folder.
os.makedirs(probs_path, exist_ok=True)

In [20]:
# Save validation and test predictions of the full-speech CNN (np.save appends '.npy').
# os.path.join does not depend on probs_path ending with a separator.
# The explicit allow_pickle/fix_imports arguments were the defaults; fix_imports
# is dropped because newer NumPy releases flag it as deprecated.
np.save(os.path.join(probs_path, 'val_pred_probs_cnn'), val_pred_probs)
np.save(os.path.join(probs_path, 'test_pred_probs_cnn'), test_pred_probs)

## CNN - chunking the data

### Prepare the data

### Load preprocessed data

In [21]:
# Directory with the pickled data splits (re-set here for the chunking section).
# os.path.join keeps the path valid whether or not main_dir ends with a separator.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/gender')

In [22]:
def _load_split_pickle(name):
    """Unpickle the file called `name` located in `outdata_path`."""
    with open(os.path.join(outdata_path, name), 'rb') as handle:
        return pickle.load(handle)


# Each split has a '<split>_list' pickle and a matching '<split>_ids' pickle.
train, train_ids = _load_split_pickle('train_list'), _load_split_pickle('train_ids')
val, val_ids = _load_split_pickle('val_list'), _load_split_pickle('val_ids')
test, test_ids = _load_split_pickle('test_list'), _load_split_pickle('test_ids')

In [23]:
# Passed as `max_len` to preprocess_data.split_speech_to_chunks below.
chunk_len = 50

In [24]:
# Pick up any edits to helpers/preprocess_data.py, then chunk every split and time the work.
reload(preprocess_data)
start_time = time.time()
train_chunk, train_ids_chunk, train_target_chunk = preprocess_data.split_speech_to_chunks(
    train, train_ids, train_target, max_len=chunk_len
)
val_chunk, val_ids_chunk, val_target_chunk = preprocess_data.split_speech_to_chunks(
    val, val_ids, val_target, max_len=chunk_len
)
test_chunk, test_ids_chunk, test_target_chunk = preprocess_data.split_speech_to_chunks(
    test, test_ids, test_target, max_len=chunk_len
)
print("\nIt took {:.1f} seconds to chunk the data".format(time.time()-start_time))

Original data has 188148 speeches
It was split into 1557233 chunks
Checks on ids and target 1557233 1557233
Original target mean 0.5
New target mean 0.5144747125189358
Original data has 62716 speeches
It was split into 521540 chunks
Checks on ids and target 521540 521540
Original target mean 0.5
New target mean 0.5181059937876289
Original data has 62716 speeches
It was split into 522421 chunks
Checks on ids and target 522421 522421
Original target mean 0.5
New target mean 0.5162503038736957

It took 13.3 seconds to chunk the data


In [25]:
# Persist the per-chunk ids of the validation and test splits.
with open(os.path.join(outdata_path, 'val_ids_chunk'), 'wb') as fp:
    pickle.dump(val_ids_chunk, fp)
with open(os.path.join(outdata_path, 'test_ids_chunk'), 'wb') as fp:
    # Fixed: this file used to be written with val_ids_chunk, so the saved
    # test ids were actually the validation ids.
    pickle.dump(test_ids_chunk, fp)

In [26]:
# Load the chunk-level arrays. This rebinds train_vec / val_vec / test_vec,
# which held the full-speech arrays up to this point.
train_vec, val_vec, test_vec = (
    np.load(os.path.join(token_path, vec_file))
    for vec_file in ('train_vec_chunk.npy', 'val_vec_chunk.npy', 'test_vec_chunk.npy')
)

### Run the model

In [36]:
# Train on the chunked data with the same parameters as the full-speech run.
# `model` is rebound here to the chunk-level model.
history_chunk, model, train_pred_probs_chunk, val_pred_probs_chunk = train_model.train_model(
    ((train_vec, train_target_chunk), (val_vec, val_target_chunk)),
    logs_base_dir,
    **cnn_model_params
)

Found 400000 word vectors.
It took 20.6 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.1 seconds
Train on 1557233 samples, validate on 521540 samples
Epoch 1/1000
 - 1102s - loss: 0.6188 - acc: 0.6545 - val_loss: 0.6084 - val_acc: 0.6707
Epoch 2/1000
 - 1511s - loss: 0.5978 - acc: 0.6761 - val_loss: 0.6100 - val_acc: 0.6718
Epoch 3/1000
 - 1515s - loss: 0.5884 - acc: 0.6846 - val_loss: 0.6024 - val_acc: 0.6747
Epoch 4/1000
 - 1503s - loss: 0.5791 - acc: 0.6933 - val_loss: 0.6009 - val_acc: 0.6755
Epoch 5/1000
 - 1514s - loss: 0.5697 - acc: 0.7007 - val_loss: 0.6017 - val_acc: 0.6734
Epoch 6/1000
 - 1511s - loss: 0.5606 - acc: 0.7081 - val_loss: 0.6009 - val_acc: 0.6754
Validation accuracy: 0.6754150986671448, loss: 0.6009258436663877


### Score test

In [37]:
# Save the chunk-level model next to the full-speech one.
# os.path.join does not depend on model_dir ending with a separator.
model.save(os.path.join(model_dir, 'cnn_chunk_model.h5'))

In [38]:
# Chunk-level model outputs for the test vectors; later cells save and aggregate them.
test_pred_probs_chunk = model.predict(test_vec)
# Bare last expression: the evaluation result is displayed as the cell output.
# NOTE(review): presumably [loss, accuracy] - confirm against the metrics set in train_model.
model.evaluate(test_vec, test_target_chunk, batch_size=128)



[0.6053227365919072, 0.6713187]

In [39]:
# Save chunk-level validation and test predictions (np.save appends '.npy').
# os.path.join does not depend on probs_path ending with a separator.
# The explicit allow_pickle/fix_imports arguments were the defaults; fix_imports
# is dropped because newer NumPy releases flag it as deprecated.
np.save(os.path.join(probs_path, 'val_pred_probs_cnn_chunk'), val_pred_probs_chunk)
np.save(os.path.join(probs_path, 'test_pred_probs_cnn_chunk'), test_pred_probs_chunk)

### Aggregate validation sample

In [40]:
# Roll chunk-level validation predictions up to one row per id.
val_preds_chunk = val_pred_probs_chunk > 0.5
pred_df = pd.DataFrame({
    'ids': val_ids_chunk,
    'target': val_target_chunk,
    'probs': val_pred_probs_chunk.flatten(),
    'preds': val_preds_chunk.flatten(),
})
# String aggregation names instead of the builtins min/max: same result, but
# newer pandas versions warn when builtins are passed to agg.
f = {'probs': ['min', 'max', 'mean']}
pred_aggr_df = pred_df.groupby('ids').agg(f).reset_index()
# Flatten the two-level column index produced by the dict-of-lists aggregation.
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']
# Per-id mean of the target and of the thresholded chunk predictions, computed
# in one groupby pass instead of two separate groupby + merge steps.
pred_aggr_df = pred_aggr_df.merge(
    pred_df[['ids', 'target', 'preds']].groupby('ids').mean().reset_index(),
    on='ids',
)
# Two per-id decisions: threshold the mean probability, or take the majority
# of the chunk-level predictions. Vectorized instead of a row-wise apply.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype('int64')
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype('int64')
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000354,0.225319,0.572533,0.361126,0,0.083333,0,0
1,1000000454,0.353985,0.353985,0.353985,0,0.0,0,0
2,1000000541,0.105641,0.542725,0.293641,0,0.037037,0,0
3,1000000727,0.332692,0.719526,0.429663,0,0.142857,0,0
4,1000000738,0.208273,0.472101,0.326049,0,0.0,0,0


In [41]:
# Attach the aggregated chunk predictions to the original validation targets by id.
original_df = (
    pd.DataFrame({'target_orig': val_target, 'ids': val_ids})
    .merge(pred_aggr_df, on="ids")
)
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,1090179235,0.523424,0.705958,0.577375,1,1.0,1,1
1,1,1090044795,0.454025,0.615893,0.54214,1,0.666667,1,1
2,1,1020046081,0.525251,0.915206,0.773324,1,1.0,1,1
3,1,1090124860,0.494615,0.829627,0.627593,1,0.833333,1,1
4,1,1140096300,0.324812,0.694769,0.527163,1,0.6,1,1


In [42]:
# Sanity check: the original and the aggregated target columns should sum to the same value.
original_df['target_orig'].sum(), original_df['target'].sum()

(31358, 31358)

In [44]:
# Per-id validation outputs: the 0/1 decision from the mean probability, and the mean probability itself.
val_pred_aggr_chunk = original_df['preds_probs_mean']
val_pred_probs_aggr_chunk = original_df['probs_mean']

In [45]:
# Save the per-id mean validation probabilities (np.save appends '.npy').
# os.path.join does not depend on probs_path ending with a separator.
# The explicit allow_pickle/fix_imports arguments were the defaults; fix_imports
# is dropped because newer NumPy releases flag it as deprecated.
np.save(os.path.join(probs_path, 'val_pred_probs_cnn_aggr_chunk'), val_pred_probs_aggr_chunk)

### Aggregate test sample

In [46]:
# Roll chunk-level test predictions up to one row per id.
test_preds_chunk = test_pred_probs_chunk > 0.5
pred_df = pd.DataFrame({
    'ids': test_ids_chunk,
    'target': test_target_chunk,
    'probs': test_pred_probs_chunk.flatten(),
    'preds': test_preds_chunk.flatten(),
})
# String aggregation names instead of the builtins min/max: same result, but
# newer pandas versions warn when builtins are passed to agg.
f = {'probs': ['min', 'max', 'mean']}
pred_aggr_df = pred_df.groupby('ids').agg(f).reset_index()
# Flatten the two-level column index produced by the dict-of-lists aggregation.
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']
# Per-id mean of the target and of the thresholded chunk predictions, computed
# in one groupby pass instead of two separate groupby + merge steps.
pred_aggr_df = pred_aggr_df.merge(
    pred_df[['ids', 'target', 'preds']].groupby('ids').mean().reset_index(),
    on='ids',
)
# Two per-id decisions: threshold the mean probability, or take the majority
# of the chunk-level predictions. Vectorized instead of a row-wise apply.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype('int64')
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype('int64')
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000059,0.259346,0.746913,0.498017,1,0.555556,0,1
1,1000000168,0.436655,0.620963,0.53903,0,0.75,1,1
2,1000000353,0.175305,0.626415,0.392085,0,0.307692,0,0
3,1000000358,0.102935,0.503076,0.303005,0,0.5,0,0
4,1000000752,0.17337,0.17337,0.17337,0,0.0,0,0


In [47]:
# Attach the aggregated chunk predictions to the original test targets by id.
original_df = (
    pd.DataFrame({'target_orig': test_target, 'ids': test_ids})
    .merge(pred_aggr_df, on="ids")
)
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,1070082863,0.305813,0.413437,0.343526,1,0.0,0,0
1,1,1020062705,0.098636,0.710984,0.461758,1,0.392857,0,0
2,1,1060179537,0.421645,0.988484,0.665958,1,0.545455,1,1
3,1,1130064416,0.600023,0.941471,0.761403,1,1.0,1,1
4,1,1020001636,0.361922,0.630311,0.5276,1,0.666667,1,1


In [48]:
# Per-id test outputs: the 0/1 decision from the mean probability, and the mean probability itself.
test_pred_aggr_chunk = original_df['preds_probs_mean']
test_pred_probs_aggr_chunk = original_df['probs_mean']

In [49]:
# Save the per-id mean test probabilities (np.save appends '.npy').
# os.path.join does not depend on probs_path ending with a separator.
# The explicit allow_pickle/fix_imports arguments were the defaults; fix_imports
# is dropped because newer NumPy releases flag it as deprecated.
np.save(os.path.join(probs_path, 'test_pred_probs_cnn_aggr_chunk'), test_pred_probs_aggr_chunk)