In [1]:
# Increase the notebook cell width by overriding the `.container` CSS rule.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Inspiration for code:

- https://developers.google.com/machine-learning/guides/text-classification/
- https://github.com/google/eng-edu/blob/master/ml/guides/text_classification/load_data.py

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import time
import pickle
import gc
import re

import numpy as np
import pandas as pd

from importlib import reload

In [3]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


In [4]:
import tensorflow as tf
# Display the installed TensorFlow version as the cell output so the
# environment used for this run is recorded next to the results.
tf.__version__

'1.13.0-rc2'

## Load the data

In [5]:
# Project root directory. Set the W266_PROJECT_DIR environment variable to run
# the notebook on another machine; without it the original location is used.
# Joining with '' guarantees a trailing separator, which paths built from
# `main_dir` by plain string concatenation depend on.
main_dir = os.path.join(
    os.environ.get('W266_PROJECT_DIR', '/home/yulia/W266-Final-Project/'), ''
)

In [6]:
# Directory with the tokenized arrays and the word index for the age task.
token_path = os.path.join(main_dir, 'Classification/data/tokenized/age')

In [7]:
# Read the tokenized train / validation / test arrays from `token_path`,
# in that order.
train_vec, val_vec, test_vec = (
    np.load(os.path.join(token_path, npy_name))
    for npy_name in ('train_vec.npy', 'val_vec.npy', 'test_vec.npy')
)

In [8]:
# Load the pickled word index stored next to the tokenized arrays.
# The context manager closes the file handle as soon as the object is read
# (the previous one-liner left it open until garbage collection).
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(os.path.join(token_path, 'word_index.p'), 'rb') as fp:
    word_index = pickle.load(fp)

In [9]:
# Number of rows in the train, validation and test arrays.
tuple(len(split_vec) for split_vec in (train_vec, val_vec, test_vec))

(229048, 76348, 76352)

In [10]:
# Directory with the pickled train/validation/test splits for the age task.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/age')

In [11]:
# Unpickle the target labels of each split from `outdata_path`.
split_targets = []
for target_file in ('train_target', 'val_target', 'test_target'):
    with open(os.path.join(outdata_path, target_file), 'rb') as fp:
        split_targets.append(pickle.load(fp))
train_target, val_target, test_target = split_targets

In [12]:
# Sum of the labels per split (presumably 0/1 labels, so this is the number
# of positive examples -- the recorded output is half of each split size).
tuple(sum(labels) for labels in (train_target, val_target, test_target))

(114524, 38174, 38176)

## CNN model - Full Speech + Glove Embeddings

In [13]:
# Base directory handed to train_model.train_model for its logs.
logs_base_dir = os.path.join(main_dir, 'Classification/logs/age')

In [14]:
# Keyword arguments forwarded to train_model.train_model for both CNN runs
# (full speeches and chunks).
# NOTE(review): `epochs` is 1000, yet the recorded chunk run stops after 4
# epochs, so the helper presumably applies early stopping -- confirm.
cnn_model_params = dict(
    model_type='cnn',
    word_index=word_index,
    learning_rate=0.001,
    layers=2,
    epochs=1000,
    batch_size=128,
    filters=64,
    dropout_rate=0.2,
    embedding_dim=200,
    kernel_size=5,
    pool_size=1,
    max_num_words=20000,
    use_pretrained_embedding=True,
    is_embedding_trainable=True,
    glove_dir=os.path.join(main_dir, 'data'),
)

In [15]:
# Re-import the helper module so that edits made since the first import apply.
reload(train_model)

# Train on full speeches: ((train inputs, train labels), (val inputs, val labels)).
full_speech_data = ((train_vec, train_target), (val_vec, val_target))
history, model, train_pred_probs, val_pred_probs = train_model.train_model(
    full_speech_data,
    logs_base_dir,
    **cnn_model_params
)

Found 400000 word vectors.
It took 33.6 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.1 seconds
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 229048 samples, validate on 76348 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1000
 - 2617s - loss: 0.6650 - acc: 0.6026 - val_loss: 0.6506 - val_acc: 0.6218
Epoch 2/1000
 - 2593s - loss: 0.6459 - acc: 0.6246 - val_loss: 0.6395 - val_acc: 0.6325
Epoch 3/1000
 - 2346s - loss: 0.6312 - acc: 0.6416 - val_loss: 0.6458 - val_acc: 0.6221
Epoch 4/1000
 - 1508s - loss: 0.6151 - acc: 0.6594 - val_loss: 0.6361 - val_acc: 0.6360
Epoch 5/1000
 - 1506s - loss: 0.5966 - acc: 0.6779 - val_loss: 0.6390 - val_acc: 0.6314
Epoch 6/1000
 - 1505s - loss: 0.5758 - a

### Score/save test and validate

In [16]:
# Directory for saved models (keeps its trailing slash), then persist the
# full-speech CNN there.
model_dir = os.path.join(main_dir, 'Classification/model/age/')
model.save(os.path.join(model_dir, 'cnn_model.h5'))

In [17]:
# Model outputs for the test split; they are written to disk further down.
test_pred_probs = model.predict(x=test_vec)
# Cell output: the metrics returned by evaluate on the test split
# (the recorded run shows two values, presumably [loss, accuracy]).
model.evaluate(x=test_vec, y=test_target, batch_size=128)



[0.645956449316813, 0.6268598]

In [18]:
# Output directory for predicted probabilities (keeps its trailing slash).
probs_path = os.path.join(main_dir, 'Classification/data/probs/age/')

In [19]:
# Persist the validation and test outputs of the full-speech CNN.
# `fix_imports` is no longer passed: True is already its default and the
# keyword is deprecated in recent NumPy releases. np.save appends ".npy".
np.save(probs_path+'val_pred_probs_cnn', val_pred_probs, allow_pickle=True)
np.save(probs_path+'test_pred_probs_cnn', test_pred_probs, allow_pickle=True)

## CNN - chunk

### Load preprocessed data

In [20]:
# Directory with the pickled splits, set again at the start of the chunk section.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/age')

In [21]:
# Unpickle the speech lists and their ids for every split from `outdata_path`,
# in the order train, validation, test.
split_objects = []
for pickle_name in ('train_list', 'train_ids',
                    'val_list', 'val_ids',
                    'test_list', 'test_ids'):
    with open(os.path.join(outdata_path, pickle_name), 'rb') as fp:
        split_objects.append(pickle.load(fp))
train, train_ids, val, val_ids, test, test_ids = split_objects

In [22]:
# Value passed as `max_len` to preprocess_data.split_speech_to_chunks.
chunk_len = 50

In [23]:
# Re-import the helper module so that edits made since the first import apply.
reload(preprocess_data)
start_time = time.time()
# Split every speech of each split into chunks (max_len=chunk_len); the helper
# returns the chunks together with per-chunk ids and targets.
train_chunk, train_ids_chunk, train_target_chunk = preprocess_data.split_speech_to_chunks(
    train, train_ids, train_target, max_len=chunk_len)
val_chunk, val_ids_chunk, val_target_chunk = preprocess_data.split_speech_to_chunks(
    val, val_ids, val_target, max_len=chunk_len)
test_chunk, test_ids_chunk, test_target_chunk = preprocess_data.split_speech_to_chunks(
    test, test_ids, test_target, max_len=chunk_len)
# The message used to say "to create the dictionary", which is not what this
# cell times.
print("\nIt took {:.1f} seconds to split the speeches into chunks".format(time.time()-start_time))

Original data has 229048 speeches
It was split into 1852321 chunks
Checks on ids and target 1852321 1852321
Original target mean 0.5
New target mean 0.4941962003346072
Original data has 76348 speeches
It was split into 617897 chunks
Checks on ids and target 617897 617897
Original target mean 0.5
New target mean 0.49627850596458634
Original data has 76352 speeches
It was split into 610677 chunks
Checks on ids and target 610677 610677
Original target mean 0.5
New target mean 0.4935931760980027

It took 16.3 seconds to create the dictionary


In [24]:
# Persist the per-chunk speech ids of the validation and test splits.
with open(os.path.join(outdata_path, 'val_ids_chunk'), 'wb') as fp:
    pickle.dump(val_ids_chunk, fp)
with open(os.path.join(outdata_path, 'test_ids_chunk'), 'wb') as fp:
    # Bug fix: this file was previously written with `val_ids_chunk`, so the
    # saved test ids were a copy of the validation ids.
    pickle.dump(test_ids_chunk, fp)

In [25]:
# Read the tokenized chunk arrays. NOTE: this rebinds train_vec / val_vec /
# test_vec, which held the full-speech arrays up to this point.
train_vec, val_vec, test_vec = (
    np.load(os.path.join(token_path, npy_name))
    for npy_name in ('train_vec_chunk.npy', 'val_vec_chunk.npy', 'test_vec_chunk.npy')
)

### Run the model

In [26]:
# Train the same CNN configuration on chunks. NOTE: `model` is rebound here,
# replacing the full-speech model returned by the earlier training cell.
chunk_data = ((train_vec, train_target_chunk), (val_vec, val_target_chunk))
history_chunk, model, train_pred_probs_chunk, val_pred_probs_chunk = train_model.train_model(
    chunk_data,
    logs_base_dir,
    **cnn_model_params
)

Found 400000 word vectors.
It took 19.6 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.1 seconds
Train on 1852321 samples, validate on 617897 samples
Epoch 1/1000
 - 1026s - loss: 0.6774 - acc: 0.5674 - val_loss: 0.6752 - val_acc: 0.5762
Epoch 2/1000
 - 1021s - loss: 0.6660 - acc: 0.5906 - val_loss: 0.6732 - val_acc: 0.5785
Epoch 3/1000
 - 1022s - loss: 0.6595 - acc: 0.6015 - val_loss: 0.6748 - val_acc: 0.5752
Epoch 4/1000
 - 1023s - loss: 0.6532 - acc: 0.6107 - val_loss: 0.6735 - val_acc: 0.5774
Validation accuracy: 0.5774489641189575, loss: 0.6734820482685516


### Score test

In [27]:
# Persist the chunk-level CNN next to the full-speech model.
model.save(os.path.join(model_dir, 'cnn_chunk_model.h5'))

In [28]:
# Model outputs for every test chunk; aggregated per speech further down.
test_pred_probs_chunk = model.predict(x=test_vec)
# Cell output: the metrics returned by evaluate on the test chunks
# (the recorded run shows two values, presumably [loss, accuracy]).
model.evaluate(x=test_vec, y=test_target_chunk, batch_size=128)



[0.6727688433602681, 0.5788854]

In [29]:
# Persist the chunk-level validation and test outputs.
# `fix_imports` is no longer passed: True is already its default and the
# keyword is deprecated in recent NumPy releases. np.save appends ".npy".
np.save(probs_path+'val_pred_probs_cnn_chunk', val_pred_probs_chunk, allow_pickle=True)
np.save(probs_path+'test_pred_probs_cnn_chunk', test_pred_probs_chunk, allow_pickle=True)

### Aggregate validation sample

In [30]:
# Collapse chunk-level validation predictions to one row per speech id.
val_preds_chunk = val_pred_probs_chunk > 0.5
pred_df = pd.DataFrame({
    'ids': val_ids_chunk,
    'target': val_target_chunk,
    'probs': val_pred_probs_chunk.flatten(),
    'preds': val_preds_chunk.flatten(),
})

# min / max / mean of the chunk outputs per speech. String aliases are used
# instead of the builtin `min` / `max` callables, which newer pandas releases
# deprecate as aggregation arguments; the result is the same.
pred_aggr_df = pred_df.groupby('ids').agg({'probs': ['min', 'max', 'mean']}).reset_index()
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']

# Per-speech mean of the chunk targets and of the thresholded chunk predictions
# (the latter is the fraction of chunks predicted positive).
pred_aggr_df = pred_aggr_df.merge(pred_df[['target', 'ids']].groupby('ids').mean().reset_index(), on='ids')
pred_aggr_df = pred_aggr_df.merge(pred_df[['preds', 'ids']].groupby('ids').mean().reset_index(), on='ids')

# Two speech-level decisions: threshold the mean output, or take a majority
# vote over the chunks (a tie at exactly 0.5 maps to 0). Vectorized comparison
# replaces the row-wise apply.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype(int)
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype(int)
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000059,0.424837,0.678766,0.574726,0,0.888889,1,1
1,1000000124,0.412168,0.57666,0.49743,1,0.666667,0,1
2,1000000354,0.389553,0.547641,0.473952,1,0.25,0,0
3,1000000360,0.053266,0.663706,0.410312,0,0.166667,0,0
4,1000000452,0.432352,0.537042,0.484697,0,0.5,0,0


In [31]:
# Attach the per-speech aggregates to the original validation ids and targets.
# The default (inner) merge on "ids" keeps only ids present in both frames.
original_df = pd.DataFrame(
    {'target_orig': val_target, 'ids': val_ids}
).merge(pred_aggr_df, on="ids")
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,1060162543,0.585735,0.632199,0.608967,1,1.0,1,1
1,1,980236360,0.43722,0.902704,0.576761,1,0.73913,1,1
2,1,990257551,0.288472,0.488631,0.421584,1,0.0,0,0
3,1,1010233637,0.384151,0.646413,0.544882,1,0.785714,1,1
4,1,980084907,0.332441,0.54651,0.435048,1,0.095238,0,0


In [32]:
# Speech-level validation outputs derived from the chunk model:
# the 0/1 decision from the mean output, and the mean output itself.
val_pred_aggr_chunk = original_df['preds_probs_mean']
val_pred_probs_aggr_chunk = original_df['probs_mean']

In [33]:
# Persist the speech-level (aggregated) validation outputs.
# `fix_imports` is no longer passed: True is already its default and the
# keyword is deprecated in recent NumPy releases. np.save appends ".npy".
np.save(probs_path+'val_pred_probs_cnn_aggr_chunk', val_pred_probs_aggr_chunk, allow_pickle=True)

### Aggregate test sample

In [34]:
# Collapse chunk-level test predictions to one row per speech id.
test_preds_chunk = test_pred_probs_chunk > 0.5
pred_df = pd.DataFrame({
    'ids': test_ids_chunk,
    'target': test_target_chunk,
    'probs': test_pred_probs_chunk.flatten(),
    'preds': test_preds_chunk.flatten(),
})

# min / max / mean of the chunk outputs per speech. String aliases are used
# instead of the builtin `min` / `max` callables, which newer pandas releases
# deprecate as aggregation arguments; the result is the same.
pred_aggr_df = pred_df.groupby('ids').agg({'probs': ['min', 'max', 'mean']}).reset_index()
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']

# Per-speech mean of the chunk targets and of the thresholded chunk predictions
# (the latter is the fraction of chunks predicted positive).
pred_aggr_df = pred_aggr_df.merge(pred_df[['target', 'ids']].groupby('ids').mean().reset_index(), on='ids')
pred_aggr_df = pred_aggr_df.merge(pred_df[['preds', 'ids']].groupby('ids').mean().reset_index(), on='ids')

# Two speech-level decisions: threshold the mean output, or take a majority
# vote over the chunks (a tie at exactly 0.5 maps to 0). Vectorized comparison
# replaces the row-wise apply.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype(int)
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype(int)
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000050,0.419159,0.601839,0.519711,1,0.6,1,1
1,1000000123,0.401327,0.57191,0.465891,1,0.2,0,0
2,1000000319,0.16781,0.603652,0.386948,0,0.111111,0,0
3,1000000371,0.206099,0.381643,0.315433,0,0.0,0,0
4,1000000377,0.315352,0.412466,0.353144,0,0.0,0,0


In [35]:
# Attach the per-speech aggregates to the original test ids and targets.
# The default (inner) merge on "ids" keeps only ids present in both frames.
original_df = pd.DataFrame(
    {'target_orig': test_target, 'ids': test_ids}
).merge(pred_aggr_df, on="ids")
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,1050182087,0.415216,0.591632,0.510988,1,0.5,1,0
1,1,1020096707,0.365078,0.645605,0.553838,1,0.727273,1,1
2,1,1120051071,0.562652,0.576362,0.569507,1,1.0,1,1
3,1,970125650,0.478606,0.556128,0.517367,1,0.5,1,0
4,1,980064867,0.459898,0.644666,0.578491,1,0.8,1,1


In [36]:
# Speech-level test outputs derived from the chunk model:
# the 0/1 decision from the mean output, and the mean output itself.
test_pred_aggr_chunk = original_df['preds_probs_mean']
test_pred_probs_aggr_chunk = original_df['probs_mean']

In [37]:
# Persist the speech-level (aggregated) test outputs.
# `fix_imports` is no longer passed: True is already its default and the
# keyword is deprecated in recent NumPy releases. np.save appends ".npy".
np.save(probs_path+'test_pred_probs_cnn_aggr_chunk', test_pred_probs_aggr_chunk, allow_pickle=True)