In [1]:
# Widen the notebook container so wide outputs are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated in newer IPython.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

Inspiration for code:

- https://developers.google.com/machine-learning/guides/text-classification/
- https://github.com/google/eng-edu/blob/master/ml/guides/text_classification/load_data.py

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import time
import pickle
import gc
import re

import numpy as np
import pandas as pd

from importlib import reload

In [3]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


In [4]:
import tensorflow as tf
# Display the installed TensorFlow version next to the results for provenance.
tf.__version__

'1.13.0-rc2'

## Load the data

In [5]:
# Root directory of the project. Other cells build paths by concatenating
# directly onto `main_dir`, so it must end with a path separator.
# Set the W266_PROJECT_DIR environment variable to run the notebook on another
# machine; the original location stays the default.
# os.path.join(..., '') appends the trailing separator only when it is missing.
main_dir = os.path.join(
    os.environ.get('W266_PROJECT_DIR', '/home/yulia/W266-Final-Project/'), '')

In [6]:
# Directory holding the tokenized inputs for the party task (the *_vec.npy
# arrays and word_index.p are read from here).
# os.path.join keeps the path valid even if main_dir lacks a trailing separator.
token_path = os.path.join(main_dir, 'Classification/data/tokenized/party')

In [40]:
# Load the full-speech token arrays for the train / validation / test splits.
full_speech_files = ('train_vec.npy', 'val_vec.npy', 'test_vec.npy')
train_vec, val_vec, test_vec = [
    np.load(os.path.join(token_path, vec_file)) for vec_file in full_speech_files
]

In [8]:
# Vocabulary pickle stored next to the tokenized arrays (presumably a
# token -> index mapping; confirm against the tokenizer that produced it).
# It is handed to the model through cnn_model_params['word_index'].
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
# The context manager closes the file handle that a bare open() would leak.
with open(os.path.join(token_path, 'word_index.p'), 'rb') as fp:
    word_index = pickle.load(fp)

In [9]:
# Number of examples in the train / validation / test splits.
tuple(len(split_vec) for split_vec in (train_vec, val_vec, test_vec))

(303459, 101153, 101154)

In [10]:
# Directory holding the pickled train / validation / test splits for the party task.
# os.path.join keeps the path valid even if main_dir lacks a trailing separator.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/party')

In [11]:
# Read the pickled label lists for the three splits, in train / val / test order.
loaded_targets = []
for target_file in ('train_target', 'val_target', 'test_target'):
    with open(os.path.join(outdata_path, target_file), 'rb') as target_fp:
        loaded_targets.append(pickle.load(target_fp))
train_target, val_target, test_target = loaded_targets

In [12]:
# Sum of the labels in each split (the number of positives if labels are 0/1).
tuple(sum(split_target) for split_target in (train_target, val_target, test_target))

(161472, 53824, 53825)

## CNN model - Full Speech + GloVe Embeddings

In [13]:
# Base directory passed to train_model.train_model for its logs.
# os.path.join keeps the path valid even if main_dir lacks a trailing separator.
logs_base_dir = os.path.join(main_dir, 'Classification/logs/party')

In [14]:
# Keyword arguments unpacked into train_model.train_model for both CNN runs
# (full speech and chunked). Their exact meaning is defined by that helper.
cnn_model_params = dict(
    model_type='cnn',
    word_index=word_index,
    learning_rate=0.001,
    layers=2,
    # Upper bound only: the recorded runs stop after 6-8 epochs (presumably
    # early stopping inside train_model -- confirm in the helper).
    epochs=1000,
    batch_size=128,
    filters=64,
    dropout_rate=0.2,
    embedding_dim=200,
    kernel_size=5,
    pool_size=1,
    max_num_words=20000,
    use_pretrained_embedding=True,
    is_embedding_trainable=True,
    glove_dir=main_dir + 'data',
)

In [43]:
# Pick up any edits made to helpers/train_model.py since it was imported.
reload(train_model)
# Train the CNN on whole speeches: ((train x, train y), (validation x, validation y)).
full_speech_data = ((train_vec, train_target), (val_vec, val_target))
history, model, train_pred_probs, val_pred_probs = train_model.train_model(
    full_speech_data, logs_base_dir, **cnn_model_params)

Found 400000 word vectors.
It took 33.6 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.1 seconds
Train on 303459 samples, validate on 101153 samples
Epoch 1/1000
 - 3349s - loss: 0.6695 - acc: 0.5700 - val_loss: 0.6381 - val_acc: 0.6203
Epoch 2/1000
 - 3327s - loss: 0.6196 - acc: 0.6382 - val_loss: 0.6094 - val_acc: 0.6487
Epoch 3/1000
 - 2733s - loss: 0.5915 - acc: 0.6662 - val_loss: 0.6038 - val_acc: 0.6523
Epoch 4/1000
 - 2098s - loss: 0.5710 - acc: 0.6852 - val_loss: 0.5980 - val_acc: 0.6547
Epoch 5/1000
 - 2027s - loss: 0.5518 - acc: 0.7016 - val_loss: 0.5972 - val_acc: 0.6543
Epoch 6/1000
 - 2031s - loss: 0.5340 - acc: 0.7167 - val_loss: 0.5971 - val_acc: 0.6565
Epoch 7/1000
 - 2035s - loss: 0.5145 - acc: 0.7304 - val_loss: 0.6017 - val_acc: 0.6557
Epoch 8/1000
 - 2038s - loss: 0.4936 - acc: 0.7462 - val_loss: 0.6057 - val_acc: 0.6558
Validation accuracy: 0.655798614025116, loss: 0.6057328982666599


### Save the model, score the test set, and save validation/test probabilities

In [44]:
# Directory for the trained Keras models. The trailing separator is required
# because a later cell builds a file name with `model_dir + ...`.
model_dir = main_dir + 'Classification/model/party/'
# Create the directory up front so a training run that took hours is not lost
# to a missing output folder at save time.
os.makedirs(model_dir, exist_ok=True)
# Persist the full-speech CNN; `model` is rebound by the chunk run later on.
model.save(os.path.join(model_dir, 'cnn_model.h5'))

In [45]:
# Model outputs for the test vectors; they are written to disk further down.
test_pred_probs = model.predict(x=test_vec)
# Evaluate on the test split; the returned metrics are shown as the cell output
# (the recorded values look like [loss, accuracy]).
model.evaluate(x=test_vec, y=test_target, batch_size=128)



[0.6049941708015505, 0.6550013]

In [46]:
# Output directory for saved prediction probabilities. Keep the trailing
# separator: file names are appended to this string directly.
probs_path = main_dir + 'Classification/data/probs/party/'

In [47]:
# Save the full-speech CNN outputs for the validation and test splits
# (np.save appends the .npy extension).
for out_name, out_probs in (('val_pred_probs_cnn', val_pred_probs),
                            ('test_pred_probs_cnn', test_pred_probs)):
    np.save(probs_path + out_name, out_probs, allow_pickle=True, fix_imports=True)

## CNN - chunk

### Load preprocessed data

In [48]:
# Directory with the pickled splits (same location as defined earlier in the
# notebook; repeated so this section can be run on its own).
# os.path.join keeps the path valid even if main_dir lacks a trailing separator.
outdata_path = os.path.join(main_dir, 'Classification/data/splits/party')

In [49]:
# Read the pickled speech lists and their ids for every split.
split_data = {}
for split_file in ('train_list', 'train_ids', 'val_list', 'val_ids', 'test_list', 'test_ids'):
    with open(os.path.join(outdata_path, split_file), 'rb') as split_fp:
        split_data[split_file] = pickle.load(split_fp)

train, train_ids = split_data['train_list'], split_data['train_ids']
val, val_ids = split_data['val_list'], split_data['val_ids']
test, test_ids = split_data['test_list'], split_data['test_ids']

In [50]:
# Passed as `max_len` to preprocess_data.split_speech_to_chunks.
chunk_len = 50

In [51]:
# Pick up any edits made to helpers/preprocess_data.py since it was imported.
reload(preprocess_data)
start_time = time.time()
# Split each speech into chunks (max_len=chunk_len); the helper also returns
# the ids and targets repeated per chunk so they stay aligned with the chunks.
train_chunk, train_ids_chunk, train_target_chunk = preprocess_data.split_speech_to_chunks(
    train, train_ids, train_target, max_len=chunk_len)
val_chunk, val_ids_chunk, val_target_chunk = preprocess_data.split_speech_to_chunks(
    val, val_ids, val_target, max_len=chunk_len)
test_chunk, test_ids_chunk, test_target_chunk = preprocess_data.split_speech_to_chunks(
    test, test_ids, test_target, max_len=chunk_len)
# The message used to say "to create the dictionary", which is not what this cell does.
print("\nIt took {:.1f} seconds to split the speeches into chunks".format(time.time()-start_time))

Original data has 303459 speeches
It was split into 2443603 chunks
Checks on ids and target 2443603 2443603
Original target mean 0.5321048312951667
New target mean 0.5447693426469029
Original data has 101153 speeches
It was split into 819712 chunks
Checks on ids and target 819712 819712
Original target mean 0.5321048312951667
New target mean 0.5479839260618363
Original data has 101154 speeches
It was split into 819495 chunks
Checks on ids and target 819495 819495
Original target mean 0.5321094568677462
New target mean 0.5499813909785904

It took 24.5 seconds to create the dictionary


In [52]:
# Persist the per-chunk speech ids so chunk predictions can be mapped back to
# their speeches outside this notebook.
with open(os.path.join(outdata_path, 'val_ids_chunk'), 'wb') as fp:
    pickle.dump(val_ids_chunk, fp)
# Fix: this file previously received val_ids_chunk, so the test ids were never
# saved and 'test_ids_chunk' silently contained the validation ids.
with open(os.path.join(outdata_path, 'test_ids_chunk'), 'wb') as fp:
    pickle.dump(test_ids_chunk, fp)

In [53]:
# Load the chunk-level token arrays.
# NOTE: this rebinds train_vec / val_vec / test_vec, so the full-speech arrays
# loaded earlier are no longer reachable under these names; re-run the earlier
# load cell before repeating the full-speech experiment.
chunk_files = ('train_vec_chunk.npy', 'val_vec_chunk.npy', 'test_vec_chunk.npy')
train_vec, val_vec, test_vec = [
    np.load(os.path.join(token_path, chunk_file)) for chunk_file in chunk_files
]

### Run the model

In [54]:
# Train the same CNN configuration on chunks. train_vec / val_vec now hold the
# chunk arrays, and `model` is rebound, replacing the full-speech model object
# (which was already saved to disk).
chunk_data = ((train_vec, train_target_chunk), (val_vec, val_target_chunk))
history_chunk, model, train_pred_probs_chunk, val_pred_probs_chunk = train_model.train_model(
    chunk_data, logs_base_dir, **cnn_model_params)

Found 400000 word vectors.
It took 21.7 seconds
Preparing embedding matrix.
Embedding matrix has been built.
Its shape is (20000, 200).
It took 0.2 seconds
Train on 2443603 samples, validate on 819712 samples
Epoch 1/1000
 - 1360s - loss: 0.6576 - acc: 0.5968 - val_loss: 0.6513 - val_acc: 0.6105
Epoch 2/1000
 - 1417s - loss: 0.6407 - acc: 0.6207 - val_loss: 0.6468 - val_acc: 0.6101
Epoch 3/1000
 - 1594s - loss: 0.6328 - acc: 0.6302 - val_loss: 0.6437 - val_acc: 0.6160
Epoch 4/1000
 - 1367s - loss: 0.6257 - acc: 0.6377 - val_loss: 0.6421 - val_acc: 0.6174
Epoch 5/1000
 - 1369s - loss: 0.6189 - acc: 0.6450 - val_loss: 0.6424 - val_acc: 0.6164
Epoch 6/1000
 - 1362s - loss: 0.6124 - acc: 0.6514 - val_loss: 0.6422 - val_acc: 0.6180
Validation accuracy: 0.6179901957511902, loss: 0.6422306312426115


### Score test

In [55]:
# Persist the chunk-level CNN next to the full-speech model.
model.save(os.path.join(model_dir, 'cnn_chunk_model.h5'))

In [56]:
# Chunk-level model outputs for the test chunks; aggregated per speech below.
test_pred_probs_chunk = model.predict(x=test_vec)
# Chunk-level evaluation on the test split; the returned metrics are shown as
# the cell output (the recorded values look like [loss, accuracy]).
model.evaluate(x=test_vec, y=test_target_chunk, batch_size=128)



[0.6409181041485772, 0.6196792]

In [57]:
# Save the chunk-level CNN outputs for the validation and test splits
# (np.save appends the .npy extension).
for out_name, out_probs in (('val_pred_probs_cnn_chunk', val_pred_probs_chunk),
                            ('test_pred_probs_cnn_chunk', test_pred_probs_chunk)):
    np.save(probs_path + out_name, out_probs, allow_pickle=True, fix_imports=True)

### Aggregate validation sample

In [58]:
# Chunk-level hard predictions at a 0.5 threshold.
val_preds_chunk = val_pred_probs_chunk > 0.5
# One row per validation chunk; `ids` presumably repeats the id of the speech
# each chunk was cut from (several chunks share one id in the recorded output).
pred_df = pd.DataFrame({
    'ids': val_ids_chunk,
    'target': val_target_chunk,
    'probs': val_pred_probs_chunk.flatten(),
    'preds': val_preds_chunk.flatten(),
})
# Per-speech min / max / mean of the chunk probabilities. Aggregations are
# named by string: passing the builtins min/max is deprecated in recent pandas.
pred_aggr_df = pred_df.groupby('ids').agg({'probs': ['min', 'max', 'mean']}).reset_index()
# Flatten the two-level column index produced by the list of aggregations.
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']
# Attach the per-speech mean target, then the share of chunks predicted positive.
pred_aggr_df = pred_aggr_df.merge(
    pred_df[['target', 'ids']].groupby('ids').mean().reset_index(), on='ids')
pred_aggr_df = pred_aggr_df.merge(
    pred_df[['preds', 'ids']].groupby('ids').mean().reset_index(), on='ids')
# Two speech-level decisions: threshold the mean probability, or take a
# majority vote over the chunk predictions. The comparison is vectorized
# instead of calling a Python lambda per row; the resulting 0/1 values are the same.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype(int)
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype(int)
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000136,0.460036,0.667681,0.559792,0,0.666667,1,1
1,1000000164,0.512244,0.512244,0.512244,1,1.0,1,1
2,1000000280,0.415746,0.525855,0.4708,0,0.5,0,0
3,1000000329,0.337819,0.956316,0.610796,1,0.75,1,1
4,1000000342,0.414675,0.850297,0.595731,1,0.875,1,1


In [59]:
# Re-align the aggregated chunk predictions with the original validation
# speeches by joining on the speech id.
# NOTE(review): this is an inner join, so a speech whose id is missing from
# pred_aggr_df would be dropped -- confirm len(original_df) == len(val_ids).
val_speeches = pd.DataFrame({'target_orig': val_target, 'ids': val_ids})
original_df = val_speeches.merge(pred_aggr_df, on="ids")
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,1020203893,0.384761,0.54071,0.462735,1,0.5,0,0
1,1,1030062300,0.462883,0.729484,0.608891,1,0.666667,1,1
2,1,1100205980,0.436971,0.538937,0.498121,1,0.6,0,1
3,1,990019658,0.522886,0.805554,0.682641,1,1.0,1,1
4,1,1130043981,0.482554,0.694468,0.558201,1,0.6,1,1


In [60]:
# Speech-level outputs of the chunk model, in the row order of original_df:
# hard predictions from the thresholded mean probability, and the mean probability.
val_pred_aggr_chunk = original_df['preds_probs_mean']
val_pred_probs_aggr_chunk = original_df['probs_mean']

In [61]:
# Save the per-speech mean probabilities for the validation split
# (np.save appends the .npy extension).
np.save(
    probs_path + 'val_pred_probs_cnn_aggr_chunk',
    val_pred_probs_aggr_chunk,
    allow_pickle=True,
    fix_imports=True,
)

### Aggregate test sample

In [62]:
# Chunk-level hard predictions at a 0.5 threshold.
test_preds_chunk = test_pred_probs_chunk > 0.5
# One row per test chunk; `ids` presumably repeats the id of the speech each
# chunk was cut from (several chunks share one id in the recorded output).
pred_df = pd.DataFrame({
    'ids': test_ids_chunk,
    'target': test_target_chunk,
    'probs': test_pred_probs_chunk.flatten(),
    'preds': test_preds_chunk.flatten(),
})
# Per-speech min / max / mean of the chunk probabilities. Aggregations are
# named by string: passing the builtins min/max is deprecated in recent pandas.
pred_aggr_df = pred_df.groupby('ids').agg({'probs': ['min', 'max', 'mean']}).reset_index()
# Flatten the two-level column index produced by the list of aggregations.
pred_aggr_df.columns = ['ids', 'probs_min', 'probs_max', 'probs_mean']
# Attach the per-speech mean target, then the share of chunks predicted positive.
pred_aggr_df = pred_aggr_df.merge(
    pred_df[['target', 'ids']].groupby('ids').mean().reset_index(), on='ids')
pred_aggr_df = pred_aggr_df.merge(
    pred_df[['preds', 'ids']].groupby('ids').mean().reset_index(), on='ids')
# Two speech-level decisions: threshold the mean probability, or take a
# majority vote over the chunk predictions. The comparison is vectorized
# instead of calling a Python lambda per row; the resulting 0/1 values are the same.
pred_aggr_df['preds_probs_mean'] = (pred_aggr_df['probs_mean'] > 0.5).astype(int)
pred_aggr_df['preds_mean'] = (pred_aggr_df['preds'] > 0.5).astype(int)
pred_aggr_df.head()

Unnamed: 0,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1000000048,0.206834,0.510755,0.395217,0,0.142857,0,0
1,1000000058,0.458263,0.458263,0.458263,0,0.0,0,0
2,1000000124,0.605943,0.814933,0.714888,1,1.0,1,1
3,1000000141,0.455183,0.753634,0.601842,1,0.894737,1,1
4,1000000187,0.43537,0.43537,0.43537,0,0.0,0,0


In [63]:
# Re-align the aggregated chunk predictions with the original test speeches by
# joining on the speech id.
# NOTE(review): this is an inner join, so a speech whose id is missing from
# pred_aggr_df would be dropped -- confirm len(original_df) == len(test_ids).
test_speeches = pd.DataFrame({'target_orig': test_target, 'ids': test_ids})
original_df = test_speeches.merge(pred_aggr_df, on="ids")
original_df.head()

Unnamed: 0,target_orig,ids,probs_min,probs_max,probs_mean,target,preds,preds_probs_mean,preds_mean
0,1,970231787,0.553563,0.629133,0.581662,1,1.0,1,1
1,1,980252662,0.485569,0.683693,0.569601,1,0.833333,1,1
2,1,1060172004,0.535957,0.81705,0.694592,1,1.0,1,1
3,1,1110115432,0.446443,0.728246,0.610706,1,0.666667,1,1
4,1,1030177793,0.433679,0.471363,0.452521,1,0.0,0,0


In [64]:
# Speech-level outputs of the chunk model, in the row order of original_df:
# hard predictions from the thresholded mean probability, and the mean probability.
test_pred_aggr_chunk = original_df['preds_probs_mean']
test_pred_probs_aggr_chunk = original_df['probs_mean']

In [65]:
# Save the per-speech mean probabilities for the test split
# (np.save appends the .npy extension).
np.save(
    probs_path + 'test_pred_probs_cnn_aggr_chunk',
    test_pred_probs_aggr_chunk,
    allow_pickle=True,
    fix_imports=True,
)