### Set up environment, access to Google Drive, import libraries


In [32]:
# Mount Google Drive at /content/drive (Colab only). force_remount=True asks
# Colab to remount even if the drive is already mounted from an earlier run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Colab magic selecting the TensorFlow 2.x release line; it has to run before
# tensorflow is imported further down.
%tensorflow_version 2.x

In [34]:
# Need for last version of keras for compatibility with gensim
pip install keras --upgrade

Requirement already up-to-date: keras in /usr/local/lib/python3.6/dist-packages (2.3.1)


In [0]:
# Make the project folder on Drive importable, then pull in the project modules.
import os, sys
sys.path.insert(0, os.path.abspath('/content/drive/My Drive/ML_Project2_Final/'))
# NOTE(review): star imports hide where names come from. Presumably these two
# modules provide build_model_emb, get_raw_data and create_csv_submission used
# later in the notebook — confirm and import those names explicitly.
from models_CNN import *
from helpers import *

# Root folder for data, saved models and outputs. It is assigned after the
# star imports, so a same-named variable in those modules cannot overwrite it.
path_g = '/content/drive/My Drive/ML_Project2_Final/'

In [0]:
# Fix the random seeds of numpy and tensorflow for reproducibility
import numpy as np
import tensorflow as tf

seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

import time
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import gensim
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, Callback
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import keras
import pandas as pd

### Load the data (processed or raw) and shuffle

In [37]:
# Which training set to load: 'f' for the full dataset, 'nf' otherwise
full='f'
# True -> read the pre-processed .npy files, False -> read the raw data
processed=False

if not processed:
    # Raw data comes from the project helper
    text_data, labels, text_data_test = get_raw_data(path_g, full)
    dataset_type = 'raw'

else:
    train_file = path_g + 'Processed_Data/data_train_pr_' + full + '_sl5' + '.npy'
    test_file = path_g + 'Processed_Data/data_test_pr_sl5' + '.npy'
    labels_file = path_g + 'Processed_Data/labels_train_'+ full +'_sl5.npy'

    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files
    text_data = np.load(train_file, allow_pickle=True)
    text_data_test = np.load(test_file, allow_pickle=True)
    labels = np.load(labels_file)
    dataset_type = 'processed'


(1142838, 2)
(1127644, 2)


In [0]:
# Shuffle samples and labels with one shared random permutation
perm = np.random.permutation(text_data.shape[0])
text_data, labels = text_data[perm], labels[perm]

# Optional cap on the number of training samples; a non-positive value keeps everything
n_train = -1

if n_train > 0:
    text_data, labels = text_data[:n_train], labels[:n_train]


### Train or load gensim Word2Vec models

In [39]:
# Switches: retrain the word vectors from scratch, and save them once trained
train_w2v = False
save_w2v = True

# gensim Word2Vec settings
size_w2v = 400     # size of the embedding
iter_w2v = 5       # number of iterations
window_w2v = 5     # window size
min_count = 6      # min count

# Where the word vectors are stored (written after training, read otherwise)
path_w2v = path_g + 'w2v_models/'
name_w2v = 'w2v_model_best'

if not train_w2v:
    # Reuse the previously saved vectors
    word_vector = gensim.models.KeyedVectors.load(path_w2v + name_w2v)

else:
    # Train on the train and test text together
    text_data_tot = np.concatenate((text_data, text_data_test), axis=0)

    t1 = time.time()

    # sg=1 selects the skip-gram variant
    model_gs = gensim.models.Word2Vec(
        text_data_tot,
        size=size_w2v,
        window=window_w2v,
        sg=1,
        min_count=min_count,
        iter=iter_w2v,
    )
    # Keep only the word vectors from the trained model
    word_vector = model_gs.wv
    print("Total time to train gensim", time.time() - t1, "s", flush=True)

    if save_w2v:
        word_vector.save(path_w2v + name_w2v)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Insert our Word2Vec model into Keras

In [0]:
# Wrap the gensim word vectors in a Keras embedding layer.
# train_emb decides whether the embedding keeps training along with the network.
train_emb = True
k_emb = word_vector.get_keras_embedding(train_embeddings=train_emb)
# Output dimension of the embedding layer, reused when building the model
size_emb = k_emb.output_dim

In [0]:
# Turn the text into integer sequences using the gensim vocabulary indices, so
# the integers match the rows of the Keras embedding built from the same vectors.
vocabulary = {token: entry.index for token, entry in word_vector.vocab.items()}
tk = Tokenizer(num_words=len(vocabulary))
tk.word_index = vocabulary

# NOTE(review): pad_sequences pads with 0 by default, and 0 may also be a real
# word index in this vocabulary — confirm this is intended.
# Training sequences are padded at the end.
num_data = np.asarray(
    pad_sequences(tk.texts_to_sequences(text_data), padding='post')
)
# Test sequences are forced to the training sequence length.
num_data_test = np.asarray(
    pad_sequences(
        tk.texts_to_sequences(text_data_test),
        maxlen=num_data.shape[1],
        padding='post',
    )
)

### Define our convolutional network, with parameters

In [42]:
# Network and training hyper-parameters
filters = 300
kernel_size = 5
batch_size = 150
epochs = 2
hidden_dims = 250
learning_rate = 0.001
dropout = 0.2

# Build the network on top of the Keras embedding; num_data.shape[1] is the
# padded sequence length
model = build_model_emb(
    k_emb,
    filters,
    kernel_size,
    hidden_dims,
    num_data.shape[1],
    size_emb,
    learning_rate,
    dropout=dropout,
)
model.summary()

# The whole (shuffled) training set is used for fitting
x_train = num_data
y_train = labels

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 400)         28794800  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 300)         600300    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               75250     
_________________________________________________________________
dropout_3 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_3 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                

In [43]:
# Train the model
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fd94fd6d908>

### Generate a submission file

In [0]:
# Class predictions for the test set, flattened to one dimension
y_pred = model.predict_classes(num_data_test, batch_size=batch_size).flatten()

# Submission format: class 0 becomes -1, every other value is left as is
y_pred = np.where(y_pred == 0, -1, y_pred)

In [45]:
# Write the predictions to a CSV file in the project folder
path_csv = path_g
csv_name ='sub_best'
csv_file = csv_name + '.csv'

create_csv_submission(y_pred, path_csv + csv_file)
print("Output name:", csv_file)

Output name: sub_best.csv
