# Text Classification - Small Data

Set GPU environment variables

In [None]:
import numpy as np
import os
import GPUtil

Availability=GPUtil.getAvailability(GPUtil.getGPUs())
all_gpus = np.arange(3)
available_gpu_indexes = [x for x in all_gpus if Availability[x]]
NUMBER_OF_GPUS_TO_USE = len(available_gpu_indexes)
# Set CUDA_DEVICE_ORDER so the IDs assigned by CUDA match those from nvidia-smi
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the first NUMBER_OF_GPUS_TO_USE available device id
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(np.array(available_gpu_indexes[:NUMBER_OF_GPUS_TO_USE]).astype(str))

Set `keras` session

In [None]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))

`imports`

In [2]:
import warnings; warnings.filterwarnings('ignore') 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn import metrics
import numpy as np
import pandas as pd
import bz2, glob, os, xgboost, pickle

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import decomposition, ensemble
from collections import OrderedDict
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

Define the `score` to evaluate each classifier.

In [3]:
def score_classifier(y_true, y_pred):
    return metrics.f1_score(
        y_true, y_pred,
        average='macro', #Calculate metrics for each label, and find their unweighted mean.
        #This does not take label imbalance into account.
    )

Load data and create dataframes.

In [4]:
train_file = bz2.BZ2File('training-data-small.txt.bz2')

In [5]:
test_file = bz2.BZ2File('test-data-small.txt.bz2')

In [6]:
labels, texts = [],[]
for i, line in enumerate(train_file.readlines(-1)):
    content = line.decode("utf-8").split("\t")
    labels.append(content[0])
    texts.append(content[1].split("\n")[0])
    
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [7]:
texts = []
for i, line in enumerate(test_file.readlines(-1)):
    content = line.decode("utf-8").split("\t")
    texts.append(content[0].split("\n")[0])
    
testDF = pd.DataFrame()
testDF['text'] = texts
testDF['label'] = ""

split the dataset into training and validation datasets 

In [8]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.1)

In [9]:
models = {}

In [10]:
trainDF.head()

Unnamed: 0,text,label
0,"X773579,Y2640,Y2072,Z4,Z15",0
1,"X166074297,X123474229,X147204623,X51578397,X23...",0
2,"X374616379,X773579,X344420902,Y1940,Y1705,Z4,Z...",1
3,"X103413307,X37875376,X62716332,X277692318,X344...",0
4,"X123474229,X551805107,X62716661,Y2307,Y2,Y1222...",0


### Methodology

I see that the text I need to process is a series of encoded words. I dont have access to the original words, therefore I cannot use words embeddings that are available online. The order of these encoded words may or maynot have a meaning. If there is a meaning I trust `CNN` or `RNN` based NN variants would be able to figure it out. 

Assuming the order of the encoded words does not have a meaning, I first try bag of words model. In this model, I segment each text into words (from the value counts it is safe to assume they are words), and count the number of times each word occurs in each text and assign each word an integer id. Each unique word will correspond to a feature.

To find a classifier, I first obtain the classifier for various classification methods, optimize the parameters for it, and finally create a vote ensemble. 

I also tried various NN methods after that to see if there is any improvement in the performance. If I have enough time I want to create an ensemble of all.

In [21]:
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(train_x, train_y)

# Performance of NB Classifier
predicted = text_clf.predict(valid_x)
metrics.accuracy_score(predicted, valid_y)

# Grid Search
# Creating a list of parameters for which I would like to do performance tuning. 
# Since my time is limited I cannot afford to explore the entire parameter space.
parameters = {
    'vect__ngram_range': [(1,1), (1,2), (1,3)], 
    'tfidf__use_idf': (True, False), 
    'clf__alpha': [.01,.1,1],
}

# Create an instance of the grid search by passing the classifier, parameters .
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(train_x, train_y)

gs_clf.best_score_
gs_clf.best_params_

predicted = gs_clf.predict(valid_x)
metrics.accuracy_score(predicted, valid_y)

models['nb'] = gs_clf.best_estimator_

0.717

0.7387777777777778

{'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

0.751

In [18]:
# Training Support Vector Machines - SVM and calculating its performance
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(eta0=.1, penalty='l2', n_jobs=-1))]) 

text_clf_svm = text_clf_svm.fit(train_x, train_y)
predicted_svm = text_clf_svm.predict(valid_x)
metrics.accuracy_score(predicted_svm, valid_y)

# Similarly doing grid search for SVM
parameters_svm = {
    'vect__ngram_range': [(1,1), (1,2), (1,3)], 
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': [.001,.0001],
    'clf-svm__loss':('hinge', 'squared_hinge'),
    'clf-svm__learning_rate':('constant','optimal','invscaling'),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, cv=5, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(train_x, train_y)


gs_clf_svm.best_score_
gs_clf_svm.best_params_

predicted_svm = gs_clf_svm.predict(valid_x)
metrics.accuracy_score(predicted_svm, valid_y)

models['svm'] = gs_clf_svm.best_estimator_

0.755

0.7692222222222223

{'clf-svm__alpha': 0.0001,
 'clf-svm__learning_rate': 'constant',
 'clf-svm__loss': 'hinge',
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 1)}

0.763

In [24]:
# Training Support Vector Machines - SVM and calculating its performance
# Unfortunately 'hinge' loss cannot be used in the Vote Classifier I used in the ensemble.
# I need to try another loss, although technically it may not be SVM anymore.
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(eta0=.1, n_jobs=-1))]) 

text_clf_svm = text_clf_svm.fit(train_x, train_y)
predicted_svm = text_clf_svm.predict(valid_x)
metrics.accuracy_score(predicted_svm, valid_y)

# Similarly doing grid search for SVM
parameters_svm = {
    'vect__ngram_range': [(1,1), (1,2), (1,3)], 
    'tfidf__use_idf': (True, False),
    'clf-svm__penalty':('l2','l1','elasticnet'),
    'clf-svm__loss':('log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
    'clf-svm__learning_rate':('constant','optimal','invscaling'),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, cv=5, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(train_x, train_y)


gs_clf_svm.best_score_
gs_clf_svm.best_params_

predicted_svm = gs_clf_svm.predict(valid_x)
metrics.accuracy_score(predicted_svm, valid_y)

models['svm'] = gs_clf_svm.best_estimator_

0.755

0.7713333333333333

{'clf-svm__learning_rate': 'constant',
 'clf-svm__loss': 'log',
 'clf-svm__penalty': 'l2',
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 1)}

0.769

In [25]:
text_clf_lr = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-lr', linear_model.LogisticRegression(n_jobs=-1))])

text_clf_lr = text_clf_lr.fit(train_x, train_y)
predicted_lr = text_clf_lr.predict(valid_x)
metrics.accuracy_score(predicted_lr, valid_y)

parameters_lr = {'vect__ngram_range': [(1,1),(1,2)], 
                 'tfidf__use_idf': (True, False),
                 'clf-lr__C': [.1,1,10],
                 'clf-lr__solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
                }

gs_clf_lr = GridSearchCV(text_clf_lr, parameters_lr, cv=5, n_jobs=-1)
gs_clf_lr = gs_clf_lr.fit(train_x, train_y)


gs_clf_lr.best_score_
gs_clf_lr.best_params_

predicted_lr = gs_clf_lr.predict(valid_x)
metrics.accuracy_score(predicted_lr, valid_y)

models['lr'] = gs_clf_lr.best_estimator_

0.768

0.7713333333333333

{'clf-lr__C': 1,
 'clf-lr__solver': 'newton-cg',
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 1)}

0.768

In [26]:
text_clf_rf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-rf', ensemble.RandomForestClassifier(n_jobs=-1))])

text_clf_rf = text_clf_rf.fit(train_x, train_y)
predicted_rf = text_clf_rf.predict(valid_x)
metrics.accuracy_score(predicted_rf, valid_y)

parameters_rf = {'vect__ngram_range': [(1,1),(1,2)], 
                 'tfidf__use_idf': (True, False),
                 'clf-rf__n_estimators': [10,50,100],
                 'clf-rf__max_features': ('auto','sqrt','log2'),
                }

gs_clf_rf = GridSearchCV(text_clf_rf, parameters_rf, cv=5, n_jobs=-1)
gs_clf_rf = gs_clf_rf.fit(train_x, train_y)

gs_clf_rf.best_score_
gs_clf_rf.best_params_

predicted_rf = gs_clf_rf.predict(valid_x)
metrics.accuracy_score(predicted_rf, valid_y)

models['rf'] = gs_clf_rf.best_estimator_

0.723

0.7473333333333333

{'clf-rf__max_features': 'log2',
 'clf-rf__n_estimators': 100,
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 1)}

0.751

In [29]:
text_clf_xgb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-xgb', xgboost.XGBClassifier(learning_rate=.1, n_jobs=-1))])

text_clf_xgb = text_clf_xgb.fit(train_x, train_y)
predicted_xgb = text_clf_xgb.predict(valid_x)
metrics.accuracy_score(predicted_xgb, valid_y)

parameters_xgb = {'vect__ngram_range': [(1,1),(1,2)], 
                  'tfidf__use_idf': (True, False),
                  'clf-xgb__max_depth': [3,5],
                  'clf-xgb__n_estimators':[100,200],
                 }

gs_clf_xgb = GridSearchCV(text_clf_xgb, parameters_xgb, cv=5, n_jobs=-1)
gs_clf_xgb = gs_clf_xgb.fit(train_x, train_y)

gs_clf_xgb.best_score_
gs_clf_xgb.best_params_

predicted_xgb = gs_clf_xgb.predict(valid_x)
metrics.accuracy_score(predicted_xgb, valid_y)

models['xgb'] = gs_clf_xgb.best_estimator_

0.757

0.7702222222222223

{'clf-xgb__max_depth': 3,
 'clf-xgb__n_estimators': 200,
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 1)}

0.76

In [30]:
text_clf_knn = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-knn', KNeighborsClassifier(n_neighbors=10, p=1, algorithm='ball_tree', n_jobs=-1))])

text_clf_knn = text_clf_knn.fit(train_x, train_y)
predicted_knn = text_clf_knn.predict(valid_x)
metrics.accuracy_score(predicted_knn, valid_y)

parameters_knn = {
    'vect__ngram_range': [(1,1), (1,2)], 
    'tfidf__use_idf': (True, False),
#    'clf-knn__n_neighbors': [1,5,10],
#    'clf-knn__weights':('distance','uniform'),
#    'clf-knn__algorithm':('ball_tree', 'kd_tree', 'brute'),
#    'clf-knn__p':[1,2],
    }

gs_clf_knn = GridSearchCV(text_clf_knn, parameters_knn, cv=5, n_jobs=-1)
gs_clf_knn = gs_clf_knn.fit(train_x, train_y)

gs_clf_knn.best_score_
gs_clf_knn.best_params_

predicted_knn = gs_clf_knn.predict(valid_x)
metrics.accuracy_score(predicted_knn, valid_y)

models['knn'] = gs_clf_knn.best_estimator_

0.705

0.7082222222222222

{'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}

0.705

In [46]:
text_clf_mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-mlp', MLPClassifier(hidden_layer_sizes=(512,32,64), early_stopping=True))])

text_clf_mlp = text_clf_mlp.fit(train_x, train_y)
predicted_mlp = text_clf_mlp.predict(valid_x)
metrics.accuracy_score(predicted_mlp, valid_y)

parameters_mlp = {
#    'vect__ngram_range': [(1,1), (1,2)], 
#    'tfidf__use_idf': (True, False),
#    'clf-mlp__activation':('logistic', 'tanh', 'relu'),
#    'clf-mlp__solver':('lbfgs', 'sgd', 'adam'),
#    'clf-mlp__learning_rate':('constant', 'invscaling', 'adaptive'),    
    }
# It was taking too long so I skip parameter optimization for MLP.

gs_clf_mlp = GridSearchCV(text_clf_mlp, parameters_mlp,  cv=5)
gs_clf_mlp = gs_clf_mlp.fit(train_x, train_y)

gs_clf_mlp.best_score_
gs_clf_mlp.best_params_

predicted_mlp = gs_clf_mlp.predict(valid_x)
metrics.accuracy_score(predicted_mlp, valid_y)

models['mlp'] = gs_clf_mlp.best_estimator_

0.773

0.7696666666666667

{}

0.763

Score each classifier, I have calculated so far:

In [47]:
score_dict = OrderedDict([( key, score_classifier( y_true=valid_y, y_pred=model.predict(valid_x) ) )  for (key, model) in models.items()])
score_dict

OrderedDict([('nb', 0.7215932914046123),
             ('svm', 0.7424631449152079),
             ('lr', 0.7396476265290091),
             ('rf', 0.7049675642051009),
             ('xgb', 0.7329998798499459),
             ('knn', 0.6585011790391497),
             ('mlp', 0.7191859948458189)])

Create voting ensemble.

In [53]:
vc = VotingClassifier(estimators=list(models.items()))
vc.fit(train_x, train_y)
preds = vc.predict(valid_x)

VotingClassifier(estimators=[('nb', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), pre...ue, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [56]:
models['vc'] = vc

Calculate scores of the ensemble and see the improvement. 

In [82]:
score_classifier(y_true=valid_y, y_pred=preds)

0.8114828843995511

In [83]:
print(metrics.classification_report(y_true=valid_y, y_pred=preds))

             precision    recall  f1-score   support

          0       0.84      0.90      0.87       627
          1       0.80      0.71      0.76       373

avg / total       0.83      0.83      0.83      1000



In [84]:
print(metrics.accuracy_score(y_true=valid_y, y_pred=preds))

0.828


Save the model for later use.

In [79]:
# save the model to disk
filename = 'vc_model_small.sav'

In [66]:
pickle.dump(vc, open(filename, 'wb')) 

In [10]:
# load the model from disk
filename = 'vc_model_small.sav'
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(valid_x, valid_y)
preds = loaded_model.predict(valid_x)
print(result)

  if diff:
  if diff:
  if diff:


0.84


  if diff:


Since test data is too large to predict all at once, divide it into small parts and combine them after prediction.

In [175]:
test_predict=[]
locs = np.linspace(0,len(testDF),100,dtype=int)
for ind, loc in enumerate(locs[:-1]):
    test_predict.append(vc.predict(testDF.text.iloc[locs[ind]:locs[ind+1]]))
test_predict = np.concatenate(test_predict[:])

In [177]:
testDF.label = test_predict

In [180]:
testDF.to_pickle('test-data-small-predictions.bz2')

Next, try NN models that rely on word embeddings to see if they can do better.

Since the language of the data is unknown, I create word embeddings from the data itself.

In [10]:
from keras.preprocessing import text, sequence
from gensim.models import Word2Vec, KeyedVectors
from gensim.sklearn_api import w2vmodel
import gensim

In [11]:
maxlen= 150
size =100

I should create the word embeddings from tha largest corpus I can get my hands on.

In [12]:
w2v = Word2Vec(testDF.text.apply(lambda x:x.split(",")), size=size, negative=0)

In [13]:
w2v.wv.save_word2vec_format('model-small.bin')

In [14]:
w2v = KeyedVectors.load_word2vec_format('model-small.bin')

In [15]:
words = list(w2v.wv.vocab)

In [16]:
max_features = len(words)

In [18]:
token = text.Tokenizer(lower=False)
# token.fit_on_texts(train_x)
token.fit_on_texts(trainDF.text)

train_x_t2s = (token.texts_to_sequences(train_x)) 
valid_x_t2s = (token.texts_to_sequences(valid_x)) 

train_seq_x = sequence.pad_sequences(train_x_t2s, maxlen=maxlen)
valid_seq_x = sequence.pad_sequences(valid_x_t2s, maxlen=maxlen)

In [21]:
# create token-embedding mapping
embedding_matrix = np.zeros((len(token.word_index) + 1, size))
for word, i in token.word_index.items():
    try:
        embedding_vector = w2v[word]
        embedding_matrix[i] = embedding_vector
    except:
        embedding_matrix[i] =np.random.randn(size)

In [27]:
from keras import layers, models, optimizers
from keras.backend import expand_dims
from keras.utils import multi_gpu_model
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras import regularizers

In [24]:
cb = EarlyStopping(monitor='val_acc', min_delta=0, patience=5, verbose=0, mode='auto', baseline=None)

In [None]:
nn_models = {}

In [34]:
filepath="best_models/cnn_lstm/weights.hdf5"
chkpt = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# BLSTM-2DCNN
def create_cnn_lstm():
    inp = layers.Input(shape=(maxlen, ))
    x = layers.Embedding(max_features+1, size, weights=[embedding_matrix], input_length=maxlen)(inp)
    x = layers.Dropout(0.5)(x)    

    x = layers.Bidirectional(layers.CuDNNLSTM(300, return_sequences=True), merge_mode='sum')(x)
    x = layers.Dropout(0.2)(x)
#     x = expand_dims(x, axis=-1)
    print(x)
    x = layers.Reshape((150, 300, 1))(x)

    x = layers.Conv2D(100, kernel_size=(5,5), padding='valid', kernel_initializer='glorot_uniform')(x)
    x = layers.MaxPool2D(pool_size=(5,5))(x)

    x = layers.Flatten()(x)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(1, activation = "sigmoid",  kernel_regularizer=regularizers.l2(.00001))(x)
    
    parallel_model = models.Model(inputs = inp, outputs = x)
    parallel_model = multi_gpu_model(parallel_model, gpus=NUMBER_OF_GPUS_TO_USE)
    parallel_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return parallel_model

clf_cnn_lstm = create_cnn_lstm()

hist_cnn_lstm = clf_cnn_lstm.fit(train_seq_x, train_y, epochs=50, callbacks=[cb, chkpt], validation_data=(valid_seq_x, valid_y))
clf_cnn_lstm.load_weights(filepath)
predicted_cnn_lstm = clf_cnn_lstm.predict(valid_seq_x)
print("CNN, Word Embeddings",  metrics.accuracy_score(np.round(predicted_cnn_lstm).astype(int).astype(str), valid_y))

Tensor("dropout_14/cond/Merge:0", shape=(?, 150, 300), dtype=float32)
Train on 9000 samples, validate on 1000 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.78900, saving model to best_models/cnn_lstm/weights.hdf5
Epoch 2/50

Epoch 00002: val_acc did not improve from 0.78900
Epoch 3/50

Epoch 00003: val_acc did not improve from 0.78900
Epoch 4/50

Epoch 00004: val_acc did not improve from 0.78900
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.78900
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.78900
CNN, Word Embeddings 0.789


In [41]:
nn_models['cnn_lstm'] = clf_cnn_lstm

In [35]:
filepath="best_models/cnn/weights.hdf5"
chkpt = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(token.word_index)+1, size, weights=[embedding_matrix], trainable=False)(input_layer)
    #embedding_layer = w2v.get_keras_embedding()(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 10, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)    
    model = multi_gpu_model(model, gpus=NUMBER_OF_GPUS_TO_USE)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

clf_cnn = create_cnn()

hist_cnn = clf_cnn.fit(train_seq_x, train_y, epochs=50, callbacks=[cb, chkpt], validation_data=(valid_seq_x, valid_y))
clf_cnn.load_weights(filepath)
predicted_cnn = clf_cnn.predict(valid_seq_x)
print("CNN, Word Embeddings",  metrics.accuracy_score(np.round(predicted_cnn).astype(int).astype(str), valid_y))

Train on 9000 samples, validate on 1000 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.68000, saving model to best_models/cnn/weights.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.68000 to 0.73800, saving model to best_models/cnn/weights.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.73800 to 0.74700, saving model to best_models/cnn/weights.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.74700 to 0.76900, saving model to best_models/cnn/weights.hdf5
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.76900
Epoch 6/50

Epoch 00006: val_acc improved from 0.76900 to 0.77100, saving model to best_models/cnn/weights.hdf5
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.77100
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.77100
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.77100
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.77100
Epoch 11/50

Epoch 00011: val_acc did not improve from 0.77100
CNN, Word Embeddings 

In [42]:
nn_models['cnn'] = clf_cnn

In [36]:
filepath="best_models/rnn_lstm/weights.hdf5"
chkpt = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

def create_rnn_lstm():
    # Add an Input Layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(token.word_index) + 1, size, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.CuDNNLSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model = multi_gpu_model(model, gpus=NUMBER_OF_GPUS_TO_USE)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

clf_rnn = create_rnn_lstm()
hist_rnn = clf_rnn.fit(train_seq_x, train_y, epochs=50, callbacks=[cb, chkpt], validation_data=(valid_seq_x, valid_y))
clf_rnn.load_weights(filepath)
predicted_rnn = clf_rnn.predict(valid_seq_x)
print("RNN-LSTM, Word Embeddings",  metrics.accuracy_score(np.round(predicted_rnn).astype(int).astype(str), valid_y))

Train on 9000 samples, validate on 1000 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.62700, saving model to best_models/rnn_lstm/weights.hdf5
Epoch 2/50

Epoch 00002: val_acc did not improve from 0.62700
Epoch 3/50

Epoch 00003: val_acc improved from 0.62700 to 0.65100, saving model to best_models/rnn_lstm/weights.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.65100 to 0.70700, saving model to best_models/rnn_lstm/weights.hdf5
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.70700
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.70700
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.70700
Epoch 8/50

Epoch 00008: val_acc improved from 0.70700 to 0.74000, saving model to best_models/rnn_lstm/weights.hdf5
Epoch 9/50

Epoch 00009: val_acc improved from 0.74000 to 0.74400, saving model to best_models/rnn_lstm/weights.hdf5
Epoch 10/50

Epoch 00010: val_acc improved from 0.74400 to 0.76200, saving model to best_models/rnn_lstm/weights.hdf5
Epoc

In [43]:
nn_models['rnn_lstm'] = clf_rnn

In [37]:
filepath="best_models/rnn_gru/weights.hdf5"
chkpt = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

def create_rnn_gru():
    # Add an Input Layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(token.word_index) + 1, size, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.CuDNNGRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model = multi_gpu_model(model, gpus=NUMBER_OF_GPUS_TO_USE)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

clf_gru = create_rnn_gru()
hist_gru = clf_gru.fit(train_seq_x, train_y, epochs=50, callbacks=[cb, chkpt], validation_data=(valid_seq_x, valid_y))
clf_gru.load_weights(filepath)
predicted_gru = clf_gru.predict(valid_seq_x)
print("RNN-GRU, Word Embeddings",  metrics.accuracy_score(np.round(predicted_gru).astype(int).astype(str), valid_y))

Train on 9000 samples, validate on 1000 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.62700, saving model to best_models/rnn_gru/weights.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.62700 to 0.68900, saving model to best_models/rnn_gru/weights.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.68900 to 0.71800, saving model to best_models/rnn_gru/weights.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.71800 to 0.72200, saving model to best_models/rnn_gru/weights.hdf5
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.72200
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.72200
Epoch 7/50

Epoch 00007: val_acc improved from 0.72200 to 0.75200, saving model to best_models/rnn_gru/weights.hdf5
Epoch 8/50

Epoch 00008: val_acc improved from 0.75200 to 0.75800, saving model to best_models/rnn_gru/weights.hdf5
Epoch 9/50

Epoch 00009: val_acc improved from 0.75800 to 0.76700, saving model to best_models/rnn_gru/weights.hdf5
Epoch 10/50

Epoch 

In [44]:
nn_models['rnn_gru'] = clf_gru

In [38]:
filepath="best_models/bidirectional_rnn/weights.hdf5"
chkpt = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

def create_bidirectional_rnn():
    # Add an Input Layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(token.word_index) + 1, size, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.CuDNNGRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model = multi_gpu_model(model, gpus=NUMBER_OF_GPUS_TO_USE)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

clf_rnnbi = create_bidirectional_rnn()
hist_rnnbi = clf_rnnbi.fit(train_seq_x, train_y, epochs=50, callbacks=[cb, chkpt], validation_data=(valid_seq_x, valid_y))
clf_rnnbi.load_weights(filepath)
predicted_rnnbi = clf_rnnbi.predict(valid_seq_x)
print("RNN-Gbidirectional, Word Embeddings",  metrics.accuracy_score(np.round(predicted_rnnbi).astype(int).astype(str), valid_y))

Train on 9000 samples, validate on 1000 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.62700, saving model to best_models/bidirectional_rnn/weights.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.62700 to 0.63600, saving model to best_models/bidirectional_rnn/weights.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.63600 to 0.70200, saving model to best_models/bidirectional_rnn/weights.hdf5
Epoch 4/50

Epoch 00004: val_acc did not improve from 0.70200
Epoch 5/50

Epoch 00005: val_acc improved from 0.70200 to 0.73400, saving model to best_models/bidirectional_rnn/weights.hdf5
Epoch 6/50

Epoch 00006: val_acc improved from 0.73400 to 0.74400, saving model to best_models/bidirectional_rnn/weights.hdf5
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.74400
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.74400
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.74400
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.74400
Epoch 11/50

E

In [45]:
nn_models['rnnbi'] = clf_rnnbi

In [39]:
filepath="best_models/rcnn/weights.hdf5"
chkpt = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

def create_rcnn():
    # Add an Input Layer
    input_layer = layers.Input((maxlen, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(token.word_index) + 1, size, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)
    
    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.CuDNNGRU(50, return_sequences=True))(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 10, activation="selu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="selu")(pooling_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model = multi_gpu_model(model, gpus=NUMBER_OF_GPUS_TO_USE)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

clf_rcnn = create_rcnn()
hist_rcnn = clf_rcnn.fit(train_seq_x, train_y, epochs=50, callbacks=[cb, chkpt], validation_data=(valid_seq_x, valid_y))
clf_rcnn.load_weights(filepath)
predicted_rcnn = clf_rcnn.predict(valid_seq_x)
print("RCNN, Word Embeddings",  metrics.accuracy_score(np.round(predicted_rcnn).astype(int).astype(str), valid_y))

Train on 9000 samples, validate on 1000 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.73300, saving model to best_models/rcnn/weights.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.73300 to 0.75200, saving model to best_models/rcnn/weights.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.75200 to 0.77700, saving model to best_models/rcnn/weights.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.77700 to 0.78600, saving model to best_models/rcnn/weights.hdf5
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.78600
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.78600
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.78600
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.78600
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.78600
RCNN, Word Embeddings 0.786


In [46]:
nn_models['rcnn'] = clf_rcnn

In [48]:
score_dict = OrderedDict([( key, score_classifier( y_true=valid_y, y_pred=np.round(model.predict(valid_seq_x)).astype(int).astype(str) ) )  for (key, model) in nn_models.items()])
score_dict

OrderedDict([('cnn', 0.7553395278373403),
             ('rnn_lstm', 0.7388048235505863),
             ('rnnbi', 0.723870132671772),
             ('rcnn', 0.7694402785678578),
             ('cnn_lstm', 0.7711891333273331),
             ('rnn_gru', 0.7594785318039994)])

In [54]:
token_test = text.Tokenizer(lower=False)
# token.fit_on_texts(train_x)
token_test.fit_on_texts(testDF.text)

test_x_seq = (token_test.texts_to_sequences(testDF.text)) 

test_x_seq_pad = sequence.pad_sequences(test_x_seq, maxlen=maxlen)

testDF.label = np.round(nn_models['cnn_lstm'].predict(test_x_seq_pad)).astype(int).astype(str)

Compare the model that performed best to the voting classifier.

In [90]:
print(metrics.classification_report(y_true=valid_y, y_pred=np.round(nn_models['cnn_lstm'].predict(valid_seq_x)).astype(int).astype(str)))

             precision    recall  f1-score   support

          0       0.82      0.85      0.84       627
          1       0.73      0.68      0.71       373

avg / total       0.79      0.79      0.79      1000



In [86]:
# load the model from disk
filename = 'vc_model_small.sav'
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model = loaded_model.fit(train_x, train_y)
result = loaded_model.score(valid_x, valid_y)
print(result)

  if diff:


0.828


  if diff:


In [91]:
print(metrics.classification_report(y_true=valid_y, y_pred=loaded_model.predict(valid_x)))

  if diff:


             precision    recall  f1-score   support

          0       0.84      0.90      0.87       627
          1       0.80      0.71      0.76       373

avg / total       0.83      0.83      0.83      1000



  if diff:


The voting model performed better and I dont have enough time to create an ensemble of all or tune NN models, I will be using the voting classifier to create predictions.