# Text Classification for the Large Data

Since the data is too large, I will be using the parameters I obtained when I optimized the models for the small dataset. It would take too much time to optimize for a dataset this large.

Similarly I start with setting environment variables and imports.

In [1]:
import numpy as np
import os
import GPUtil


Availability=GPUtil.getAvailability(GPUtil.getGPUs())
all_gpus = np.arange(3)
available_gpu_indexes = [x for x in all_gpus if Availability[x]]
# NUMBER_OF_GPUS_TO_USE = len(available_gpu_indexes)
NUMBER_OF_GPUS_TO_USE = 1
# Set CUDA_DEVICE_ORDER so the IDs assigned by CUDA match those from nvidia-smi
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the first NUMBER_OF_GPUS_TO_USE available device id
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(np.array(available_gpu_indexes[:NUMBER_OF_GPUS_TO_USE]).astype(str))

In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [14]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15965674896586113992
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11209333146
locality {
  bus_id: 1
  links {
  }
}
incarnation: 8353315950670369382
physical_device_desc: "device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:05:00.0, compute capability: 5.2"
]


In [4]:
import warnings; warnings.filterwarnings('ignore') 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn import metrics
import numpy as np
import pandas as pd
import bz2, glob, os, xgboost, pickle

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import decomposition, ensemble
from collections import OrderedDict
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

In [5]:
def score_classifier(y_true, y_pred):
    return metrics.f1_score(
        y_true, y_pred,
        average='macro', #Calculate metrics for each label, and find their unweighted mean.
        #This does not take label imbalance into account.
    )

In [6]:
train_file = bz2.BZ2File('training-data-large.txt.bz2')

In [7]:
test_file = bz2.BZ2File('test-data-large.txt.bz2')

In [8]:
labels, texts = [],[]
for i, line in enumerate(train_file.readlines(-1)):
    content = line.decode("utf-8").split("\t")
    labels.append(content[0])
    texts.append(content[1].split("\n")[0])
    
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [9]:
texts = []
for i, line in enumerate(test_file.readlines(-1)):
    content = line.decode("utf-8").split("\t")
    texts.append(content[0].split("\n")[0])
    
testDF = pd.DataFrame()
testDF['text'] = texts
testDF['label'] = ""

In [10]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.1)

In [12]:
models = {}

Due to time constraints, I use the parameters I obtained when I optimized for the small dataset and skip parameter optimization.

In [13]:
%%time
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))), 
    ('tfidf', TfidfTransformer(use_idf=False)), 
    ('clf', MultinomialNB(alpha=.1))])
text_clf = text_clf.fit(train_x, train_y)

# Performance of NB Classifier
predicted = text_clf.predict(valid_x)
metrics.accuracy_score(predicted, valid_y)

models['nb'] = text_clf

CPU times: user 2min 48s, sys: 5.05 s, total: 2min 53s
Wall time: 2min 53s


0.82925

the voting classifier does not accept `hinge` loss therefore I am not going to use it in the ensemble. This calculation is to see how it perform compared to others.

In [16]:
%%time
# Training Support Vector Machines - SVM and calculating its performance
text_clf_svm_o = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf-svm', SGDClassifier(loss='hinge', learning_rate='constant', alpha=.0001, eta0=.1, penalty='l2', n_jobs=10))]) 

text_clf_svm_o = text_clf_svm_o.fit(train_x, train_y)
predicted_svm = text_clf_svm_o.predict(valid_x)
metrics.accuracy_score(predicted_svm, valid_y)


# models['svm'] = gs_clf_svm.best_estimator_

CPU times: user 1min 6s, sys: 2.4 s, total: 1min 8s
Wall time: 1min 8s


In [17]:
metrics.accuracy_score(predicted_svm, valid_y)

0.78628

In [19]:
# Training Support Vector Machines - SVM and calculating its performance
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf-svm', SGDClassifier(loss='log',penalty='l2',learning_rate='constant',eta0=.1,n_jobs=10))]) 

text_clf_svm = text_clf_svm.fit(train_x, train_y)
predicted_svm = text_clf_svm.predict(valid_x)
metrics.accuracy_score(predicted_svm, valid_y)

models['svm'] = text_clf_svm

0.77962

The creation of `logistic regression` classifier could not complete due to memory error.

In [None]:
text_clf_lr = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf-lr', linear_model.LogisticRegression(solver='newton-cg', C=1., n_jobs=10))])

text_clf_lr = text_clf_lr.fit(train_x, train_y)
predicted_lr = text_clf_lr.predict(valid_x)
metrics.accuracy_score(predicted_lr, valid_y)

models['lr'] = text_clf_lr

In [25]:
text_clf_rf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf-rf', ensemble.RandomForestClassifier(n_estimators =100, max_features='log2', n_jobs=10))])

text_clf_rf = text_clf_rf.fit(train_x, train_y)
predicted_rf = text_clf_rf.predict(valid_x)
metrics.accuracy_score(predicted_rf, valid_y)

models['rf'] = text_clf_rf

0.84519

In [26]:
text_clf_xgb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf-xgb', xgboost.XGBClassifier(n_estimators = 200, max_depth = 3, learning_rate=.1, n_jobs=10))])

text_clf_xgb = text_clf_xgb.fit(train_x, train_y)
predicted_xgb = text_clf_xgb.predict(valid_x)
metrics.accuracy_score(predicted_xgb, valid_y)

models['xgb'] = text_clf_xgb

  if diff:


0.79054

The creation of `K-Nearest neighbors` classifier could not complete due to memory error.

In [None]:
text_clf_knn = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf-knn', KNeighborsClassifier(n_neighbors=5, leaf_size=10, p=1, algorithm='ball_tree', n_jobs=10))])

text_clf_knn = text_clf_knn.fit(train_x, train_y)
predicted_knn = text_clf_knn.predict(valid_x)
metrics.accuracy_score(predicted_knn, valid_y)

models['knn'] = text_clf_knn

It was taking too long, I skip `MLP` as well.

In [None]:
text_clf_mlp = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-mlp', MLPClassifier(hidden_layer_sizes=(512,256), batch_size=5000, early_stopping=True))])

text_clf_mlp = text_clf_mlp.fit(train_x, train_y)
predicted_mlp = text_clf_mlp.predict(valid_x)
metrics.accuracy_score(predicted_mlp, valid_y)

models['mlp'] = text_clf_mlp

Lets see what we got so far.

In [35]:
score_dict = OrderedDict([( key, score_classifier( y_true=valid_y, y_pred=model.predict(valid_x) ) )  for (key, model) in models.items()])
score_dict

  if diff:


OrderedDict([('svm', 0.7617825515018082),
             ('xgb', 0.7836097379768439),
             ('rf', 0.8282508249869445),
             ('nb', 0.8193059610665339)])

In [36]:
vc = VotingClassifier(estimators=list(models.items()))
vc.fit(train_x, train_y)
preds = vc.predict(valid_x)

VotingClassifier(estimators=[('svm', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), pr...e,
         use_idf=False)), ('clf', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))]))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

  if diff:
  if diff:


In [37]:
models['vc'] = vc

The ensemble offered increased performance as expected.

In [38]:
score_classifier(y_true=valid_y, y_pred=preds)

0.8405271434195396

In [83]:
print(metrics.classification_report(y_true=valid_y, y_pred=preds))

             precision    recall  f1-score   support

          0       0.84      0.90      0.87       627
          1       0.80      0.71      0.76       373

avg / total       0.83      0.83      0.83      1000



In [84]:
print(metrics.accuracy_score(y_true=valid_y, y_pred=preds))

0.828


In [39]:
# save the model to disk
filename = 'vc_model-large.sav'

In [40]:
pickle.dump(vc, open(filename, 'wb')) 

In [81]:
# load the model from disk
filename = 'vc_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(valid_x, valid_y)
preds = loaded_model.predict(valid_x)
print(result)

  if diff:
  if diff:
  if diff:


0.828


  if diff:


This time I was able to predict all at once.

In [42]:
testDF.label = vc.predict(testDF.text)

  if diff:
  if diff:


In [46]:
testDF.to_pickle('test-data-large-predictions.bz2', compression='bz2')

I skip trying and tuning other NN models and go with the voting classifier due to time constraints.