In [1]:
import pandas
import csv
import re
import unicodedata
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem import SnowballStemmer
import time
import scipy.sparse
import warnings

#classifiers
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

#graphs
%matplotlib inline
import matplotlib.pyplot as plt

#parallel
from sklearn.externals.joblib import Parallel, delayed

## Loading data into memory
In the following section, we load the training and test data

In [2]:
df_train = pandas.read_csv('./data/train', sep='\t', index_col=0)
df_test = pandas.read_csv('./data/test', sep='\t', index_col=0)

Xtrain=df_train['text']
Ytrain=df_train['label']

Xtest=df_test['text']
Ytest=df_test['label']
print('datasets were loaded', len(df_train), len(df_test))

datasets were loaded 164926 54976


Next, we define "split_into_tokens", function to process the text giving as result a list of tokens where the following steps have been made:
<li> Accents are removed
<li> Non-alphanumeric characters are filtered
<li> Shift to lower case and split text in tokens
<li> Deleted stopwords and replacement of the remaining words by their root (stemming)

In [9]:
def split_into_tokens(text):
    #stemmer = SnowballStemmer('spanish')
    stemmer = None
    min_length = 3
    # 1. Remove accent marks
    review_text = ''.join((c for c in unicodedata.normalize('NFD',str(text)) if unicodedata.category(c) != 'Mn'))
    #
    # 2. Remove non-alphanumeric
    #letters_only = re.sub("[^A-Za-z0-9]", " ", review_text) 
    letters_only = re.sub("[^\w\d]", " ", review_text) 
    #
    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 3. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("spanish"))                  
    # 
    # 4. Remove stop words and apply or not stemming
    if stemmer:
        filtered_tokens = [stemmer.stem(w) for w in words if not w in stops and len(w)>=min_length]
    else:
        filtered_tokens = [w for w in words if not w in stops and len(w)>=min_length]
    #
    # 5. return the result
    return filtered_tokens

The "trainModel" function receives the following arguments: the name of the classification algorithm, the class, its parameters, and the data sets. This function trains the model and returns a tuple with the name of the algorithm and the model already trained.

In [10]:
def trainModel(name, clazz, params, Xtrain, Ytrain):
    print("training ", name)
    model = clazz(**params)
    start = time.time() # Start time
    model.fit(Xtrain, Ytrain)
    end = time.time()
    elapsed = end - start
    print("-> done ", name, " - Time taken for training:", elapsed, "seconds")
    return (name, model)

## "Classical" algorithms

In this section, we create a dictionary that contains all the necessary data of the classification algorithms that are going to be used, as well as the parameters of each one of them. (In case the parameters are not indicated, those configured by default in scikit-learn are used).

In [11]:
estimators = {"KNeighbors": (KNeighborsClassifier, {}),
              "MultinomialNB" : (MultinomialNB, {}),
              "RandomForest" : (RandomForestClassifier, {"n_estimators":100}),
              "LogisticRegression" : (LogisticRegression, {}),
              "MLP" : (MLPClassifier, {"hidden_layer_sizes":100}),
              "SVM" : (SVC, {"cache_size":1000}),
              "LinearSVC" : (LinearSVC, {})
             }

## Bag of words  (unbalanced, with all "classic" algorithms)


In this section, training and test data are transformed into a bag of words, going from a set of tokens to a set of occurrences per token.

In [4]:
bow = CountVectorizer(analyzer=split_into_tokens)

print("Creando matriz de bolsa de palabras...")

%time bow.fit(Xtrain, Ytrain)
%time Xtrain_bow = bow.transform(Xtrain)
%time Xtest_bow = bow.transform(Xtest)

scipy.sparse.save_npz('data/Xtrain_bow.npz', Xtrain_bow)
scipy.sparse.save_npz('data/Xtest_bow.npz', Xtest_bow)

Creando matriz de bolsa de palabras...
CPU times: user 8min 42s, sys: 1.4 s, total: 8min 43s
Wall time: 8min 43s
CPU times: user 8min 43s, sys: 1.25 s, total: 8min 44s
Wall time: 8min 44s
CPU times: user 2min 53s, sys: 448 ms, total: 2min 53s
Wall time: 2min 53s


This section, loads in memory the bags of words. Use only if you have previously obtained the bags of words and do not have them loaded into memory.

In [12]:
# Xtest_bow = scipy.sparse.load_npz('data/Xtest_bow.npz').astype(np.int16, casting='same_kind')
# Xtrain_bow = scipy.sparse.load_npz('data/Xtrain_bow.npz').astype(np.int16, casting='same_kind')

Next, we create as many processes as we have estimators. These processes will be in charge of training the different models in parallel.<br> 
As a result, we get a list of tuples formed by: (the name of the algorithm, the model already trained).

In [7]:
models = Parallel(n_jobs=len(estimators))(delayed(trainModel)(name, clazz, params, Xtrain_bow, Ytrain) for (name, (clazz, params)) in estimators.items())

training  KNeighbors
-> done  KNeighbors  - Time taken for training: 0.05667233467102051 seconds
training  MultinomialNB
-> done  MultinomialNB  - Time taken for training: 0.1639249324798584 seconds
training  MLP
training  RandomForest
training  LogisticRegression
training  LinearSVC
training  SVM
-> done  LinearSVC  - Time taken for training: 2.9192557334899902 seconds
-> done  LogisticRegression  - Time taken for training: 17.37110424041748 seconds
-> done  RandomForest  - Time taken for training: 52.50565981864929 seconds
-> done  SVM  - Time taken for training: 562.716991186142 seconds
-> done  MLP  - Time taken for training: 2464.838773727417 seconds


Once the models are trained, we obtain the labels from the test set and evaluate the results of each of them. 

In [8]:
print("Obteniendo resultados:")
for (name, model) in models:
    start = time.time() # Start time
    if name == "KNeighbors":
        result = [y for x in [Xtest_bow[i:i+5000,:] for i in range(0,Xtest_bow.shape[0],5000)] for y in model.predict(x)]
    else:
        result = model.predict(Xtest_bow)
    end = time.time()
    elapsed = end - start
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")

Obteniendo resultados:
---------- Modelo:  KNeighbors  ---------- Time taken for prediction: 661.5408456325531 seconds
              precision    recall  f1-score   support

      False     0.9987    1.0000    0.9994     54473
       True     1.0000    0.8628    0.9264       503

avg / total     0.9987    0.9987    0.9987     54976
 

---------- Modelo:  MultinomialNB  ---------- Time taken for prediction: 0.11231493949890137 seconds
              precision    recall  f1-score   support

      False     0.9992    0.9976    0.9984     54473
       True     0.7761    0.9165    0.8405       503

avg / total     0.9972    0.9968    0.9969     54976
 

---------- Modelo:  MLP  ---------- Time taken for prediction: 0.4970865249633789 seconds
              precision    recall  f1-score   support

      False     0.9994    0.9996    0.9995     54473
       True     0.9551    0.9304    0.9426       503

avg / total     0.9990    0.9990    0.9990     54976
 

---------- Modelo:  RandomForest  --

## Tf-idf model (unbalanced, with all "classic" algorithms)

To compare with the results obtained, we will use the representation of the training and test data in Tf-idf format. To achieve this, the bag of words is transformed into the frequency of occurrence of terms in the collection of documents.<br>
The process that is carried out, from this point on, is the same as the employee with the bag of words previously.

In [7]:
tfidf = TfidfTransformer()

print("Creando matriz de tf-idf...")

%time tfidf.fit(Xtrain_bow, Ytrain)
%time Xtrain_tfidf = tfidf.transform(Xtrain_bow)
%time Xtest_tfidf = tfidf.transform(Xtest_bow)

scipy.sparse.save_npz('data/Xtrain_tfidf.npz', Xtrain_tfidf)
scipy.sparse.save_npz('data/Xtest_tfidf.npz', Xtest_tfidf)

Creando matriz de tf-idf...
CPU times: user 64 ms, sys: 24 ms, total: 88 ms
Wall time: 85.9 ms
CPU times: user 412 ms, sys: 140 ms, total: 552 ms
Wall time: 549 ms
CPU times: user 144 ms, sys: 32 ms, total: 176 ms
Wall time: 175 ms


This section, loads in memory the bags of words. Use only if you have previously obtained the bags of words and do not have them loaded into memory.

In [16]:
# Xtest_tfidf = scipy.sparse.load_npz('data/Xtest_tfidf.npz').astype(np.float32)
# Xtrain_tfidf = scipy.sparse.load_npz('data/Xtrain_tfidf.npz').astype(np.float32)

In [17]:
Xtrain_tfidf

<164926x140425 sparse matrix of type '<class 'numpy.float32'>'
	with 18552095 stored elements in Compressed Sparse Row format>

Next, we create as many processes as we have estimators. These processes will be in charge of training the different models in parallel.<br> 
As a result, we get a list of tuples formed by: (the name of the algorithm, the model already trained).

In [10]:
models_tfidf = Parallel(n_jobs=len(estimators))(delayed(trainModel)(name, clazz, params, Xtrain_tfidf, Ytrain) for (name, (clazz, params)) in estimators.items())

training  KNeighbors
training  MultinomialNB
-> done  KNeighbors  - Time taken for training: 0.29846620559692383 seconds
-> done  MultinomialNB  - Time taken for training: 0.17311930656433105 seconds
training  MLP
training  RandomForest
training  LogisticRegression
training  LinearSVC
training  SVM
-> done  LinearSVC  - Time taken for training: 1.6419150829315186 seconds
-> done  LogisticRegression  - Time taken for training: 5.75917911529541 seconds
-> done  RandomForest  - Time taken for training: 58.53112745285034 seconds
-> done  SVM  - Time taken for training: 429.11979126930237 seconds
-> done  MLP  - Time taken for training: 4001.9736495018005 seconds


Once the models are trained, we obtain the labels from the test set and evaluate the results of each of them. 

In [11]:
print("Obteniendo resultados:")
for (name, model) in models_tfidf:
    start = time.time() # Start time
    if name == "KNeighbors":
        result = [y for x in [Xtest_tfidf[i:i+5000,:] for i in range(0,Xtest_tfidf.shape[0],5000)] for y in model.predict(x)]
    else:
        result = model.predict(Xtest_tfidf)
    end = time.time()
    elapsed = end - start
    print("---------- Modelo: ", name, " ---------- Time taken for prediction:", elapsed,"seconds\n", classification_report(Ytest, result, digits=4), "\n")

Obteniendo resultados:
---------- Modelo:  KNeighbors  ---------- Time taken for prediction: 632.7096681594849 seconds
              precision    recall  f1-score   support

      False     0.9977    1.0000    0.9988     54473
       True     1.0000    0.7475    0.8555       503

avg / total     0.9977    0.9977    0.9975     54976
 

---------- Modelo:  MultinomialNB  ---------- Time taken for prediction: 0.03978276252746582 seconds
              precision    recall  f1-score   support

      False     0.9922    1.0000    0.9961     54473
       True     1.0000    0.1451    0.2535       503

avg / total     0.9922    0.9922    0.9893     54976
 

---------- Modelo:  MLP  ---------- Time taken for prediction: 0.5144734382629395 seconds
              precision    recall  f1-score   support

      False     0.9994    0.9997    0.9995     54473
       True     0.9670    0.9324    0.9494       503

avg / total     0.9991    0.9991    0.9991     54976
 

---------- Modelo:  RandomForest  --

  'precision', 'predicted', average, warn_for)


## Voting (Bag of Words)

In [12]:
ensemble = VotingClassifier(models, n_jobs=-1)
start = time.time() # Start time
voting_model_bow=ensemble.fit(Xtrain_bow,Ytrain)
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for training:", elapsed, "seconds")

VotingClassifier - Time taken for training: 2449.9408764839172 seconds


In [13]:
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
start = time.time() # Start time
predictions1 = [y for x in [Xtest_bow[i:i+2000,:] for i in range(0,Xtest_bow.shape[0],2000)] for y in voting_model_bow.predict(x)]
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for prediction:", elapsed, "seconds")
cr1=classification_report(Ytest, predictions1, digits=4)
print(cr1)

VotingClassifier - Time taken for prediction: 800.65008020401 seconds
             precision    recall  f1-score   support

      False     0.9992    0.9999    0.9996     54473
       True     0.9935    0.9085    0.9491       503

avg / total     0.9991    0.9991    0.9991     54976



## Voting (Tf-idf)

In [14]:
ensemble = VotingClassifier(models_tfidf, n_jobs=-1)
start = time.time() # Start time
voting_model_tfidf=ensemble.fit(Xtrain_tfidf,Ytrain)
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for training:", elapsed, "seconds")

VotingClassifier - Time taken for training: 3767.6675279140472 seconds


In [15]:
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
start = time.time() # Start time
predictions2 = [y for x in [Xtest_tfidf[i:i+2000,:] for i in range(0,Xtest_tfidf.shape[0],2000)] for y in voting_model_tfidf.predict(x)]
end = time.time()
elapsed = end - start
print("VotingClassifier - Time taken for prediction:", elapsed, "seconds")
cr2=classification_report(Ytest, predictions2, digits=4)
print(cr2)

VotingClassifier - Time taken for prediction: 731.4793696403503 seconds
             precision    recall  f1-score   support

      False     0.9988    1.0000    0.9994     54473
       True     1.0000    0.8728    0.9321       503

avg / total     0.9988    0.9988    0.9988     54976

