In [1]:
import pandas as pd
import nltk.data
import _pickle as cPickle
import time
import sys
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem import SnowballStemmer
import unicodedata
import re
from sklearn.preprocessing import MinMaxScaler

#parallel
from sklearn.externals.joblib import Parallel, delayed

#classifiers
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

## FUNCTIONS

Next, we define "review_to_wordlist", a function that processes a text and returns a list of tokens. It performs the following steps:
<li> Non-alphanumeric characters are replaced by spaces (accented letters are kept as they are)
<li> The text is split into tokens (case is left unchanged for this W2V model)
<li> Stopwords are removed
<li> Optionally, the remaining words are replaced by their root (stemming); when no stemmer is given, tokens made only of digits are replaced by the token "DIGITO" instead

In [2]:
def review_to_wordlist(raw_review, stemmer=False, stops=None):
    """Convert a raw review into a list of cleaned tokens.

    Parameters
    ----------
    raw_review : str
        Raw text of a single review.
    stemmer : object or False, optional
        Object exposing ``stem(word)``. When truthy, every kept token is
        replaced by ``stemmer.stem(token)``. When falsy (the default), tokens
        are kept unchanged, except that tokens made only of digits are
        replaced by the literal token "DIGITO".
    stops : collection of str, optional
        Stop words to discard. When omitted, the NLTK Spanish stop-word list
        is used; it is loaded once and cached on the function so that it is
        not rebuilt for every review.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # 1. Replace every non-word character with a space.
    letters_only = re.sub(r"[^\w\d]", " ", raw_review)

    # 2. Split into individual tokens. Case is deliberately left untouched
    #    for this W2V model.
    words = letters_only.split()

    # 3. Resolve the stop-word collection. Membership in a set is much faster
    #    than in a list, and building the set is loop-invariant work, so the
    #    default set is built only on the first call.
    if stops is None:
        stops = getattr(review_to_wordlist, "_spanish_stops", None)
        if stops is None:
            stops = set(stopwords.words("spanish"))
            review_to_wordlist._spanish_stops = stops

    # 4. Drop stop words (exact, case-sensitive match), then either stem the
    #    tokens or replace all-digit tokens with "DIGITO".
    if stemmer:
        meaningful_words = [stemmer.stem(w) for w in words if w not in stops]
    else:
        meaningful_words = [re.sub(r"^\d+$", "DIGITO", w) for w in words if w not in stops]

    # 5. Return the token list (not a joined string).
    return meaningful_words

Function that loads into memory the W2V model stored at the given path.

In [3]:
def load_W2V_model(path):
    """Load word vectors stored in binary word2vec format at ``path``.

    Prints a confirmation message once loading has finished and returns the
    object produced by ``KeyedVectors.load_word2vec_format``.
    """
    w2v_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
    print("Loaded W2V model")
    return w2v_vectors

Function that cleans every text of a collection and returns a list of token lists (one list of tokens per text).

In [4]:
def makeclearlist(text):
    """Clean every review in ``text`` and return a list of token lists.

    Parameters
    ----------
    text : iterable of str exposing ``.shape``
        Collection of raw reviews; its ``shape`` is printed before and after
        cleaning as a progress message.

    Returns
    -------
    list of list of str
        One token list per review, produced by ``review_to_wordlist`` with
        its default arguments (no stemming).
    """
    # No stemmer is created here: review_to_wordlist is called without one,
    # so building it would be wasted work.
    print("Limpiando texto. shape:", text.shape)
    clean_text = [review_to_wordlist(review) for review in text]
    print("Texto limpio. shape:", text.shape)
    return clean_text

The "trainModel" function receives the following arguments: the name of the classification algorithm, the class, its parameters, and the data sets. This function trains the model and returns a tuple with the name of the algorithm and the model already trained.

In [5]:
def trainModel(name, clazz, params, Xtrain, Xtrain_sca, Ytrain):
    """Build ``clazz(**params)``, fit it and report how long fitting took.

    The estimator named "MultinomialNB" is fitted on ``Xtrain_sca``; every
    other estimator is fitted on ``Xtrain``. Returns the tuple
    ``(name, fitted_estimator)``.
    """
    print("training ", name)
    estimator = clazz(**params)

    # Pick the feature matrix first so that only the fit itself is timed.
    features = Xtrain_sca if name == "MultinomialNB" else Xtrain

    t0 = time.time()
    estimator.fit(features, Ytrain)
    elapsed = time.time() - t0

    print("-> done ", name, " - Time taken for training:", elapsed, "seconds")
    return (name, estimator)

The "makeFeatureVec" and "getAvgFeatureVecs" functions transform each review into a vector of length "num_features".<br>
The resulting vector is the average of the vectors of those words of the review that are present in the vocabulary of the W2V model.

In [6]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the tokens of one review.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    model : mapping-like
        Word-vector lookup supporting ``word in model`` and ``model[word]``.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in ``model``; a zero vector
        of length ``num_features`` when no token is found.
    """
    # Pre-initialize the accumulator.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    # Membership is checked directly against the model rather than against a
    # freshly built set of the whole vocabulary, which used to be recreated
    # for every single review.
    for word in words:
        if word in model:
            nwords += 1
            featureVec = np.add(featureVec, model[word])

    # Divide by the number of known words to get the average; a review with
    # no known word keeps the zero vector.
    if nwords != 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors for a list of reviews.

    ``reviews`` is a sequence of token lists. Row ``i`` of the returned
    float32 array of shape ``(len(reviews), num_features)`` is
    ``makeFeatureVec(reviews[i], model, num_features)``.
    """
    total = len(reviews)
    print ("sub_clean_reviews len:", total)

    # Preallocate the whole result matrix.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, tokens in enumerate(reviews):
        # Status message every 20000 reviews.
        if position % 20000 == 0:
            print ("Review %d of %d" % (position, total))
        reviewFeatureVecs[position] = makeFeatureVec(tokens, model, num_features)

    return reviewFeatureVecs

In [22]:
def simple_voting(lista):
    """Return the most frequent value among the votes in ``lista``.

    Votes are counted with ``np.bincount``, so they must be non-negative
    integers; on a tie the smallest value wins.
    """
    vote_counts = np.bincount(lista)
    return np.argmax(vote_counts)

## Loading data into memory
In the following section, we load the training and test data

In [7]:
# Both splits are tab-separated files whose first column is the index.
df_train, df_test = (
    pd.read_csv(split_path, sep='\t', index_col=0)
    for split_path in ('./data/train', './data/test')
)

# Separate the raw text from the target label of each split.
Xtrain, Ytrain = df_train['text'], df_train['label']
Xtest, Ytest = df_test['text'], df_test['label']
print('datasets were loaded', len(df_train), len(df_test))

# Length of the averaged vectors and location of the binary W2V model.
num_features = 300
path_W2V = "W2V/sbw_vectors.bin"
model_W2V = load_W2V_model(path_W2V)

datasets were loaded 164926 54976
Loaded W2V model


## "Classical" algorithms
In this section, we create a dictionary that contains all the necessary data of the classification algorithms that are going to be used, as well as the parameters of each one of them. (In case the parameters are not indicated, those configured by default in scikit-learn are used).

In [8]:
# Maps each algorithm name to (estimator class, constructor keyword arguments).
# An empty dict means the estimator is built with its default parameters.
estimators = {
    "KNeighbors": (KNeighborsClassifier, dict()),
    "MultinomialNB": (MultinomialNB, dict()),
    "RandomForest": (RandomForestClassifier, dict(n_estimators=100)),
    "LogisticRegression": (LogisticRegression, dict()),
    "MLP": (MLPClassifier, dict(hidden_layer_sizes=100)),
    "SVM": (SVC, dict(cache_size=1000)),
    "LinearSVC": (LinearSVC, dict()),
}

## Generating data set formed by the mean of the vectors
<li> Clean the original text, both the train and the test
<li> Convert the text, once cleaned, to the final sets of test and train composed by the average of the vectors.

In [9]:
# Clean the train and test texts in two parallel jobs, one per split.
clean_reviews = Parallel(n_jobs=2)(
    delayed(makeclearlist)(split_text) for split_text in (Xtrain, Xtest)
)

Limpiando texto. shape: (164926,)
Limpiando texto. shape: (54976,)
Texto limpio. shape: (54976,)
Texto limpio. shape: (164926,)


In [13]:
# Turn each cleaned split into its matrix of averaged word vectors,
# again with one parallel job per split.
DataVecs = Parallel(n_jobs=2)(
    delayed(getAvgFeatureVecs)(tokenized_split, model_W2V, num_features)
    for tokenized_split in clean_reviews
)

sub_clean_reviews len: 164926
Review 0 of 164926
sub_clean_reviews len: 54976
Review 0 of 54976
Review 20000 of 164926
Review 20000 of 54976
Review 40000 of 164926
Review 40000 of 54976
Review 60000 of 164926
Review 80000 of 164926
Review 100000 of 164926
Review 120000 of 164926
Review 140000 of 164926
Review 160000 of 164926


In [16]:
# joblib.Parallel returns its results in the order of its inputs, and the
# cleaned reviews were built from [Xtrain, Xtest], so the first matrix is the
# training split and the second one the test split. Guessing the split from
# the number of rows would silently swap them if the test set were the larger.
Xtrain_Avg, Xtest_Avg = DataVecs
assert Xtrain_Avg.shape[0] == len(Ytrain), "train features/labels size mismatch"
assert Xtest_Avg.shape[0] == len(Ytest), "test features/labels size mismatch"
print(Xtrain_Avg.shape, Xtest_Avg.shape)

# MultinomialNB: Input must be non-negative
# To handle this problem, the data is rescaled to [0, 1] first.
# The scaler is fitted on the training split only, so that no statistic of the
# test split leaks into the features used for training.
Scaler = MinMaxScaler().fit(Xtrain_Avg)
Xtrain_Avg_sca = Scaler.transform(Xtrain_Avg)
# Test values can fall outside the range seen during training; clip them so
# the scaled test matrix stays non-negative as well.
Xtest_Avg_sca = np.clip(Scaler.transform(Xtest_Avg), 0.0, 1.0)

## Training of classification models
Next, we create as many processes as we have estimators. These processes will be in charge of training the different models in parallel.<br> 
As a result, we get a list of tuples formed by: (the name of the algorithm, the model already trained).

In [17]:
# One parallel job per estimator; each job returns (name, fitted model).
models = Parallel(n_jobs=len(estimators))(
    delayed(trainModel)(est_name, est_class, est_params, Xtrain_Avg, Xtrain_Avg_sca, Ytrain)
    for est_name, (est_class, est_params) in estimators.items()
)

training  MLP
training  MultinomialNB
-> done  MultinomialNB  - Time taken for training: 0.3875546455383301 seconds
training  KNeighbors
training  SVM
training  LogisticRegression
training  RandomForest
training  LinearSVC
-> done  KNeighbors  - Time taken for training: 10.216129779815674 seconds
-> done  LinearSVC  - Time taken for training: 8.482576131820679 seconds
-> done  LogisticRegression  - Time taken for training: 13.969268798828125 seconds
-> done  MLP  - Time taken for training: 143.64605951309204 seconds
-> done  SVM  - Time taken for training: 288.40018463134766 seconds
-> done  RandomForest  - Time taken for training: 328.87237668037415 seconds


Once the models are trained, we predict the labels of the test set with each of them and evaluate the results.

In [20]:
print("Obteniendo resultados:")
# Predictions of every model, kept as plain lists for the voting step.
results = []
for (name, model) in models:
    start = time.time() # Start time
    if name == "MultinomialNB":
        # This model is evaluated on the scaled matrix.
        result = model.predict(Xtest_Avg_sca)
    elif name == "KNeighbors":
        # Predict in chunks of 5000 rows and flatten the chunk predictions.
        result = [
            label
            for begin in range(0, Xtest_Avg.shape[0], 5000)
            for label in model.predict(Xtest_Avg[begin:begin + 5000, :])
        ]
    else:
        result = model.predict(Xtest_Avg)
    end = time.time()
    elapsed = end - start
    results.append(list(result))
    print(
        "---------- Modelo: ", name,
        " ---------- Time taken for prediction:", elapsed, "seconds\n",
        classification_report(Ytest, result, digits=4),
        "\n",
    )
# Drop the reference to the last prediction array.
result = None

Obteniendo resultados:
---------- Modelo:  MLP  ---------- Time taken for prediction: 2.2293083667755127 seconds
              precision    recall  f1-score   support

      False     0.9992    0.9994    0.9993     54473
       True     0.9366    0.9105    0.9234       503

avg / total     0.9986    0.9986    0.9986     54976
 

---------- Modelo:  MultinomialNB  ---------- Time taken for prediction: 0.07517886161804199 seconds
              precision    recall  f1-score   support

      False     0.9909    1.0000    0.9954     54473
       True     0.0000    0.0000    0.0000       503

avg / total     0.9818    0.9909    0.9863     54976
 



  'precision', 'predicted', average, warn_for)


---------- Modelo:  KNeighbors  ---------- Time taken for prediction: 3050.907576084137 seconds
              precision    recall  f1-score   support

      False     0.9991    0.9997    0.9994     54473
       True     0.9598    0.9026    0.9303       503

avg / total     0.9987    0.9988    0.9987     54976
 

---------- Modelo:  SVM  ---------- Time taken for prediction: 57.52249622344971 seconds
              precision    recall  f1-score   support

      False     0.9909    1.0000    0.9954     54473
       True     0.0000    0.0000    0.0000       503

avg / total     0.9818    0.9909    0.9863     54976
 

---------- Modelo:  LogisticRegression  ---------- Time taken for prediction: 0.052895545959472656 seconds
              precision    recall  f1-score   support

      False     0.9979    0.9998    0.9989     54473
       True     0.9678    0.7773    0.8622       503

avg / total     0.9977    0.9977    0.9976     54976
 

---------- Modelo:  RandomForest  ---------- Time take

In [23]:
start = time.time() # Start time
# For every test sample, collect the prediction of each model and keep the
# majority label.
result_voting = [
    simple_voting([model_preds[sample] for model_preds in results])
    for sample in range(len(results[0]))
]
end = time.time()
elapsed = end - start
print(
    "---------- Modelo: Voting ---------- Time taken for prediction:", elapsed, "seconds\n",
    classification_report(Ytest, result_voting, digits=4),
    "\n",
)

---------- Modelo: Voting ---------- Time taken for prediction: 0.2185978889465332 seconds
              precision    recall  f1-score   support

      False     0.9991    1.0000    0.9995     54473
       True     0.9978    0.8986    0.9456       503

avg / total     0.9991    0.9991    0.9990     54976
 

