In [1]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

from sklearn.externals.joblib import Parallel, delayed
from sklearn.utils import shuffle

import time
import numpy as np
import scipy.sparse as sparse

import pandas as pd
import re
from io import StringIO

In [2]:
def trainModel(name, name_estimator, clazz, params, dataset, path):
    """Fit one estimator on one training set and report the fitting time.

    Parameters
    ----------
    name : str
        Display name of the training set; forwarded to ``loadTrainDataSet``.
    name_estimator : str
        Label of the estimator; echoed in the timing message and returned.
    clazz : callable
        Estimator class, instantiated as ``clazz(**params)``.
    params : dict
        Keyword arguments passed to ``clazz``.
    dataset : str
        File name of the training set; forwarded to ``loadTrainDataSet``.
    path : str
        Location prefix of the training set; forwarded to ``loadTrainDataSet``.

    Returns
    -------
    tuple
        ``(name_estimator, model)`` where ``model`` is the object on which
        ``fit`` was called.
    """
    features, labels = loadTrainDataSet(name, dataset, path)
    estimator = clazz(**params)

    # Only the fit() call is timed; loading and construction are excluded.
    t0 = time.time()
    estimator.fit(features, labels)
    fit_seconds = time.time() - t0

    print("\t", name_estimator, " - Time taken for training:", fit_seconds, "seconds")
    return name_estimator, estimator

def loadTrainDataSet(name, dataset, path):
    """Load a training set stored as a SciPy sparse ``.npz`` file.

    The last column of the stored matrix is used as the label vector and all
    preceding columns as the feature matrix.

    Parameters
    ----------
    name : str
        Display name; used only in the progress message.
    dataset : str
        File name, appended to ``path`` by plain string concatenation.
    path : str
        Prefix for ``dataset``. Because the two are concatenated as-is, a
        directory must already end with a path separator.

    Returns
    -------
    tuple
        ``(Xtrain, Ytrain)``: the sparse feature columns and a 1-D dense
        array holding the label column.
    """
    train = sparse.load_npz(path+dataset)
    Xtrain = train[:, :-1]
    # Slice and densify the label column once. toarray() is the explicit
    # spelling of the legacy ``.A`` shorthand, and ravel() flattens the
    # resulting (n_rows, 1) block to shape (n_rows,).
    Ytrain = train[:, -1].toarray().ravel()
    print("Train dataset loaded: "+name)
    return Xtrain, Ytrain

def simple_voting(lista):
    """Return the most frequent class label in ``lista`` (majority vote).

    Labels must be non-negative integers, or floats holding integer values
    such as ``0.0`` / ``1.0``. Ties are resolved in favour of the smallest
    label, because ``np.argmax`` returns the first maximum.

    Parameters
    ----------
    lista : sequence
        One predicted label per voter.

    Returns
    -------
    numpy integer
        The winning label.

    Raises
    ------
    TypeError
        If ``lista`` holds floats that are not integer-valued.
    ValueError
        If ``lista`` is empty or contains negative labels (raised by NumPy).
    """
    votes = np.asarray(lista)
    if votes.dtype.kind == "f":
        # np.bincount refuses float input even when every value is integral,
        # so integer-valued float labels are converted explicitly.
        integral = np.isfinite(votes).all() and np.array_equal(votes, np.floor(votes))
        if not integral:
            raise TypeError("simple_voting expects integer-valued class labels")
        votes = votes.astype(np.intp)
    return np.argmax(np.bincount(votes))

def report_to_df(dataset, model, report):
    """Convert the text of a ``classification_report`` into a per-class frame.

    Only the per-class rows are kept. Summary rows are dropped, both the
    older single ``avg / total`` line and the newer ``accuracy`` /
    ``micro avg`` / ``macro avg`` / ``weighted avg`` / ``samples avg`` lines.
    Class labels that contain spaces are supported.

    Parameters
    ----------
    dataset : str
        Name stored in the ``DataSet`` index of every returned row.
    model : str
        Name stored in the ``Model`` column of every returned row.
    report : str
        Text produced by ``sklearn.metrics.classification_report``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``DataSet`` with columns ``Model``, ``Classes``,
        ``precision``, ``recall``, ``f1-score`` and ``support``.
    """
    # Rows that aggregate over classes rather than describe a single class.
    summary_labels = {"avg / total", "micro avg", "macro avg", "weighted avg", "samples avg"}

    rows = []
    for line in report.splitlines():
        tokens = line.split()
        # A class row is "<label> precision recall f1-score support". The
        # header (4 tokens), blank lines and the "accuracy" row (3 tokens)
        # are all shorter and are skipped here.
        if len(tokens) < 5:
            continue
        label = " ".join(tokens[:-4])
        if label in summary_labels:
            continue
        rows.append("\t".join([label] + tokens[-4:]))

    header = "\t".join(["Classes", "precision", "recall", "f1-score", "support"])
    # Going through read_csv keeps pandas' dtype inference (numeric labels
    # become numbers, support becomes an integer column).
    report_df = pd.read_csv(StringIO("\n".join([header] + rows)), sep="\t")
    report_df["DataSet"] = dataset
    report_df["Model"] = model
    report_df = report_df.set_index("DataSet")
    return report_df[['Model', 'Classes', 'precision', 'recall', 'f1-score', 'support']]

In [3]:
# Candidate classifiers: display name -> (estimator class, constructor kwargs).
# trainModel instantiates each entry as clazz(**params).
estimators = dict([
    ("KNeighbors", (KNeighborsClassifier, {})),
    ("MultinomialNB", (MultinomialNB, {})),
    ("RandomForest", (RandomForestClassifier, {"n_estimators": 100})),
    ("LogisticRegression", (LogisticRegression, {})),
    # Alternative MLP setting kept for reference:
    #   {"hidden_layer_sizes":100, "max_iter":1500}
    ("MLP", (MLPClassifier, {"hidden_layer_sizes": 100})),
    ("SVM", (SVC, {"cache_size": 1000})),
    ("LinearSVC", (LinearSVC, {})),
])

# Training sets: display name -> .npz file name. The file name is appended to
# path_train when the training loop hands it to trainModel.
datasets = dict([
    ("Sin Balancear", "centroids_train.npz"),
    ("Kmeans", "data_temp.npz"),
    ("Kmeans+Adasyn", "train_1.npz"),
    ("Kmeans+Smote", "train_3.npz"),
    ("Kmeans+SmoteEnn", "train_4.npz"),
    ("Kmeans+SmoteTomek", "train_5.npz"),
    ("Kmeans+AdasynEnn", "train_6.npz"),
    ("Kmeans+AdasynTomek", "train_9.npz"),
])

In [4]:
# Directory holding the training sets, and the file holding the test set.
path_train = "../hufa_train_wiki_w2v/"
path_test = "../hufa_test_wiki_w2v/centroids_test.npz"

# The test matrix is split the same way as the training files: every column
# but the last is a feature, the last column is the label.
test = sparse.load_npz(path_test).astype(float)
Xtest = test[:, :-1]
# Slice and densify the label column once. toarray() is the explicit spelling
# of the legacy ``.A`` shorthand, and ravel() flattens (n_rows, 1) to (n_rows,).
Ytest = test[:, -1].toarray().ravel()
print("Test dataset loaded ")

Test dataset loaded 


In [5]:
# Train every estimator on every training set. The estimators for one
# training set are fitted in parallel, one job per estimator. Each job loads
# the training file itself (trainModel calls loadTrainDataSet), which is why
# the "Train dataset loaded" message repeats once per estimator.
runs = []
for name, dataset in datasets.items():
    jobs = (
        delayed(trainModel)(name, name_estimator, clazz, params, dataset, path_train)
        for name_estimator, (clazz, params) in estimators.items()
    )
    models = Parallel(n_jobs=len(estimators))(jobs)
    runs.append({"name": name, "models": models})

Train dataset loaded: Kmeans+AdasynTomek
Train dataset loaded: Kmeans+AdasynTomek
Train dataset loaded: Kmeans+AdasynTomek
Train dataset loaded: Kmeans+AdasynTomek
Train dataset loaded: Kmeans+AdasynTomek
Train dataset loaded: Kmeans+AdasynTomek
Train dataset loaded: Kmeans+AdasynTomek
	 MultinomialNB  - Time taken for training: 0.05394482612609863 seconds
	 KNeighbors  - Time taken for training: 0.03136897087097168 seconds
	 LinearSVC  - Time taken for training: 14.238010883331299 seconds
	 LogisticRegression  - Time taken for training: 15.202873945236206 seconds
	 RandomForest  - Time taken for training: 18.905710220336914 seconds
	 SVM  - Time taken for training: 339.05274748802185 seconds
	 MLP  - Time taken for training: 612.7561783790588 seconds
Train dataset loaded: Kmeans+SmoteTomek
Train dataset loaded: Kmeans+SmoteTomek
Train dataset loaded: Kmeans+SmoteTomek
Train dataset loaded: Kmeans+SmoteTomek
Train dataset loaded: Kmeans+SmoteTomek
Train dataset loaded: Kmeans+SmoteTome

In [312]:
print("Obteniendo resultados:")

# KNeighbors predicts the test set in slices of this many rows
# (presumably to bound memory use; confirm before changing).
KNN_PREDICT_BATCH = 5000

# One per-class metrics frame per (dataset, model); concatenated once at the
# end instead of growing a DataFrame inside the loop.
report_frames = []
for run in runs:
    print ("\n\nDATASET: ", run["name"], "\n\n")
    results = []
    for (name_estimator, model) in run["models"]:
        start = time.time()
        if name_estimator == "KNeighbors":
            result = [
                y
                for i in range(0, Xtest.shape[0], KNN_PREDICT_BATCH)
                for y in model.predict(Xtest[i:i+KNN_PREDICT_BATCH, :])
            ]
        else:
            result = model.predict(Xtest)
        elapsed = time.time() - start
        results.append(list(result))

        # Build the report once and reuse it for both the log and the table.
        report = classification_report(Ytest, result, digits=4)
        print("DataSet: ", run["name"]," ----- Modelo: ", name_estimator, " ----- Time taken for prediction:", elapsed,"seconds\n", report, "\n")
        report_frames.append(report_to_df(run["name"], name_estimator, report))

    # Majority vote across all models of this run, one sample at a time.
    # Only the vote itself is timed, not the report construction.
    start = time.time()
    result_voting = [simple_voting(votes) for votes in zip(*results)]
    elapsed = time.time() - start

    report = classification_report(Ytest, result_voting, digits=4)
    print("DataSet: ", run["name"]," ----- Modelo: Voting ----- Time taken for prediction:", elapsed,"seconds\n", report, "\n")
    # Store the ensemble under its own name. Previously the leaked loop
    # variable labelled this row with the last individual estimator.
    report_frames.append(report_to_df(run["name"], "Voting", report))

df_results = pd.concat(report_frames) if report_frames else pd.DataFrame()

Obteniendo resultados:


DATASET:  Kmeans+AdasynTomek 


DataSet:  Kmeans+AdasynTomek  ----- Modelo:  MultinomialNB  ----- Time taken for prediction: 0.021590709686279297 seconds
              precision    recall  f1-score   support

        0.0     0.9997    0.8274    0.9054     54473
        1.0     0.0494    0.9722    0.0941       503

avg / total     0.9910    0.8287    0.8980     54976
 

DataSet:  Kmeans+AdasynTomek  ----- Modelo:  RandomForest  ----- Time taken for prediction: 1.9457101821899414 seconds
              precision    recall  f1-score   support

        0.0     0.9992    1.0000    0.9996     54473
        1.0     0.9957    0.9125    0.9523       503

avg / total     0.9992    0.9992    0.9991     54976
 

DataSet:  Kmeans+AdasynTomek  ----- Modelo:  SVM  ----- Time taken for prediction: 609.8878281116486 seconds
              precision    recall  f1-score   support

        0.0     0.9994    0.9317    0.9643     54473
        1.0     0.1125    0.9384    0.2010       

DataSet:  Sin Balancear  ----- Modelo:  RandomForest  ----- Time taken for prediction: 2.4691336154937744 seconds
              precision    recall  f1-score   support

        0.0     0.9987    1.0000    0.9993     54473
        1.0     1.0000    0.8549    0.9218       503

avg / total     0.9987    0.9987    0.9986     54976
 

DataSet:  Sin Balancear  ----- Modelo:  SVM  ----- Time taken for prediction: 111.48017978668213 seconds
              precision    recall  f1-score   support

        0.0     0.9943    1.0000    0.9971     54473
        1.0     0.9948    0.3797    0.5496       503

avg / total     0.9943    0.9943    0.9930     54976
 

DataSet:  Sin Balancear  ----- Modelo:  LinearSVC  ----- Time taken for prediction: 0.013540506362915039 seconds
              precision    recall  f1-score   support

        0.0     0.9993    0.9994    0.9994     54473
        1.0     0.9391    0.9205    0.9297       503

avg / total     0.9987    0.9987    0.9987     54976
 

DataSet:  Sin 

DataSet:  Kmeans+SmoteEnn  ----- Modelo:  SVM  ----- Time taken for prediction: 188.9189908504486 seconds
              precision    recall  f1-score   support

        0.0     0.9991    0.9997    0.9994     54473
        1.0     0.9640    0.9046    0.9333       503

avg / total     0.9988    0.9988    0.9988     54976
 

DataSet:  Kmeans+SmoteEnn  ----- Modelo:  LinearSVC  ----- Time taken for prediction: 0.01013493537902832 seconds
              precision    recall  f1-score   support

        0.0     0.9991    0.9999    0.9995     54473
        1.0     0.9913    0.9066    0.9470       503

avg / total     0.9991    0.9991    0.9991     54976
 

DataSet:  Kmeans+SmoteEnn  ----- Modelo:  MLP  ----- Time taken for prediction: 0.37743425369262695 seconds
              precision    recall  f1-score   support

        0.0     0.9991    1.0000    0.9995     54473
        1.0     0.9956    0.9046    0.9479       503

avg / total     0.9991    0.9991    0.9991     54976
 

DataSet:  Kmeans+S

In [313]:
# Write the collected per-class metrics to CSV; floats are formatted with four decimals.
df_results.to_csv("../Resultados/BoC_Wiki.csv", float_format='%.4f')

In [329]:
# Re-predict the test set with a single trained model for closer inspection.
# runs[2] is the third run appended by the training loop; ['models'][1] is the
# second (name_estimator, model) pair of that run, so the trailing [1] is the model.
# NOTE(review): both positions follow the iteration order of `datasets` and
# `estimators`; selecting by name would make the choice explicit.
resultado = runs[2]['models'][1][1].predict(Xtest)

In [330]:
# Confusion matrix of `resultado` against the test labels
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(Ytest, resultado)

array([[54473,     0],
       [   43,   460]])

In [331]:
# Element-wise comparison of predictions and labels.
# NOTE: despite the name ("differences"), True marks agreement; the
# misclassified samples are the False entries.
diferencias = resultado == Ytest

In [342]:
# Row indices of the test samples whose prediction differs from the label.
np.where(np.logical_not(diferencias))[0]

array([54473, 54478, 54482, 54501, 54508, 54509, 54532, 54539, 54558,
       54562, 54580, 54585, 54617, 54632, 54637, 54639, 54641, 54643,
       54657, 54696, 54698, 54715, 54730, 54746, 54767, 54769, 54770,
       54774, 54791, 54814, 54827, 54830, 54838, 54841, 54844, 54848,
       54852, 54855, 54876, 54917, 54926, 54928, 54932])

In [335]:
mini_path_test = "../hufa_test_wiki_w2v/minitest_centroids_test.npz"

# NOTE: this rebinds test / Xtest / Ytest, replacing the full test set loaded
# earlier. Any cell executed after this one sees the mini test set, so the
# full-set cells must not be re-run without reloading the full test file first.
test = sparse.load_npz(mini_path_test).astype(float)
Xtest = test[:, :-1]
# Slice and densify the label column once. toarray() is the explicit spelling
# of the legacy ``.A`` shorthand, and ravel() flattens (n_rows, 1) to (n_rows,).
Ytest = test[:, -1].toarray().ravel()

In [337]:
# Predict with the model at runs[2]['models'][1] (second (name_estimator, model)
# pair of the third run) on whatever Xtest currently holds.
# NOTE(review): the positional selection depends on the iteration order of
# `datasets` and `estimators`; selecting by name would be more robust.
mini_resultado = runs[2]['models'][1][1].predict(Xtest)

In [339]:
# Confusion matrix of `mini_resultado` against the current Ytest
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(Ytest, mini_resultado)

array([[ 0,  0],
       [43,  0]])

In [341]:
# Unpack the binary confusion matrix into its four counts. The labels are
# pinned to [0, 1] so the matrix is always 2x2; without them it collapses to
# 1x1 when truth and predictions contain only one class, and the 4-way unpack fails.
tn, fp, fn, tp = confusion_matrix(Ytest, mini_resultado, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 43, 0)