<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Preliminares</div></H1>

In [1]:
import pandas as pd
import numpy as np
import csv
import re
import h2_funciones_auxiliares as faux

# Para preprocesar datos y realizar wordcloud.
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import re
from string import punctuation

# Para implementar modelos de machine learning.
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_curve
from sklearn import set_config
set_config(display = 'diagram')

# Para recodificación de atributos
from category_encoders import TargetEncoder, OrdinalEncoder, OneHotEncoder

from datetime import datetime

# Para visualización
import graphviz

# Para serialización
import pickle

# Para evitar mensajes de deprecación
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# FUNCIÓN PARA OBTENER CLASSIFICATION REPORT:
def report(y_train, y_pred_train, y_test, y_pred):
    print("Classification report - TRAIN SET")
    print(classification_report(y_train, y_pred_train))
    print("\nClassification report - TEST SET")
    print(classification_report(y_test, y_pred))

# FUNCIÓN PARA CLASSIFICATION REPORT - SÓLO TEST SET
def test_classification_report(nombre_modelo, y_test, y_hat):
    print("\nTEST SET - Classification report - {}".format(nombre_modelo.upper()) )
    print(classification_report(y_test, y_hat))

In [3]:
df = pd.read_csv('mod_ripley.csv').drop(columns = 'comentario').dropna()

In [4]:
df = df.drop(df[df['rating'] == 0].index)

In [5]:
df.sample(2)

Unnamed: 0,retail,categoria,producto,precio_original,precio_internet,precio_oferta,rating,comentario_sin_re
309,ripley,ropa,SWEATER MG BOUTIQUE FANTASIA ROMBOS,29990,14990,0,5,excelente precioso y calce perfecto recomendable
4578,ripley,hogar,ZAPATERO ORGANIZADOR RACK 30 PARES,16900,8990,0,1,malo malisimo ni hagan ni tal de comprarlo te...


In [6]:
df = df.loc[:,'rating':'comentario_sin_re']

In [7]:
df = df.drop(df[df['comentario_sin_re'] == np.nan].index)

In [8]:
df.shape

(4538, 2)

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>CountVectorizer</div></H1>

In [9]:
sw = set(stopwords.words('spanish') + list(punctuation)+ ['mas', 'si','ve','asi','menos','igual','ser','tal','aun',
                                                          'viene','ahora','puede','parte','todas'])

In [10]:
pr = CountVectorizer(analyzer = 'word',
                    stop_words = sw, 
                    min_df = 1,
                    ngram_range =(1,3),
                    lowercase = True)

In [11]:
cv_fit = pr.fit_transform(df['comentario_sin_re'])

In [12]:
#extraer palabras
wrds = pr.get_feature_names()
wrds_freq = cv_fit.toarray().sum(axis=0)
df_wrd = pd.DataFrame({'words':wrds, 'frecuencia':wrds_freq}).sort_values(by='frecuencia', ascending=False)
df_wrd[:20].T

Unnamed: 0,52504,9817,27249,8107,7714,6122,62923,15420,62205,56380,8144,38142,13413,24449,64133,7922,14890,51310,29740,8745
words,producto,calidad,excelente,buena,buen,bien,talla,compre,super,recomiendo,buena calidad,lindo,color,encanto,tela,buen producto,compra,precio,foto,bueno
frecuencia,885,712,709,629,517,498,494,447,352,341,306,295,287,283,267,252,247,227,224,224


<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Matriz de atributos y vector objetivo</div></H1>

In [13]:
# Matriz de atributos
X = df['comentario_sin_re'] #Sólo columna con los comentarios

# Vector objetivo
y = df['rating']

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>División de la muestra</div></H1>

In [14]:
# Proporción 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 123)

In [15]:
# Proporción 65/35
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size = 0.35, random_state = 123)

In [16]:
# Proporción 60/40
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y,  test_size = 0.3, random_state = 123)

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Definiciones pasos Pipeline</div></H1>

In [17]:
var_vec = 'comentario_sin_re'

In [18]:
vectorizada = Pipeline(steps = [("pr_vec", pr)])

In [19]:
prep = ColumnTransformer(transformers = [("vec",vectorizada, var_vec)])

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Modelos a implementar</div></H1>

In [20]:
# Declaración de modelos
modelo_svc = LinearSVC(random_state = 123)
modelo_mnb = MultinomialNB()
modelo_rf = RandomForestClassifier(random_state = 123)
modelo_dtc = DecisionTreeClassifier(random_state= 123)
modelo_abc = AdaBoostClassifier(random_state = 123)
modelo_gbc = GradientBoostingClassifier(random_state = 123)

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Entrenamiento 70/30</div></H1>

### 1. LinearSVC

In [21]:
pipe_1 = Pipeline(steps = [
    ("prep", pr),
    ("modelo_1", modelo_svc)
])

In [22]:
parametros_svc = {'modelo_1__C':[0.01,0.1,1,10, 100, 1000],
                  'modelo_1__max_iter': [1000,10000]}

In [23]:
%%time
search_svc = GridSearchCV(pipe_1, parametros_svc, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 1.15 s, sys: 98.8 ms, total: 1.25 s
Wall time: 1min 6s


In [24]:
search_svc.best_params_

{'modelo_1__C': 0.1, 'modelo_1__max_iter': 1000}

In [25]:
#predicción
y_pred_svc = search_svc.best_estimator_.predict(X_test)
y_pred_train_svc = search_svc.best_estimator_.predict(X_train)

In [26]:
report(y_train,y_pred_train_svc,y_test,y_pred_svc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.98      0.99       429
           2       1.00      0.98      0.99       125
           3       1.00      0.94      0.97       163
           4       1.00      0.90      0.95       298
           5       0.97      1.00      0.99      1707

    accuracy                           0.98      2722
   macro avg       0.99      0.96      0.98      2722
weighted avg       0.98      0.98      0.98      2722


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.80      0.71      0.75       305
           2       0.50      0.03      0.05        76
           3       0.50      0.08      0.14       109
           4       0.42      0.11      0.17       225
           5       0.74      0.98      0.84      1101

    accuracy                           0.73      1816
   macro avg       0.59      0.38      0.39      1816
weighted 

### 2. MultinomialNB

In [27]:
pipe_2 = Pipeline(steps=[
    ('prep', pr),
    ('modelo_2', modelo_mnb)])

In [28]:
parametros_mnb = {'modelo_2__alpha':(0.1,0.5,1.0),
                  'modelo_2__fit_prior':[True]}

In [29]:
%%time
search_mnb = GridSearchCV(pipe_2, parametros_mnb, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 361 ms, sys: 12.4 ms, total: 374 ms
Wall time: 1.77 s


In [30]:
search_mnb.best_params_

{'modelo_2__alpha': 0.5, 'modelo_2__fit_prior': True}

In [31]:
#predicción
y_pred_mnb = search_mnb.best_estimator_.predict(X_test)
y_pred_train_mnb = search_mnb.best_estimator_.predict(X_train)

In [32]:
report(y_train,y_pred_train_mnb,y_test,y_pred_mnb)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.99      0.98      0.99       429
           2       1.00      0.97      0.98       125
           3       0.99      0.91      0.95       163
           4       1.00      0.89      0.94       298
           5       0.97      1.00      0.98      1707

    accuracy                           0.98      2722
   macro avg       0.99      0.95      0.97      2722
weighted avg       0.98      0.98      0.98      2722


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.83      0.76      0.79       305
           2       0.33      0.03      0.05        76
           3       0.00      0.00      0.00       109
           4       0.42      0.06      0.11       225
           5       0.73      0.99      0.84      1101

    accuracy                           0.74      1816
   macro avg       0.46      0.37      0.36      1816
weighted 

### 3. RandomForest

In [33]:
pipe_3 = Pipeline(steps=[
    ('prep', pr),
    ('modelo_3', modelo_rf)])

In [34]:
parametros_rf ={"modelo_3__ccp_alpha" : [0.001, 0.01, 0.1, 0.3, 0.5],
                "modelo_3__max_depth" : [3, 5, 10],
                "modelo_3__n_estimators": [250, 300, 500]}

In [35]:
%%time
search_rf = GridSearchCV(pipe_3, parametros_rf, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 3.57 s, sys: 103 ms, total: 3.68 s
Wall time: 2min 9s


In [36]:
search_rf.best_params_

{'modelo_3__ccp_alpha': 0.001,
 'modelo_3__max_depth': 3,
 'modelo_3__n_estimators': 250}

In [37]:
#predicción
y_pred_rf = search_rf.best_estimator_.predict(X_test)
y_pred_train_rf = search_rf.best_estimator_.predict(X_train)

In [38]:
report(y_train,y_pred_train_rf,y_test,y_pred_rf)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       429
           2       0.00      0.00      0.00       125
           3       0.00      0.00      0.00       163
           4       0.00      0.00      0.00       298
           5       0.63      1.00      0.77      1707

    accuracy                           0.63      2722
   macro avg       0.13      0.20      0.15      2722
weighted avg       0.39      0.63      0.48      2722


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       305
           2       0.00      0.00      0.00        76
           3       0.00      0.00      0.00       109
           4       0.00      0.00      0.00       225
           5       0.61      1.00      0.75      1101

    accuracy                           0.61      1816
   macro avg       0.12      0.20      0.15      1816
weighted 

### 4. DecisionTreeClassifier

In [39]:
pipe_4 = Pipeline(steps=[
    ('prep', pr),
    ('modelo_4', modelo_dtc)])

In [40]:
parametros_dtc = {'modelo_4__max_depth':(30,45,50,55),
                  'modelo_4__min_samples_leaf': [1,75,100,125,150,175], 
                  'modelo_4__max_features':['sqrt','log2',None]}

In [41]:
%%time
search_dtc = GridSearchCV(pipe_4, parametros_dtc, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 4.86 s, sys: 130 ms, total: 4.99 s
Wall time: 43.1 s


In [42]:
search_dtc.best_params_

{'modelo_4__max_depth': 30,
 'modelo_4__max_features': None,
 'modelo_4__min_samples_leaf': 1}

In [43]:
#predicción
y_pred_dtc = search_dtc.best_estimator_.predict(X_test)
y_pred_train_dtc = search_dtc.best_estimator_.predict(X_train)

In [44]:
report(y_train,y_pred_train_dtc,y_test,y_pred_dtc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.97      0.74      0.84       429
           2       0.93      0.50      0.65       125
           3       1.00      0.42      0.59       163
           4       0.99      0.45      0.62       298
           5       0.80      1.00      0.89      1707

    accuracy                           0.84      2722
   macro avg       0.94      0.62      0.72      2722
weighted avg       0.87      0.84      0.82      2722


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.80      0.61      0.69       305
           2       0.24      0.11      0.15        76
           3       0.17      0.02      0.03       109
           4       0.34      0.10      0.15       225
           5       0.72      0.96      0.82      1101

    accuracy                           0.70      1816
   macro avg       0.45      0.36      0.37      1816
weighted 

### 5. AdaBoostClassifier

In [45]:
pipe_5 = Pipeline(steps=[
    ('prep', pr),
    ('modelo_5', modelo_abc)])

In [46]:
parametros_abc = {'modelo_5__base_estimator' : [DecisionTreeClassifier(max_depth=2)],
                  'modelo_5__n_estimators': [1, 50, 100, 500]}

In [47]:
%%time
search_abc = GridSearchCV(pipe_5, parametros_abc, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 1.56 s, sys: 15.7 ms, total: 1.57 s
Wall time: 44.1 s


In [48]:
search_abc.best_params_

{'modelo_5__base_estimator': DecisionTreeClassifier(max_depth=2),
 'modelo_5__n_estimators': 50}

In [49]:
#predicción
y_pred_abc = search_abc.best_estimator_.predict(X_test)
y_pred_train_abc = search_abc.best_estimator_.predict(X_train)

In [50]:
report(y_train,y_pred_train_abc,y_test,y_pred_abc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.79      0.58      0.67       429
           2       0.61      0.48      0.54       125
           3       0.55      0.10      0.17       163
           4       0.43      0.10      0.16       298
           5       0.75      0.98      0.85      1707

    accuracy                           0.74      2722
   macro avg       0.63      0.45      0.48      2722
weighted avg       0.70      0.74      0.69      2722


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.78      0.55      0.65       305
           2       0.17      0.12      0.14        76
           3       0.30      0.06      0.09       109
           4       0.29      0.04      0.07       225
           5       0.71      0.96      0.82      1101

    accuracy                           0.69      1816
   macro avg       0.45      0.35      0.35      1816
weighted 

### 6. GradientBoostingClassifier

In [51]:
pipe_6 = Pipeline(steps=[
    ('prep', pr),
    ('modelo_6', modelo_gbc)])

In [52]:
parametros_gbc = {"modelo_6__learning_rate": [0.01, 0.1, 0.5],
                  "modelo_6__n_estimators" : [50, 100, 500, 1000],
                  "modelo_6__subsample": [0.1,0.5,0.9]}

In [53]:
%%time
search_gbc = GridSearchCV(pipe_6, parametros_gbc, cv =3,n_jobs=-1).fit(X_train, y_train)

CPU times: user 2min 15s, sys: 418 ms, total: 2min 16s
Wall time: 49min 40s


In [54]:
search_gbc.best_params_

{'modelo_6__learning_rate': 0.01,
 'modelo_6__n_estimators': 1000,
 'modelo_6__subsample': 0.9}

In [55]:
#predicción
y_pred_gbc = search_gbc.best_estimator_.predict(X_test)
y_pred_train_gbc = search_gbc.best_estimator_.predict(X_train)

In [56]:
report(y_train,y_pred_train_gbc,y_test,y_pred_gbc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.93      0.70      0.80       429
           2       1.00      0.29      0.45       125
           3       0.95      0.23      0.37       163
           4       0.94      0.26      0.40       298
           5       0.76      1.00      0.86      1707

    accuracy                           0.79      2722
   macro avg       0.92      0.50      0.58      2722
weighted avg       0.83      0.79      0.75      2722


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.84      0.65      0.74       305
           2       0.45      0.07      0.11        76
           3       0.20      0.02      0.03       109
           4       0.39      0.06      0.11       225
           5       0.71      0.98      0.83      1101

    accuracy                           0.72      1816
   macro avg       0.52      0.36      0.36      1816
weighted 

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Entrenamiento 65/35</div></H1>

### 1. LinearSVC

In [57]:
%%time
search_svc_2 = GridSearchCV(pipe_1, parametros_svc, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 1.16 s, sys: 87.1 ms, total: 1.25 s
Wall time: 1min 13s


In [58]:
search_svc_2.best_params_

{'modelo_1__C': 0.1, 'modelo_1__max_iter': 1000}

In [59]:
#predicción
y_pred_svc_2 = search_svc_2.best_estimator_.predict(X_test_2)
y_pred_train_svc_2 = search_svc_2.best_estimator_.predict(X_train_2)

In [60]:
report(y_train_2,y_pred_train_svc_2,y_test_2,y_pred_svc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.98      0.99       471
           2       1.00      0.98      0.99       134
           3       1.00      0.94      0.97       172
           4       1.00      0.90      0.95       323
           5       0.97      1.00      0.99      1849

    accuracy                           0.98      2949
   macro avg       0.99      0.96      0.98      2949
weighted avg       0.98      0.98      0.98      2949


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.78      0.74      0.76       263
           2       0.29      0.03      0.05        67
           3       0.47      0.08      0.14       100
           4       0.42      0.10      0.16       200
           5       0.74      0.98      0.84       959

    accuracy                           0.73      1589
   macro avg       0.54      0.39      0.39      1589
weighted 

### 2. MultinomialNB

In [61]:
%%time
search_mnb_2 = GridSearchCV(pipe_2, parametros_mnb, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 391 ms, sys: 13.9 ms, total: 405 ms
Wall time: 2.08 s


In [62]:
search_mnb_2.best_params_

{'modelo_2__alpha': 0.5, 'modelo_2__fit_prior': True}

In [63]:
#predicción
y_pred_mnb_2 = search_mnb_2.best_estimator_.predict(X_test_2)
y_pred_train_mnb_2 = search_mnb_2.best_estimator_.predict(X_train_2)

In [64]:
report(y_train_2,y_pred_train_mnb_2,y_test_2,y_pred_mnb_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.99      0.98      0.99       471
           2       1.00      0.97      0.98       134
           3       0.99      0.91      0.95       172
           4       1.00      0.88      0.94       323
           5       0.97      1.00      0.98      1849

    accuracy                           0.98      2949
   macro avg       0.99      0.95      0.97      2949
weighted avg       0.98      0.98      0.98      2949


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.83      0.78      0.81       263
           2       0.33      0.03      0.05        67
           3       0.00      0.00      0.00       100
           4       0.43      0.05      0.09       200
           5       0.73      0.99      0.84       959

    accuracy                           0.74      1589
   macro avg       0.47      0.37      0.36      1589
weighted 

### 3. RandomForest

In [65]:
%%time
search_rf_2 = GridSearchCV(pipe_3, parametros_rf, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 3.9 s, sys: 108 ms, total: 4 s
Wall time: 2min 19s


In [66]:
search_rf_2.best_params_

{'modelo_3__ccp_alpha': 0.001,
 'modelo_3__max_depth': 10,
 'modelo_3__n_estimators': 250}

In [67]:
#predicción
y_pred_rf_2 = search_rf_2.best_estimator_.predict(X_test_2)
y_pred_train_rf_2 = search_rf_2.best_estimator_.predict(X_train_2)

In [68]:
report(y_train_2,y_pred_train_rf_2,y_test_2,y_pred_rf_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.00      0.00       471
           2       0.00      0.00      0.00       134
           3       0.00      0.00      0.00       172
           4       0.00      0.00      0.00       323
           5       0.63      1.00      0.77      1849

    accuracy                           0.63      2949
   macro avg       0.33      0.20      0.16      2949
weighted avg       0.55      0.63      0.48      2949


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       263
           2       0.00      0.00      0.00        67
           3       0.00      0.00      0.00       100
           4       0.00      0.00      0.00       200
           5       0.60      1.00      0.75       959

    accuracy                           0.60      1589
   macro avg       0.12      0.20      0.15      1589
weighted 

### 4. DecisionTreeClassifier

In [69]:
%%time
search_dtc_2 = GridSearchCV(pipe_4, parametros_dtc, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 5.12 s, sys: 145 ms, total: 5.27 s
Wall time: 44.5 s


In [70]:
search_dtc_2.best_params_

{'modelo_4__max_depth': 30,
 'modelo_4__max_features': None,
 'modelo_4__min_samples_leaf': 1}

In [71]:
#predicción
y_pred_dtc_2 = search_dtc_2.best_estimator_.predict(X_test_2)
y_pred_train_dtc_2 = search_dtc_2.best_estimator_.predict(X_train_2)

In [72]:
report(y_train_2,y_pred_train_dtc_2,y_test_2,y_pred_dtc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.95      0.76      0.84       471
           2       1.00      0.41      0.58       134
           3       1.00      0.43      0.60       172
           4       0.90      0.46      0.61       323
           5       0.80      0.99      0.89      1849

    accuracy                           0.84      2949
   macro avg       0.93      0.61      0.70      2949
weighted avg       0.86      0.84      0.82      2949


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.76      0.68      0.72       263
           2       0.27      0.04      0.08        67
           3       0.24      0.05      0.08       100
           4       0.28      0.07      0.12       200
           5       0.72      0.96      0.82       959

    accuracy                           0.70      1589
   macro avg       0.46      0.36      0.36      1589
weighted 

### 5. AdaBoostClassifier

In [73]:
%%time
search_abc_2 = GridSearchCV(pipe_5, parametros_abc, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 2.17 s, sys: 45.6 ms, total: 2.22 s
Wall time: 1min 6s


In [74]:
search_abc_2.best_params_

{'modelo_5__base_estimator': DecisionTreeClassifier(max_depth=2),
 'modelo_5__n_estimators': 50}

In [75]:
#predicción
y_pred_abc_2 = search_abc_2.best_estimator_.predict(X_test_2)
y_pred_train_abc_2 = search_abc_2.best_estimator_.predict(X_train_2)

In [76]:
report(y_train_2,y_pred_train_abc_2,y_test_2,y_pred_abc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.81      0.59      0.69       471
           2       0.66      0.46      0.54       134
           3       0.58      0.18      0.28       172
           4       0.43      0.09      0.15       323
           5       0.75      0.97      0.85      1849

    accuracy                           0.75      2949
   macro avg       0.65      0.46      0.50      2949
weighted avg       0.71      0.75      0.70      2949


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.78      0.58      0.67       263
           2       0.14      0.07      0.10        67
           3       0.19      0.06      0.09       100
           4       0.38      0.04      0.08       200
           5       0.71      0.96      0.81       959

    accuracy                           0.69      1589
   macro avg       0.44      0.34      0.35      1589
weighted 

### 6. GradientBoostingClassifier

In [77]:
%%time
search_gbc_2 = GridSearchCV(pipe_6, parametros_gbc, cv =3,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 2min 27s, sys: 350 ms, total: 2min 27s
Wall time: 32min 54s


In [78]:
search_gbc_2.best_params_

{'modelo_6__learning_rate': 0.01,
 'modelo_6__n_estimators': 1000,
 'modelo_6__subsample': 0.9}

In [79]:
#predicción
y_pred_gbc_2 = search_gbc_2.best_estimator_.predict(X_test_2)
y_pred_train_gbc_2 = search_gbc_2.best_estimator_.predict(X_train_2)

In [80]:
report(y_train_2,y_pred_train_gbc_2,y_test_2,y_pred_gbc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.92      0.70      0.79       471
           2       1.00      0.26      0.41       134
           3       0.95      0.23      0.37       172
           4       0.93      0.26      0.41       323
           5       0.76      1.00      0.86      1849

    accuracy                           0.79      2949
   macro avg       0.91      0.49      0.57      2949
weighted avg       0.83      0.79      0.75      2949


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.84      0.66      0.74       263
           2       0.31      0.06      0.10        67
           3       0.30      0.03      0.05       100
           4       0.35      0.04      0.07       200
           5       0.70      0.98      0.82       959

    accuracy                           0.71      1589
   macro avg       0.50      0.35      0.36      1589
weighted 

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Entrenamiento 60/40</div></H1>

### 1. LinearSVC

In [81]:
%%time
search_svc_3 = GridSearchCV(pipe_1, parametros_svc, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 1.34 s, sys: 104 ms, total: 1.44 s
Wall time: 1min 24s


In [82]:
search_svc_3.best_params_

{'modelo_1__C': 0.1, 'modelo_1__max_iter': 1000}

In [83]:
#predicción
y_pred_svc_3 = search_svc_3.best_estimator_.predict(X_test_3)
y_pred_train_svc_3 = search_svc_3.best_estimator_.predict(X_train_3)

In [84]:
report(y_train_3,y_pred_train_svc_3,y_test_3,y_pred_svc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.98      0.99       507
           2       1.00      0.97      0.99       143
           3       1.00      0.95      0.98       189
           4       1.00      0.90      0.95       352
           5       0.97      1.00      0.99      1985

    accuracy                           0.98      3176
   macro avg       0.99      0.96      0.98      3176
weighted avg       0.98      0.98      0.98      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.77      0.76      0.76       227
           2       0.40      0.03      0.06        58
           3       0.41      0.08      0.14        83
           4       0.38      0.11      0.17       171
           5       0.75      0.97      0.85       823

    accuracy                           0.73      1362
   macro avg       0.54      0.39      0.40      1362
weighted 

### 2. MultinomialNB

In [85]:
%%time
search_mnb_3 = GridSearchCV(pipe_2, parametros_mnb, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 406 ms, sys: 15.4 ms, total: 421 ms
Wall time: 2.27 s


In [86]:
search_mnb_3.best_params_

{'modelo_2__alpha': 0.5, 'modelo_2__fit_prior': True}

In [87]:
#predicción
y_pred_mnb_3 = search_mnb_3.best_estimator_.predict(X_test_3)
y_pred_train_mnb_3 = search_mnb_3.best_estimator_.predict(X_train_3)

In [88]:
report(y_train_3,y_pred_train_mnb_3,y_test_3,y_pred_mnb_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.99      0.98      0.99       507
           2       1.00      0.96      0.98       143
           3       0.99      0.92      0.96       189
           4       1.00      0.88      0.94       352
           5       0.97      1.00      0.98      1985

    accuracy                           0.98      3176
   macro avg       0.99      0.95      0.97      3176
weighted avg       0.98      0.98      0.98      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.83      0.78      0.80       227
           2       0.50      0.03      0.06        58
           3       0.00      0.00      0.00        83
           4       0.50      0.06      0.10       171
           5       0.73      1.00      0.84       823

    accuracy                           0.74      1362
   macro avg       0.51      0.37      0.36      1362
weighted 

### 3. RandomForest

In [89]:
%%time
search_rf_3 = GridSearchCV(pipe_3, parametros_rf, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 3.78 s, sys: 101 ms, total: 3.88 s
Wall time: 2min 12s


In [90]:
search_rf_3.best_params_

{'modelo_3__ccp_alpha': 0.001,
 'modelo_3__max_depth': 10,
 'modelo_3__n_estimators': 250}

In [91]:
#predicción
y_pred_rf_3 = search_rf_3.best_estimator_.predict(X_test_3)
y_pred_train_rf_3 = search_rf_3.best_estimator_.predict(X_train_3)

In [92]:
report(y_train_3,y_pred_train_rf_3,y_test_3,y_pred_rf_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.00      0.00       507
           2       0.00      0.00      0.00       143
           3       0.00      0.00      0.00       189
           4       0.00      0.00      0.00       352
           5       0.63      1.00      0.77      1985

    accuracy                           0.63      3176
   macro avg       0.33      0.20      0.15      3176
weighted avg       0.55      0.63      0.48      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       227
           2       0.00      0.00      0.00        58
           3       0.00      0.00      0.00        83
           4       0.00      0.00      0.00       171
           5       0.60      1.00      0.75       823

    accuracy                           0.60      1362
   macro avg       0.12      0.20      0.15      1362
weighted 

### 4. DecisionTreeClassifier

In [93]:
%%time
search_dtc_3 = GridSearchCV(pipe_4, parametros_dtc, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 5.86 s, sys: 162 ms, total: 6.03 s
Wall time: 59.3 s


In [94]:
search_dtc_3.best_params_

{'modelo_4__max_depth': 30,
 'modelo_4__max_features': None,
 'modelo_4__min_samples_leaf': 1}

In [95]:
#predicción
y_pred_dtc_3 = search_dtc_3.best_estimator_.predict(X_test_3)
y_pred_train_dtc_3 = search_dtc_3.best_estimator_.predict(X_train_3)

In [96]:
report(y_train_3,y_pred_train_dtc_3,y_test_3,y_pred_dtc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.98      0.74      0.85       507
           2       1.00      0.45      0.62       143
           3       1.00      0.37      0.54       189
           4       0.99      0.39      0.56       352
           5       0.79      1.00      0.88      1985

    accuracy                           0.83      3176
   macro avg       0.95      0.59      0.69      3176
weighted avg       0.86      0.83      0.81      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.81      0.68      0.74       227
           2       0.25      0.09      0.13        58
           3       0.29      0.07      0.12        83
           4       0.41      0.11      0.17       171
           5       0.72      0.95      0.82       823

    accuracy                           0.71      1362
   macro avg       0.50      0.38      0.39      1362
weighted 

### 5. AdaBoostClassifier

In [97]:
%%time
search_abc_3 = GridSearchCV(pipe_5, parametros_abc, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 1.79 s, sys: 20.7 ms, total: 1.81 s
Wall time: 59.8 s


In [98]:
search_abc_3.best_params_

{'modelo_5__base_estimator': DecisionTreeClassifier(max_depth=2),
 'modelo_5__n_estimators': 50}

In [99]:
#predicción
y_pred_abc_3 = search_abc_3.best_estimator_.predict(X_test_3)
y_pred_train_abc_3 = search_abc_3.best_estimator_.predict(X_train_3)

In [100]:
report(y_train_3,y_pred_train_abc_3,y_test_3,y_pred_abc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.82      0.64      0.72       507
           2       0.80      0.46      0.59       143
           3       0.48      0.15      0.23       189
           4       0.43      0.07      0.13       352
           5       0.75      0.98      0.85      1985

    accuracy                           0.75      3176
   macro avg       0.66      0.46      0.50      3176
weighted avg       0.71      0.75      0.70      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.81      0.63      0.71       227
           2       0.22      0.10      0.14        58
           3       0.21      0.06      0.09        83
           4       0.35      0.04      0.07       171
           5       0.71      0.96      0.82       823

    accuracy                           0.70      1362
   macro avg       0.46      0.36      0.37      1362
weighted 

### 6. GradientBoostingClassifier

In [105]:
%%time
search_gbc_3 = GridSearchCV(pipe_6, parametros_gbc, cv =3,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 2min 58s, sys: 3.34 s, total: 3min 1s
Wall time: 50min 2s


In [106]:
search_gbc_3.best_params_

{'modelo_6__learning_rate': 0.01,
 'modelo_6__n_estimators': 1000,
 'modelo_6__subsample': 0.5}

In [107]:
#predicción
y_pred_gbc_3 = search_gbc_3.best_estimator_.predict(X_test_3)
y_pred_train_gbc_3 = search_gbc_3.best_estimator_.predict(X_train_3)

In [108]:
report(y_train_3,y_pred_train_gbc_3,y_test_3,y_pred_gbc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.91      0.68      0.78       507
           2       0.91      0.36      0.52       143
           3       0.93      0.30      0.45       189
           4       0.85      0.23      0.37       352
           5       0.77      1.00      0.87      1985

    accuracy                           0.79      3176
   macro avg       0.88      0.51      0.60      3176
weighted avg       0.82      0.79      0.76      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.84      0.68      0.75       227
           2       0.25      0.05      0.09        58
           3       0.30      0.04      0.06        83
           4       0.34      0.06      0.10       171
           5       0.72      0.98      0.83       823

    accuracy                           0.72      1362
   macro avg       0.49      0.36      0.37      1362
weighted 

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Curva ROC</div></H1>

In [None]:
def plot_roc_curve(model, xtest_std, ytest):
    """
    plot_roc_curve - Plots ROC curve of given model.
    
    @parameters:
        - model: a `statsmodels.formula.api` class generated method, which must be already fitted.
        - xtest_std: numpy.ndarray. A standarized "x" sample, different from the one used in the model. Object.
        - ytest: pandas.core.series.Series. The sample to compare with the generated predicted values. Object.
        
    @returns:
        - A `matplotlib` object.
    """
    yhat = model.predict_proba(xtest_std)[:, 1]
    false_positive, true_positive, threshold = roc_curve(ytest, yhat)
    # Plot ROC curve
    plt.title('Curva ROC')
    plt.plot(false_positive, true_positive, lw=1)
    plt.plot([0, 1], ls="--", lw=1)
    plt.plot([0, 0], [1, 0] , c='limegreen', lw=3), plt.plot([1, 1] , c='limegreen', lw=3)
    plt.ylabel('Verdaderos Positivos')
    plt.xlabel('Falsos Positivos');