<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Preliminares</div></H1>

In [1]:
import pandas as pd
import numpy as np
import csv
import re
import h2_funciones_auxiliares as faux

# Para preprocesar datos y realizar wordcloud.
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import re
from string import punctuation

# Para implementar modelos de machine learning.
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_curve
from sklearn import set_config
set_config(display = 'diagram')

# Para recodificación de atributos
from category_encoders import TargetEncoder, OrdinalEncoder, OneHotEncoder

from datetime import datetime

# Para visualización
import graphviz

# Para serialización
import pickle

# Para evitar mensajes de deprecación
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# FUNCIÓN PARA OBTENER CLASSIFICATION REPORT:
def report(y_train, y_pred_train, y_test, y_pred):
    print("Classification report - TRAIN SET")
    print(classification_report(y_train, y_pred_train))
    print("\nClassification report - TEST SET")
    print(classification_report(y_test, y_pred))

# FUNCIÓN PARA CLASSIFICATION REPORT - SÓLO TEST SET
def test_classification_report(nombre_modelo, y_test, y_hat):
    print("\nTEST SET - Classification report - {}".format(nombre_modelo.upper()) )
    print(classification_report(y_test, y_hat))

In [3]:
df = pd.read_csv('mod_ripley.csv').drop(columns = 'comentario')

In [4]:
df.head(3)

Unnamed: 0,retail,categoria,producto,precio_original,precio_internet,precio_oferta,rating,comentario_sin_re
0,ripley,belleza,60 SLIM PATCH,16990,9990,0,4,personalmente lo ocupe solo por hr y de noche ...
1,ripley,belleza,AUTOBRONCEADOR CLINIQUE FACE BRONZING GEL TINT...,51990,25990,0,5,es muy bueno para el rostro
2,ripley,belleza,,0,0,0,5,me encanta se absorbe rapido rico aroma de la ...


In [5]:
df = df.drop(df[df['rating'] == 0].index).dropna()

In [6]:
df.shape

(4538, 8)

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>CountVectorizer</div></H1>

In [7]:
sw = set(stopwords.words('spanish') + list(punctuation)+ ['mas', 'si','ve','asi','menos','igual','ser','tal','aun',
                                                          'viene','ahora','puede','parte','todas'])

In [8]:
pr = CountVectorizer(analyzer = 'word',
                    stop_words = sw, 
                    min_df = 1,
                    ngram_range =(1,3),
                    lowercase = True)

In [9]:
cv_fit = pr.fit_transform(df['comentario_sin_re'])

In [10]:
#extraer palabras
wrds = pr.get_feature_names()
wrds_freq = cv_fit.toarray().sum(axis=0)
df_wrd = pd.DataFrame({'words':wrds, 'frecuencia':wrds_freq}).sort_values(by='frecuencia', ascending=False)
df_wrd[:20].T

Unnamed: 0,52504,9817,27249,8107,7714,6122,62923,15420,62205,56380,8144,38142,13413,24449,64133,7922,14890,51310,29740,8745
words,producto,calidad,excelente,buena,buen,bien,talla,compre,super,recomiendo,buena calidad,lindo,color,encanto,tela,buen producto,compra,precio,foto,bueno
frecuencia,885,712,709,629,517,498,494,447,352,341,306,295,287,283,267,252,247,227,224,224


<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Matriz de atributos y vector objetivo</div></H1>

In [11]:
# Matriz de atributos
X = df.drop(columns = 'rating')

# Vector objetivo
y = df['rating']

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>División de la muestra</div></H1>

In [12]:
# División de la muestra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Definiciones pasos Pipeline</div></H1>

In [13]:
is_cat = ['categoria','producto']
var_vec = 'comentario_sin_re'

In [14]:
categoricas = Pipeline(steps = [
    ("ohe", OneHotEncoder(use_cat_names = True))
])

vectorizada = Pipeline(steps = [
    ("pr_vec", pr)
])

prep = ColumnTransformer(transformers = [
    ("cat", categoricas, is_cat),
    ("vec",vectorizada, var_vec)])

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Modelos a implementar</div></H1>

In [15]:
# Declaración de modelos
modelo_svc = LinearSVC(random_state = 123)
modelo_mnb = MultinomialNB()
modelo_rf = RandomForestClassifier(random_state = 123)
modelo_dtc = DecisionTreeClassifier(random_state= 123)
modelo_abc = AdaBoostClassifier(random_state = 123)
modelo_gbc = GradientBoostingClassifier(random_state = 123)

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Entrenamiento 70/30</div></H1>

### 1. LinearSVC

In [16]:
pipe_1 = Pipeline(steps = [
    ("prep", prep),
    ("modelo_1", modelo_svc)
])

In [17]:
parametros_svc = {'modelo_1__C':[0.01,0.1,1,10, 100, 1000],
                  'modelo_1__max_iter': [1000,10000]}

In [18]:
%%time
search_svc = GridSearchCV(pipe_1, parametros_svc, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 1.92 s, sys: 275 ms, total: 2.2 s
Wall time: 1min


In [19]:
search_svc.best_params_

{'modelo_1__C': 0.1, 'modelo_1__max_iter': 1000}

In [20]:
#predicción
y_pred_svc = search_svc.best_estimator_.predict(X_test)
y_pred_train_svc = search_svc.best_estimator_.predict(X_train)

In [21]:
report(y_train,y_pred_train_svc,y_test,y_pred_svc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       507
           2       1.00      0.99      1.00       143
           3       1.00      0.97      0.99       189
           4       1.00      0.92      0.96       352
           5       0.98      1.00      0.99      1985

    accuracy                           0.99      3176
   macro avg       1.00      0.97      0.98      3176
weighted avg       0.99      0.99      0.99      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.79      0.75      0.77       227
           2       0.43      0.05      0.09        58
           3       0.41      0.08      0.14        83
           4       0.47      0.14      0.22       171
           5       0.75      0.98      0.85       823

    accuracy                           0.74      1362
   macro avg       0.57      0.40      0.41      1362
weighted 

### 2. MultinomialNB

In [22]:
pipe_2 = Pipeline(steps=[
    ('prep', prep),
    ('modelo_2', modelo_mnb)])

In [23]:
parametros_mnb = {'modelo_2__alpha':(0.1,0.5,1.0),
                  'modelo_2__fit_prior':[True]}

In [24]:
%%time
search_mnb = GridSearchCV(pipe_2, parametros_mnb, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 901 ms, sys: 155 ms, total: 1.06 s
Wall time: 7.89 s


In [25]:
search_mnb.best_params_

{'modelo_2__alpha': 0.5, 'modelo_2__fit_prior': True}

In [26]:
#predicción
y_pred_mnb = search_mnb.best_estimator_.predict(X_test)
y_pred_train_mnb = search_mnb.best_estimator_.predict(X_train)

In [27]:
report(y_train,y_pred_train_mnb,y_test,y_pred_mnb)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.98      0.99       507
           2       1.00      0.96      0.98       143
           3       0.99      0.91      0.95       189
           4       1.00      0.88      0.94       352
           5       0.96      1.00      0.98      1985

    accuracy                           0.98      3176
   macro avg       0.99      0.95      0.97      3176
weighted avg       0.98      0.98      0.98      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.83      0.74      0.78       227
           2       0.50      0.03      0.06        58
           3       0.00      0.00      0.00        83
           4       0.62      0.06      0.11       171
           5       0.72      1.00      0.83       823

    accuracy                           0.73      1362
   macro avg       0.53      0.36      0.36      1362
weighted 

### 3. RandomForest

In [28]:
pipe_3 = Pipeline(steps=[
    ('prep', prep),
    ('modelo_3', modelo_rf)])

In [29]:
parametros_rf ={"modelo_3__ccp_alpha" : [0.001, 0.01, 0.1, 0.3, 0.5],
                "modelo_3__max_depth" : [3, 5, 10],
                "modelo_3__n_estimators": [250, 300, 500]}

In [30]:
%%time
search_rf = GridSearchCV(pipe_3, parametros_rf, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 5.74 s, sys: 330 ms, total: 6.07 s
Wall time: 4min 13s


In [31]:
search_rf.best_params_

{'modelo_3__ccp_alpha': 0.001,
 'modelo_3__max_depth': 10,
 'modelo_3__n_estimators': 250}

In [32]:
#predicción
y_pred_rf = search_rf.best_estimator_.predict(X_test)
y_pred_train_rf = search_rf.best_estimator_.predict(X_train)

In [33]:
report(y_train,y_pred_train_rf,y_test,y_pred_rf)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.00      0.00       507
           2       0.00      0.00      0.00       143
           3       0.00      0.00      0.00       189
           4       0.00      0.00      0.00       352
           5       0.63      1.00      0.77      1985

    accuracy                           0.63      3176
   macro avg       0.33      0.20      0.15      3176
weighted avg       0.55      0.63      0.48      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       227
           2       0.00      0.00      0.00        58
           3       0.00      0.00      0.00        83
           4       0.00      0.00      0.00       171
           5       0.60      1.00      0.75       823

    accuracy                           0.60      1362
   macro avg       0.12      0.20      0.15      1362
weighted 

### 4. DecisionTreeClassifier

In [34]:
pipe_4 = Pipeline(steps=[
    ('prep', prep),
    ('modelo_4', modelo_dtc)])

In [35]:
parametros_dtc = {'modelo_4__max_depth':(30,45,50,55),
                  'modelo_4__min_samples_leaf': [1,75,100,125,150,175], 
                  'modelo_4__max_features':['sqrt','log2',None]}

In [36]:
%%time
search_dtc = GridSearchCV(pipe_4, parametros_dtc, cv =5,n_jobs=-1).fit(X_train, y_train)

CPU times: user 7.81 s, sys: 399 ms, total: 8.21 s
Wall time: 2min 45s


In [37]:
search_dtc.best_params_

{'modelo_4__max_depth': 30,
 'modelo_4__max_features': None,
 'modelo_4__min_samples_leaf': 1}

In [38]:
#predicción
y_pred_dtc = search_dtc.best_estimator_.predict(X_test)
y_pred_train_dtc = search_dtc.best_estimator_.predict(X_train)

In [39]:
report(y_train,y_pred_train_dtc,y_test,y_pred_dtc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.98      0.75      0.85       507
           2       1.00      0.52      0.68       143
           3       0.99      0.44      0.61       189
           4       0.99      0.47      0.64       352
           5       0.81      1.00      0.89      1985

    accuracy                           0.85      3176
   macro avg       0.95      0.63      0.73      3176
weighted avg       0.87      0.85      0.83      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.79      0.69      0.74       227
           2       0.45      0.09      0.14        58
           3       0.06      0.01      0.02        83
           4       0.34      0.09      0.15       171
           5       0.71      0.95      0.81       823

    accuracy                           0.70      1362
   macro avg       0.47      0.37      0.37      1362
weighted 

### 5. AdaBoostClassifier

In [40]:
pipe_5 = Pipeline(steps=[
    ('prep', prep),
    ('modelo_5', modelo_abc)])

In [41]:
parametros_abc = {'modelo_5__base_estimator' : [DecisionTreeClassifier(max_depth=2)],
                  'modelo_5__n_estimators': [1, 50, 100, 500]}

In [42]:
%%time
search_abc = GridSearchCV(pipe_5, parametros_abc, cv =3,n_jobs=-1).fit(X_train, y_train)

CPU times: user 2.61 s, sys: 159 ms, total: 2.77 s
Wall time: 39.8 s


In [43]:
search_abc.best_params_

{'modelo_5__base_estimator': DecisionTreeClassifier(max_depth=2),
 'modelo_5__n_estimators': 50}

In [44]:
#predicción
y_pred_abc = search_abc.best_estimator_.predict(X_test)
y_pred_train_abc = search_abc.best_estimator_.predict(X_train)

In [45]:
report(y_train,y_pred_train_abc,y_test,y_pred_abc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.84      0.63      0.72       507
           2       0.63      0.43      0.51       143
           3       0.42      0.18      0.25       189
           4       0.41      0.08      0.13       352
           5       0.76      0.97      0.85      1985

    accuracy                           0.75      3176
   macro avg       0.61      0.46      0.49      3176
weighted avg       0.70      0.75      0.70      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.76      0.59      0.66       227
           2       0.28      0.12      0.17        58
           3       0.35      0.08      0.14        83
           4       0.33      0.06      0.10       171
           5       0.71      0.96      0.82       823

    accuracy                           0.70      1362
   macro avg       0.49      0.36      0.38      1362
weighted 

### 6. GradientBoostingClassifier

In [46]:
pipe_6 = Pipeline(steps=[
    ('prep', prep),
    ('modelo_6', modelo_gbc)])

In [47]:
parametros_gbc = {"modelo_6__learning_rate": [0.01, 0.1, 0.5],
                  "modelo_6__n_estimators" : [50, 100, 500, 1000],
                  "modelo_6__subsample": [0.1,0.5,0.9]}

In [48]:
%%time
search_gbc = GridSearchCV(pipe_6, parametros_gbc, cv =3,n_jobs=-1).fit(X_train, y_train)

CPU times: user 2min 14s, sys: 504 ms, total: 2min 15s
Wall time: 41min 38s


In [49]:
search_gbc.best_params_

{'modelo_6__learning_rate': 0.01,
 'modelo_6__n_estimators': 1000,
 'modelo_6__subsample': 0.5}

In [50]:
#predicción
y_pred_gbc = search_gbc.best_estimator_.predict(X_test)
y_pred_train_gbc = search_gbc.best_estimator_.predict(X_train)

In [51]:
report(y_train,y_pred_train_gbc,y_test,y_pred_gbc)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.92      0.68      0.78       507
           2       0.94      0.36      0.52       143
           3       0.90      0.30      0.45       189
           4       0.84      0.22      0.35       352
           5       0.76      1.00      0.87      1985

    accuracy                           0.79      3176
   macro avg       0.88      0.51      0.59      3176
weighted avg       0.81      0.79      0.76      3176


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.85      0.66      0.74       227
           2       0.27      0.05      0.09        58
           3       0.45      0.06      0.11        83
           4       0.40      0.06      0.10       171
           5       0.71      0.98      0.82       823

    accuracy                           0.72      1362
   macro avg       0.54      0.36      0.37      1362
weighted 

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Entrenamiento 65/35</div></H1>

In [52]:
# Proporción 65/35
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.35, random_state = 123)

### 1. LinearSVC

In [53]:
%%time
search_svc_2 = GridSearchCV(pipe_1, parametros_svc, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 1.32 s, sys: 86.3 ms, total: 1.41 s
Wall time: 14 s


In [54]:
search_svc_2.best_params_

{'modelo_1__C': 0.1, 'modelo_1__max_iter': 1000}

In [55]:
#predicción
y_pred_svc_2 = search_svc_2.best_estimator_.predict(X_test_2)
y_pred_train_svc_2 = search_svc_2.best_estimator_.predict(X_train_2)

In [56]:
report(y_train_2,y_pred_train_svc_2,y_test_2,y_pred_svc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       339
           2       1.00      0.99      0.99       100
           3       1.00      0.98      0.99       121
           4       1.00      0.94      0.97       224
           5       0.99      1.00      0.99      1280

    accuracy                           0.99      2064
   macro avg       1.00      0.98      0.99      2064
weighted avg       0.99      0.99      0.99      2064


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.75      0.71      0.73       168
           2       0.00      0.00      0.00        43
           3       0.67      0.09      0.16        68
           4       0.52      0.09      0.16       128
           5       0.75      0.98      0.85       705

    accuracy                           0.75      1112
   macro avg       0.54      0.37      0.38      1112
weighted 

### 2. MultinomialNB

In [57]:
%%time
search_mnb_2 = GridSearchCV(pipe_2, parametros_mnb, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 510 ms, sys: 63.6 ms, total: 573 ms
Wall time: 2.89 s


In [58]:
search_mnb_2.best_params_

{'modelo_2__alpha': 0.5, 'modelo_2__fit_prior': True}

In [59]:
#predicción
y_pred_mnb_2 = search_mnb_2.best_estimator_.predict(X_test_2)
y_pred_train_mnb_2 = search_mnb_2.best_estimator_.predict(X_train_2)

In [60]:
report(y_train_2,y_pred_train_mnb_2,y_test_2,y_pred_mnb_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       339
           2       1.00      0.96      0.98       100
           3       1.00      0.95      0.97       121
           4       1.00      0.91      0.95       224
           5       0.97      1.00      0.99      1280

    accuracy                           0.98      2064
   macro avg       0.99      0.96      0.98      2064
weighted avg       0.98      0.98      0.98      2064


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.81      0.70      0.75       168
           2       0.00      0.00      0.00        43
           3       1.00      0.01      0.03        68
           4       0.58      0.05      0.10       128
           5       0.74      0.99      0.84       705

    accuracy                           0.74      1112
   macro avg       0.63      0.35      0.35      1112
weighted 

### 3. RandomForest

In [61]:
%%time
search_rf_2 = GridSearchCV(pipe_3, parametros_rf, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 4.73 s, sys: 159 ms, total: 4.89 s
Wall time: 2min 12s


In [62]:
search_rf_2.best_params_

{'modelo_3__ccp_alpha': 0.001,
 'modelo_3__max_depth': 10,
 'modelo_3__n_estimators': 250}

In [63]:
#predicción
y_pred_rf_2 = search_rf_2.best_estimator_.predict(X_test_2)
y_pred_train_rf_2 = search_rf_2.best_estimator_.predict(X_train_2)

In [64]:
report(y_train_2,y_pred_train_rf_2,y_test_2,y_pred_rf_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.01      0.01       339
           2       0.00      0.00      0.00       100
           3       0.00      0.00      0.00       121
           4       0.00      0.00      0.00       224
           5       0.62      1.00      0.77      1280

    accuracy                           0.62      2064
   macro avg       0.32      0.20      0.16      2064
weighted avg       0.55      0.62      0.48      2064


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       168
           2       0.00      0.00      0.00        43
           3       0.00      0.00      0.00        68
           4       0.00      0.00      0.00       128
           5       0.63      1.00      0.78       705

    accuracy                           0.63      1112
   macro avg       0.13      0.20      0.16      1112
weighted 

### 4. DecisionTreeClassifier

In [65]:
%%time
search_dtc_2 = GridSearchCV(pipe_4, parametros_dtc, cv =5,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 6.23 s, sys: 208 ms, total: 6.44 s
Wall time: 1min 6s


In [66]:
search_dtc_2.best_params_

{'modelo_4__max_depth': 30,
 'modelo_4__max_features': None,
 'modelo_4__min_samples_leaf': 1}

In [67]:
#predicción
y_pred_dtc_2 = search_dtc_2.best_estimator_.predict(X_test_2)
y_pred_train_dtc_2 = search_dtc_2.best_estimator_.predict(X_train_2)

In [68]:
report(y_train_2,y_pred_train_dtc_2,y_test_2,y_pred_dtc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.75      0.86       339
           2       1.00      0.56      0.72       100
           3       1.00      0.41      0.58       121
           4       1.00      0.55      0.71       224
           5       0.81      1.00      0.90      1280

    accuracy                           0.85      2064
   macro avg       0.96      0.65      0.75      2064
weighted avg       0.88      0.85      0.84      2064


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.77      0.58      0.66       168
           2       0.13      0.05      0.07        43
           3       0.08      0.01      0.02        68
           4       0.24      0.07      0.11       128
           5       0.73      0.95      0.83       705

    accuracy                           0.70      1112
   macro avg       0.39      0.33      0.34      1112
weighted 

### 5. AdaBoostClassifier

In [69]:
%%time
search_abc_2 = GridSearchCV(pipe_5, parametros_abc, cv =3,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 503 ms, sys: 55.9 ms, total: 559 ms
Wall time: 16.5 s


In [70]:
search_abc_2.best_params_

{'modelo_5__base_estimator': DecisionTreeClassifier(max_depth=2),
 'modelo_5__n_estimators': 1}

In [71]:
#predicción
y_pred_abc_2 = search_abc_2.best_estimator_.predict(X_test_2)
y_pred_train_abc_2 = search_abc_2.best_estimator_.predict(X_train_2)

In [72]:
report(y_train_2,y_pred_train_abc_2,y_test_2,y_pred_abc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.85      0.30      0.45       339
           2       0.00      0.00      0.00       100
           3       1.00      0.02      0.03       121
           4       0.00      0.00      0.00       224
           5       0.66      1.00      0.79      1280

    accuracy                           0.67      2064
   macro avg       0.50      0.26      0.25      2064
weighted avg       0.61      0.67      0.57      2064


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.89      0.32      0.47       168
           2       0.00      0.00      0.00        43
           3       0.00      0.00      0.00        68
           4       0.00      0.00      0.00       128
           5       0.67      1.00      0.80       705

    accuracy                           0.68      1112
   macro avg       0.31      0.26      0.25      1112
weighted 

### 6. GradientBoostingClassifier

In [73]:
%%time
search_gbc_2 = GridSearchCV(pipe_6, parametros_gbc, cv =3,n_jobs=-1).fit(X_train_2, y_train_2)

CPU times: user 12.4 s, sys: 133 ms, total: 12.5 s
Wall time: 26min 43s


In [74]:
search_gbc_2.best_params_

{'modelo_6__learning_rate': 0.1,
 'modelo_6__n_estimators': 100,
 'modelo_6__subsample': 0.9}

In [75]:
#predicción
y_pred_gbc_2 = search_gbc_2.best_estimator_.predict(X_test_2)
y_pred_train_gbc_2 = search_gbc_2.best_estimator_.predict(X_train_2)

In [76]:
report(y_train_2,y_pred_train_gbc_2,y_test_2,y_pred_gbc_2)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.95      0.74      0.83       339
           2       0.98      0.55      0.71       100
           3       0.98      0.37      0.54       121
           4       0.95      0.31      0.47       224
           5       0.79      1.00      0.88      1280

    accuracy                           0.82      2064
   macro avg       0.93      0.59      0.69      2064
weighted avg       0.85      0.82      0.80      2064


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.82      0.61      0.70       168
           2       0.25      0.02      0.04        43
           3       0.50      0.03      0.06        68
           4       0.20      0.03      0.05       128
           5       0.72      0.98      0.83       705

    accuracy                           0.72      1112
   macro avg       0.50      0.34      0.34      1112
weighted 

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Entrenamiento 60/40</div></H1>

In [77]:
# Proporción 60/40
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_train, y_train,  test_size = 0.4, random_state = 123)

### 1. LinearSVC

In [78]:
%%time
search_svc_3 = GridSearchCV(pipe_1, parametros_svc, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 1.24 s, sys: 70.2 ms, total: 1.31 s
Wall time: 12.1 s


In [79]:
search_svc_3.best_params_

{'modelo_1__C': 0.1, 'modelo_1__max_iter': 1000}

In [80]:
#predicción
y_pred_svc_3 = search_svc_3.best_estimator_.predict(X_test_3)
y_pred_train_svc_3 = search_svc_3.best_estimator_.predict(X_train_3)

In [81]:
report(y_train_3,y_pred_train_svc_3,y_test_3,y_pred_svc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       313
           2       1.00      1.00      1.00        92
           3       1.00      0.98      0.99       113
           4       1.00      0.94      0.97       204
           5       0.99      1.00      0.99      1183

    accuracy                           0.99      1905
   macro avg       1.00      0.98      0.99      1905
weighted avg       0.99      0.99      0.99      1905


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.76      0.70      0.73       194
           2       0.50      0.02      0.04        51
           3       0.73      0.11      0.18        76
           4       0.50      0.09      0.15       148
           5       0.75      0.98      0.85       802

    accuracy                           0.74      1271
   macro avg       0.65      0.38      0.39      1271
weighted 

### 2. MultinomialNB

In [82]:
%%time
search_mnb_3 = GridSearchCV(pipe_2, parametros_mnb, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 540 ms, sys: 120 ms, total: 660 ms
Wall time: 2.81 s


In [83]:
search_mnb_3.best_params_

{'modelo_2__alpha': 0.5, 'modelo_2__fit_prior': True}

In [84]:
#predicción
y_pred_mnb_3 = search_mnb_3.best_estimator_.predict(X_test_3)
y_pred_train_mnb_3 = search_mnb_3.best_estimator_.predict(X_train_3)

In [85]:
report(y_train_3,y_pred_train_mnb_3,y_test_3,y_pred_mnb_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.99      0.99      0.99       313
           2       1.00      0.96      0.98        92
           3       1.00      0.95      0.97       113
           4       0.99      0.90      0.95       204
           5       0.97      1.00      0.99      1183

    accuracy                           0.98      1905
   macro avg       0.99      0.96      0.97      1905
weighted avg       0.98      0.98      0.98      1905


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.79      0.70      0.74       194
           2       1.00      0.02      0.04        51
           3       1.00      0.01      0.03        76
           4       0.50      0.03      0.06       148
           5       0.73      1.00      0.85       802

    accuracy                           0.74      1271
   macro avg       0.81      0.35      0.34      1271
weighted 

### 3. RandomForest

In [86]:
%%time
search_rf_3 = GridSearchCV(pipe_3, parametros_rf, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 4.77 s, sys: 155 ms, total: 4.92 s
Wall time: 2min 11s


In [87]:
search_rf_3.best_params_

{'modelo_3__ccp_alpha': 0.001,
 'modelo_3__max_depth': 10,
 'modelo_3__n_estimators': 250}

In [88]:
#predicción
y_pred_rf_3 = search_rf_3.best_estimator_.predict(X_test_3)
y_pred_train_rf_3 = search_rf_3.best_estimator_.predict(X_train_3)

In [89]:
report(y_train_3,y_pred_train_rf_3,y_test_3,y_pred_rf_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       1.00      0.01      0.01       313
           2       0.00      0.00      0.00        92
           3       0.00      0.00      0.00       113
           4       0.00      0.00      0.00       204
           5       0.62      1.00      0.77      1183

    accuracy                           0.62      1905
   macro avg       0.32      0.20      0.16      1905
weighted avg       0.55      0.62      0.48      1905


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       194
           2       0.00      0.00      0.00        51
           3       0.00      0.00      0.00        76
           4       0.00      0.00      0.00       148
           5       0.63      1.00      0.77       802

    accuracy                           0.63      1271
   macro avg       0.13      0.20      0.15      1271
weighted 

### 4. DecisionTreeClassifier

In [90]:
%%time
search_dtc_3 = GridSearchCV(pipe_4, parametros_dtc, cv =5,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 6.22 s, sys: 195 ms, total: 6.42 s
Wall time: 1min 2s


In [91]:
search_dtc_3.best_params_

{'modelo_4__max_depth': 50,
 'modelo_4__max_features': None,
 'modelo_4__min_samples_leaf': 1}

In [92]:
#predicción
y_pred_dtc_3 = search_dtc_3.best_estimator_.predict(X_test_3)
y_pred_train_dtc_3 = search_dtc_3.best_estimator_.predict(X_train_3)

In [93]:
report(y_train_3,y_pred_train_dtc_3,y_test_3,y_pred_dtc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.99      0.81      0.89       313
           2       1.00      0.76      0.86        92
           3       1.00      0.70      0.82       113
           4       1.00      0.76      0.86       204
           5       0.88      1.00      0.93      1183

    accuracy                           0.91      1905
   macro avg       0.97      0.81      0.88      1905
weighted avg       0.92      0.91      0.91      1905


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.72      0.58      0.64       194
           2       0.19      0.10      0.13        51
           3       0.18      0.11      0.13        76
           4       0.22      0.07      0.10       148
           5       0.74      0.92      0.82       802

    accuracy                           0.69      1271
   macro avg       0.41      0.35      0.37      1271
weighted 

### 5. AdaBoostClassifier

In [94]:
%%time
search_abc_3 = GridSearchCV(pipe_5, parametros_abc, cv =3,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 1.33 s, sys: 53 ms, total: 1.39 s
Wall time: 16.2 s


In [95]:
search_abc_3.best_params_

{'modelo_5__base_estimator': DecisionTreeClassifier(max_depth=2),
 'modelo_5__n_estimators': 50}

In [96]:
#predicción
y_pred_abc_3 = search_abc_3.best_estimator_.predict(X_test_3)
y_pred_train_abc_3 = search_abc_3.best_estimator_.predict(X_train_3)

In [97]:
report(y_train_3,y_pred_train_abc_3,y_test_3,y_pred_abc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.78      0.50      0.61       313
           2       0.68      0.57      0.62        92
           3       0.55      0.11      0.18       113
           4       0.39      0.04      0.08       204
           5       0.73      0.98      0.84      1183

    accuracy                           0.73      1905
   macro avg       0.63      0.44      0.46      1905
weighted avg       0.69      0.73      0.67      1905


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.77      0.47      0.59       194
           2       0.00      0.00      0.00        51
           3       0.08      0.01      0.02        76
           4       0.00      0.00      0.00       148
           5       0.71      0.98      0.82       802

    accuracy                           0.69      1271
   macro avg       0.31      0.29      0.29      1271
weighted 

### 6. GradientBoostingClassifier

In [98]:
%%time
search_gbc_3 = GridSearchCV(pipe_6, parametros_gbc, cv =3,n_jobs=-1).fit(X_train_3, y_train_3)

CPU times: user 1min 40s, sys: 355 ms, total: 1min 41s
Wall time: 20min 14s


In [99]:
search_gbc_3.best_params_

{'modelo_6__learning_rate': 0.01,
 'modelo_6__n_estimators': 1000,
 'modelo_6__subsample': 0.9}

In [100]:
#predicción
y_pred_gbc_3 = search_gbc_3.best_estimator_.predict(X_test_3)
y_pred_train_gbc_3 = search_gbc_3.best_estimator_.predict(X_train_3)

In [101]:
report(y_train_3,y_pred_train_gbc_3,y_test_3,y_pred_gbc_3)

Classification report - TRAIN SET
              precision    recall  f1-score   support

           1       0.94      0.72      0.82       313
           2       0.97      0.38      0.55        92
           3       1.00      0.28      0.44       113
           4       0.97      0.32      0.48       204
           5       0.77      1.00      0.87      1183

    accuracy                           0.81      1905
   macro avg       0.93      0.54      0.63      1905
weighted avg       0.84      0.81      0.78      1905


Classification report - TEST SET
              precision    recall  f1-score   support

           1       0.83      0.61      0.70       194
           2       0.20      0.02      0.04        51
           3       0.60      0.04      0.07        76
           4       0.29      0.03      0.05       148
           5       0.72      0.99      0.83       802

    accuracy                           0.72      1271
   macro avg       0.53      0.34      0.34      1271
weighted 

<H1 align="center"> <div class="alert alert-block alert-info"><b></b>Apartado para probar funciones</div></H1>

### CURVA ROC

**El gráfico para la curva roc no soporta problemas multiclase**

In [46]:
def plot_roc_curve(model, xtest_std, ytest):
    """
    plot_roc_curve - Plots ROC curve of given model.
    
    @parameters:
        - model: a `statsmodels.formula.api` class generated method, which must be already fitted.
        - xtest_std: numpy.ndarray. A standarized "x" sample, different from the one used in the model. Object.
        - ytest: pandas.core.series.Series. The sample to compare with the generated predicted values. Object.
        
    @returns:
        - A `matplotlib` object.
    """
    yhat = model.predict_proba(xtest_std)[:, 1]
    false_positive, true_positive, threshold = roc_curve(ytest, yhat)
    # Plot ROC curve
    plt.title('Curva ROC')
    plt.plot(false_positive, true_positive, lw=1)
    plt.plot([0, 1], ls="--", lw=1)
    plt.plot([0, 0], [1, 0] , c='limegreen', lw=3), plt.plot([1, 1] , c='limegreen', lw=3)
    plt.ylabel('Verdaderos Positivos')
    plt.xlabel('Falsos Positivos');