# **Modelamiento**

## Librerías

In [None]:
!pip install scikit-learn==0.24.2

Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.9 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-3.0.0


In [None]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 23.9 MB/s eta 0:00:01[K     |██▉                             | 20 kB 26.7 MB/s eta 0:00:01[K     |████▏                           | 30 kB 30.2 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 33.1 MB/s eta 0:00:01[K     |███████                         | 51 kB 34.3 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 35.0 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 29.5 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 29.0 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 30.3 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 32.2 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 32.2 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 32.2 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 32.2 MB/s e

In [None]:
#Generales
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Proceso de Modelación
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,r2_score,median_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, ShuffleSplit

from sklearn.feature_extraction.text import CountVectorizer

## Lectura Información

In [None]:
#Descargar datasets desde github
!git clone https://github.com/andres-soto-h/monografia-udea-eacd.git

Cloning into 'monografia-udea-eacd'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 11 (delta 3), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [None]:
#Lectura del dataset transformado
df_propiedades=pd.read_csv('/content/monografia-udea-eacd/df_prop_clean_12082021.csv', delimiter=';', encoding='latin1')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd '/content/drive/My Drive/Colab Notebooks/Seminario/DATASETS'
# # !ls
# df_propiedades=pd.read_csv('df_prop_clean_12082021.csv', delimiter=';', encoding='latin1') 
# print(df_propiedades.shape)

In [None]:
df_propiedades.rename(columns={"municipio":"ubicacion"}, inplace=True)

**NOTA:** se decide eliminar la información del municipio de **La Unión**.

In [None]:
mask_union = df_propiedades['ubicacion']!='la uniÃ³n'
df_propiedades = df_propiedades[mask_union]

In [None]:
df_propiedades['ubicacion'].value_counts()

rionegro                  1241
el retiro                  439
la ceja                    342
guarne                     196
marinilla                  117
el carmen de viboral        62
san antonio de pereira      38
llanogrande                 31
santuario                   15
Name: ubicacion, dtype: int64

**Nota2:** Se convierte la variable Estrato en ordinal

In [None]:
df_propiedades['estrato'] = df_propiedades['estrato'].replace('Campestre', '0')
df_propiedades['estrato'] = df_propiedades['estrato'].astype(int)
df_propiedades['estrato'].value_counts()

4    947
5    636
3    584
0    162
6     89
2     58
1      5
Name: estrato, dtype: int64

## **Ajuste del Modelo**

**Funciones**

In [None]:
#Métricas datos transformados
def median_absolute_error2(y_true, y_pred):
  return median_absolute_error(np.expm1(y_true), np.expm1(y_pred))

def mean_squared_error2(y_true, y_pred):
  return mean_squared_error(np.expm1(y_true), np.expm1(y_pred), squared=False)


def mean_absolute_percentage_error2(y_true, y_pred):
  return mean_absolute_percentage_error(np.expm1(y_true), np.expm1(y_pred))
  
def r2_score2(y_true, y_pred):
  return r2_score(np.expm1(y_true), np.expm1(y_pred))

In [None]:
def div_train_test(X, y, var_stratify = ''):
  
  if var_stratify != '': 
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9, stratify = X[var_stratify],  random_state = 42)

    var_stratify_train = x_train[var_stratify]
    #ubicacion_test = X_test['ubicacion']

    x_train.drop(var_stratify,axis=1, inplace=True)
    x_test.drop(var_stratify,axis=1, inplace=True)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)

    return x_train, x_test, y_train, y_test, y_train_t, y_test_t, var_stratify_train

  else:
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9,  random_state = 42)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)
  
  return x_train, x_test, y_train, y_test, y_train_t, y_test_t

def busqueda_hiperparametros(x_train, y_train, var_stratify_train, estimator,param_grid = [] , scoring_med = 'neg_median_absolute_error', transf=False):

  if len(var_stratify_train) != 0:
    
    print('Seccion Stratify')
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42)
    stratified = sss.split(x_train, var_stratify_train)
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, cv = stratified, scoring = scoring_grid , refit = 'metrica',  return_train_score=True, n_jobs=-1, verbose=8)
    return grid.fit(x_train, y_train)
  
  else:
    print('OutStratify')
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring = scoring_grid, refit = 'metrica', return_train_score=True, n_jobs=-1, verbose=8 )
    return grid.fit(X_train, y_train)

def metricas(model,y_train, p_train, y_test, p_test): 

  resultados = {
      'parametros' : [model.get_params()['steps'][1][1]],
      'MeAE_train' : [median_absolute_error(y_train,p_train)],
      'MeAE_test'  : [median_absolute_error(y_test, p_test)],
      'MAPE_train' : [mean_absolute_percentage_error(y_train,p_train)],
      'MAPE_test'  : [mean_absolute_percentage_error(y_test, p_test)],
      'r2_train'   : [r2_score(y_train,p_train)],
      'r2_test'    : [r2_score(y_test, p_test)],
      'rmse_train' : [mean_squared_error(y_train,p_train,squared = False)],
      'rmse_test'  : [mean_squared_error(y_test,p_test,squared = False)]
  }
  return pd.DataFrame(resultados)

**División Covariables  y Variable Objetivo**

In [None]:
data_model=df_propiedades.copy()
varibles_considerar = ['tipo','precio','area_m2','banos','garajes','antiguedad','estrato','ubicacion','tipo_propiedad','balcon','zonas_verdes','en_conjunto_cerrado','zona_infantil','supermercados_ccomerciales','colegios_universidades','trans_publico_cercano']
data_model = data_model.loc[:,varibles_considerar]
data_model.shape

(2481, 16)

In [None]:
X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

**OneHotEncoder Variables Categóricas**

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(enc.fit_transform(X[['tipo','tipo_propiedad','ubicacion','antiguedad']]).toarray(), columns= enc.get_feature_names(['tipo','tipo_propiedad','ubicacion','antiguedad']))
data_aux['fila'] = range(0, X.shape[0])
data_aux.head()

X.drop(['tipo','tipo_propiedad','antiguedad'],axis=1,inplace=True)
X['fila'] = range(0, X.shape[0])

X = pd.merge(data_aux,X, on = 'fila', how='inner')
X.drop(['fila'], axis=1, inplace=True)
X.shape

(2481, 33)

_______________________________________________________________

Información de la columna **Descripción**

In [None]:
#Descargar stopwords español
!wget https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt

--2021-10-20 01:40:45--  https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4244 (4.1K) [text/plain]
Saving to: ‘spanish.txt’


2021-10-20 01:40:46 (53.0 MB/s) - ‘spanish.txt’ saved [4244/4244]



In [None]:
import re

data_text=pd.DataFrame(df_propiedades["descripcion"])
data_text

#Limpieza básica del texto
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.lower())
data_text['descripcion'] = data_text['descripcion'].apply((lambda x: re.sub(r'[0-9]+','',x)))

In [None]:
#Corregir error de encoding en textos
def replace_char(text):
  fix_text=text.replace('ã¡','a').replace('ã³','o').replace('ã±','ñ').replace('ã\x81','a').replace('ãº','u').replace('ã©','e').replace('ã\xad','i').replace('ã\x8d','i')
  return fix_text

data_text['descripcion'] = data_text['descripcion'].apply(replace_char)

Estematización o steaming ( realiza con algoritmos más sencillos utiliza el nltk ). Consultar como hacer el steaming con Español. La idea es llevar las palabras a su raiz. 

---



In [None]:
synopses = data_text['descripcion']

In [None]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('spanish')
synopsesStem = []
for sentence in synopses:
     synopsesStem.append(" ".join([stemmer.stem(word) for word in sentence.split()]))
#define vectorizer parameters
count_vectorizer = CountVectorizer(max_df=1.0, max_features=200,
                                 min_df=0.1)

count_matrix = count_vectorizer.fit_transform(synopsesStem) #fit the vectorizer to synopses
count_vectorizer.get_feature_names()

['abiert',
 'acces',
 'al',
 'alcob',
 'alcobas',
 'ampli',
 'apartament',
 'are',
 'balcon',
 'bañ',
 'baño',
 'baños',
 'buen',
 'cas',
 'centr',
 'cerc',
 'cerr',
 'closet',
 'cocin',
 'comedor',
 'comerciales',
 'con',
 'constru',
 'cuart',
 'cubiert',
 'cuent',
 'de',
 'del',
 'dos',
 'el',
 'en',
 'esta',
 'excelent',
 'finc',
 'garaj',
 'gas',
 'habit',
 'habitacion',
 'habitaciones',
 'hermos',
 'horas',
 'integral',
 'jueg',
 'la',
 'las',
 'los',
 'lot',
 'mas',
 'minut',
 'muy',
 'nivel',
 'niveles',
 'par',
 'parqu',
 'parqueader',
 'pati',
 'pis',
 'por',
 'porteri',
 'principal',
 'publico',
 'red',
 'rionegr',
 'rionegro',
 'rop',
 'ropas',
 'sal',
 'sala',
 'salon',
 'san',
 'se',
 'sector',
 'servici',
 'social',
 'sol',
 'tien',
 'transport',
 'ubic',
 'ubicacion',
 'un',
 'una',
 'unid',
 'util',
 'valor',
 'vend',
 'vent',
 'verdes',
 'vestier',
 'vist',
 'zon']

In [None]:
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("alcobas", "alcoba"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("closets", "closet"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("amplia", "ampliar"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("amplias", "ampliar"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("amplio", "ampliar"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("amplios", "ampliar"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("ampliars", "ampliar"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("zonas", "zona"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("apartamentos", "apto"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("apartamento", "apto"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("aptos", "apto"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("mt", "mts"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("metros cuadrados", "mts"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("m2", "mts"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("balcones", "balcon"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("baños", "baño"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("iglesias", "iglesia"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("espacio", "espacios"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("habitaciones", "habitacion"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("espacio", "espacios"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("niveles", "nivel"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("publico", "publicos"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("hermosa", "hermoso"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("hermosas", "hermoso"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("hermosos", "hermoso"))
data_text['descripcion'] = data_text['descripcion'].apply(lambda x: x.replace("ropas", "ropa"))

In [None]:
data_text['descripcion'][7]

'excelente oportunidad de inversion  casa ubicada en san antonio de pereira,  uno de los mejores sitios para vivir en el oriente ant.  tres pisos  sala-comedor,  cocina integral,  star para sala tv,  tres habitacion,  tres baño,  balcon,  garaje,  zona de ropa  cerca al parque principal de san antonio de pereira,  supermercados,  iglesia,  farmacia,  transporte,  clinica somer,  comfama parque recreativo.  aeropuerto y tunel de oriente a  minutos. .'

In [None]:
stopwords = pd.read_csv('spanish.txt', header = None, sep='\n')
stopwords_list = list(stopwords[0])

In [None]:
#define vectorizer parameters
count_vectorizer = CountVectorizer(max_features=500, stop_words=stopwords_list, ngram_range=(1,2))
count_matrix = count_vectorizer.fit_transform(data_text['descripcion']) 

print(count_matrix.shape)

(2481, 500)


In [None]:
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1]])

In [None]:
labels=count_vectorizer.get_feature_names()
df_vectorizer= pd.DataFrame(count_matrix.toarray(), columns=labels)
df_vectorizer.head()

Unnamed: 0,abierta,abierta integral,acabados,acceso,acceso vias,acueducto,ademas,administracion,administracion predial,adultos,aeropuerto,aeropuerto internacional,agua,aire,aire libre,alcoba,alcoba alcoba,alcoba baño,alcoba closet,alcoba principal,alcoba servicio,alcoba vestier,alta,alta valorizacion,alto,ambiente,americana,ampliar,ampliar espaciosss,ampliar zona,antioqueño,antioquia,antonio,antonio pereira,anual,aprox,apto,apto ubicado,apto unidad,apto venta,...,vende,vendo,venta,venta rionegro,venta sector,venta ubicada,venta ubicado,verde,verdes,verdes salon,vereda,vestier,vestier baño,vestier closet,via,via principal,vias,vias acceso,vias transporte,viboral,vicente,vigilancia,vigilancia horas,visitantes,vista,vista panoramica,viva,vivienda,vivir,whatsapp,zona,zona alta,zona bbq,zona comerciales,zona comunes,zona mascotas,zona residencial,zona ropa,zona verde,zona verdes
0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,1,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0


In [None]:
df_vectorizer.rename(columns={'ubicacion':'ubicacion_v'}, inplace=True)

In [None]:
X_v = pd.concat([X,df_vectorizer], axis=1)

In [None]:
# y = data_model['precio']
# X = X_v.copy()
# X.reset_index(inplace=True)

In [None]:
# X.shape, y.shape

((2481, 502), (2481,))

**División train y Test**

In [None]:
X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t, var_stratify_train = div_train_test(X_v, y, var_stratify = 'ubicacion')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### **Random Forest**

In [None]:
pipe = Pipeline(steps = [('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

####**Sin transformar la Y**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

modelo_rf = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe, param_grid = para_grid, transf=False)
modelo_rf

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7f438db0cc50>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error),
                      'meae': make_scorer(median_absolute_error),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score),
                      'rmse': make_scorer(mean_squared_error)},
             verbose=8)

In [None]:
metricas_rf = metricas(modelo_rf.best_estimator_,Y_train, modelo_rf.predict(X_train), Y_test, modelo_rf.predict(X_test))
metricas_rf

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",53988180.0,76014110.0,0.162373,0.224753,0.911851,0.77491,167536200.0,302456100.0


In [None]:
modelo_rf.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=2,
                                       min_samples_split=4, n_estimators=120,
                                       random_state=42))])

In [None]:
dd=pd.DataFrame(modelo_rf.cv_results_)
dd[dd['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
82,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 2...",-80620110.0,9639800.0,-53747440.0,1860704.0,80620110.0,9639800.0,108,53747440.0,1860704.0,0.732749,0.036608,10,0.916485,0.002845,9.093202e+16,1.381852e+16,103,2.636061e+16,1014820000000000.0,0.238376,0.015532,101,0.156307,0.002863


####**Usando Y transformada**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

modelo_rf_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe, param_grid = para_grid, transf = True)
modelo_rf_t

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7f438d8ffdd0>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error2),
                      'meae': make_scorer(median_absolute_error2),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score2),
                      'rmse': make_scorer(mean_squared_error2)},
             verbose=8)

In [None]:
metricas_rf_t = metricas(modelo_rf_t.best_estimator_,Y_train, np.expm1(modelo_rf_t.predict(X_train)), Y_test, np.expm1(modelo_rf_t.predict(X_test)))
metricas_rf_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",48332780.0,79998590.0,0.143553,0.215558,0.873035,0.749467,201067000.0,319092200.0


In [None]:
modelo_rf_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=2,
                                       min_samples_split=6, n_estimators=150,
                                       random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_rf_t.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
89,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 2...",-0.162685,0.014114,-0.103013,0.002001,77487710.0,10307090.0,91,48705140.0,1122655.0,0.704569,0.035072,13,0.879852,0.003395,316661400.0,24315880.0,96,194706500.0,2871208.0,0.219502,0.011888,101,0.139657,0.001694


### **Gradient Boosting Regression**

####**Sin tranformar la Y**

In [None]:
pipe_gbt = Pipeline(steps = [('scaler', StandardScaler()), ('gbt', GradientBoostingRegressor(random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[70,90,100,120,180], 'gbt__max_depth':[3,5,10,12,15,20,25], 'gbt__min_samples_split':[5,10,15,17,25,30,40], 'gbt__min_samples_leaf':[1,3,5]}
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}

modelo_gbt = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_gbt, param_grid = para_grid )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
metricas_gbt = metricas(modelo_gbt.best_estimator_,Y_train, modelo_gbt.predict(X_train), Y_test, modelo_gbt.predict(X_test))
metricas_gbt

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,14602580.0,69267770.0,0.043116,0.205104,0.997325,0.784279,29184900.0,296094300.0


In [None]:
modelo_gbt.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=9, min_samples_leaf=2,
                                           min_samples_split=4,
                                           random_state=42))])

####**Usando Y transformada**

**Búsqueda de hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[70,90,100,120,180], 'gbt__max_depth':[3,5,10,12,15,20,25], 'gbt__min_samples_split':[5,10,15,17,25,30,40], 'gbt__min_samples_leaf':[1,3,5]}
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}
modelo_gbt_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_gbt, param_grid = para_grid, transf=True )

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
metricas_gbt_t = metricas(modelo_gbt_t.best_estimator_,Y_train, np.expm1(modelo_gbt_t.predict(X_train)), Y_test, np.expm1(modelo_gbt_t.predict(X_test)))
metricas_gbt_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,12832400.0,72486440.0,0.037125,0.190701,0.993512,0.784236,45452100.0,296124100.0


In [None]:
modelo_gbt_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=8, min_samples_leaf=3,
                                           min_samples_split=4,
                                           n_estimators=150,
                                           random_state=42))])

### **XG Boost**

#### Sin transformar la Y

In [None]:
import xgboost as xgb
# pipe_xgb = Pipeline(steps = [
#     ('scaler', StandardScaler()), 
#     ('xgb',  xgb.XGBRegressor(
#         objective='reg:squarederror', reg_alpha = 3, reg_lambda = 3, max_depth=5, learning_rate=0.05, n_jobs=-1, n_estimators = 175, 
#     ))
# ])

pipe_xgb = Pipeline(steps = [
    ('scaler', StandardScaler()), 
    ('xgb',  xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, 
    ))
])

In [1]:
import xgboost as xgb
xgb.XGBRegressor()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

**Búsqueda de Hiperparámetros**

In [None]:
para_grid = { 'xgb__max_depth':[6,7,8,9], 'xgb__learning_rate':[0.05,0.1], 'xgb__reg_alpha':[0.01,0.1,0.5]}
#'xgb__n_estimators':[100,120,150],
modelo_xgb = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_xgb, param_grid = para_grid )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
metricas_xgb = metricas(modelo_xgb.best_estimator_,Y_train, modelo_xgb.predict(X_train), Y_test, modelo_xgb.predict(X_test))
metricas_xgb

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(max_depth=9, n_jobs=-1, objective...",17400506.0,77883712.0,0.051813,0.212672,0.99546,0.786205,38020070.0,294769200.0


In [None]:
modelo_xgb.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(max_depth=9, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

### **Red Neuronal**

####**Sin tranformar la Y**

In [None]:
# from sklearn.neural_network import MLPRegressor
# pipe_rnn = Pipeline(steps = [('scaler', StandardScaler()), ('rnn', MLPRegressor(activation='relu',max_iter=100,random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': [0.001,0.004,0.01,0.1],
#     'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
#     'rnn__max_iter':[200,500]}

# modelo_rnn = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_rnn, param_grid = para_grid,transf=False )

In [None]:
# metricas_rnn = metricas(modelo_rnn.best_estimator_,Y_train, modelo_rnn.predict(X_train), Y_test, modelo_rnn.predict(X_test))
# metricas_rnn

In [None]:
# modelo_rnn.best_estimator_

####**Usando Y transformada**

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': [0.001,0.004,0.01,0.1],
#     'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
#     'rnn__max_iter':[200,500]}
    
# modelo_rnn_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_rnn, param_grid = para_grid, transf=True )

In [None]:
# metricas_rnn_t = metricas(modelo_rnn_t.best_estimator_,Y_train, np.expm1(modelo_rnn_t.predict(X_train)), Y_test, np.expm1(modelo_rnn_t.predict(X_test)))
# metricas_rnn_t

In [None]:
# modelo_rnn_t.best_estimator_

Referencia: https://www.cienciadedatos.net/documentos/py35-redes-neuronales-python.html

## Resumen

In [None]:
from google.colab import drive

drive.mount('/content/drive/')
%cd '/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados


In [None]:
import os
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf = cross_validation_rf[cross_validation_rf['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft = cross_validation_rft[cross_validation_rft['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt = cross_validation_gbt[cross_validation_gbt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt = cross_validation_gbtt[cross_validation_gbtt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_xgb=pd.DataFrame(modelo_xgb.cv_results_)
cross_validation_xgb = cross_validation_xgb[cross_validation_xgb['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_xgb['escenario']='modelo_xgb'

# cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
# cross_validation_rnn = cross_validation_rnn[cross_validation_rnn['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
# cross_validation_rnn['escenario']='modelo_rnn'

# cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
# cross_validation_rnnt = cross_validation_rnnt[cross_validation_rnnt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
# cross_validation_rnnt['escenario']='modelo_rnnt'

# resultado_cross = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt, cross_validation_xgb,cross_validation_rnn,cross_validation_rnnt],
#                             ignore_index = True)
resultado_cross = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt, cross_validation_xgb],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross['params'])

resultado_cross.drop(['params'],axis=1, inplace=True)
resultado_cross = resultado_cross.drop_duplicates()
resultado_cross.reset_index(inplace=True, drop=False)
resultado_cross

Unnamed: 0,index,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,-80620110.0,9639800.0,-53747440.0,1860704.0,53747440.0,80620110.0,0.156307,0.238376,0.916485,0.732749,2.636061e+16,9.093202e+16,modelo_rf
1,1,-0.1626855,0.01411398,-0.1030125,0.002000749,48705140.0,77487710.0,0.139657,0.219502,0.879852,0.704569,194706500.0,316661400.0,modelo_rft
2,2,-64980010.0,9909172.0,-12507820.0,466540.7,12507820.0,64980010.0,0.03814,0.221489,0.99788,0.721093,669363900000000.0,9.536301e+16,modelo_gbt
3,3,-0.1344611,0.01503517,-0.02087851,0.0005742371,10754550.0,61437900.0,0.032584,0.196438,0.995192,0.733631,38942340.0,300540800.0,modelo_gbtt
4,6,-65963370.0,7677518.0,-15209200.0,494883.4,15209200.0,65963370.0,0.045917,0.219669,0.996485,0.742726,1110039000000000.0,8.761787e+16,modelo_xgb
5,7,-65963370.0,7677518.0,-15209200.0,494888.6,15209200.0,65963370.0,0.045917,0.219669,0.996485,0.742726,1110039000000000.0,8.761787e+16,modelo_xgb
6,8,-65963370.0,7677518.0,-15209200.0,494883.4,15209200.0,65963370.0,0.045917,0.219669,0.996485,0.742726,1110039000000000.0,8.761787e+16,modelo_xgb


In [None]:
# resultado_metricas = pd.concat([metricas_rf, metricas_rf_t, metricas_gbt, metricas_gbt_t, metricas_xgb,metricas_rnn, metricas_rnn_t],
#                             ignore_index = True)
resultado_metricas = pd.concat([metricas_rf, metricas_rf_t, metricas_gbt, metricas_gbt_t, metricas_xgb],ignore_index = True)
resultado_metricas.drop_duplicates(inplace=True)
resultado_metricas.reset_index(inplace=True, drop=False)
resultado_metricas

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,0,"(DecisionTreeRegressor(max_depth=9, max_featur...",53988180.0,76014110.0,0.162373,0.224753,0.911851,0.77491,167536200.0,302456100.0
1,1,"(DecisionTreeRegressor(max_depth=9, max_featur...",48332780.0,79998590.0,0.143553,0.215558,0.873035,0.749467,201067000.0,319092200.0
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,14602580.0,69267770.0,0.043116,0.205104,0.997325,0.784279,29184900.0,296094300.0
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,12832400.0,72486440.0,0.037125,0.190701,0.993512,0.784236,45452100.0,296124100.0
4,4,"XGBRegressor(max_depth=9, n_jobs=-1, objective...",17400510.0,77883710.0,0.051813,0.212672,0.99546,0.786205,38020070.0,294769200.0


In [None]:
resultados =  pd.concat([resultado_metricas, resultado_cross], axis=1)
resultados

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test,index.1,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0.0,"(DecisionTreeRegressor(max_depth=9, max_featur...",53988180.0,76014110.0,0.162373,0.224753,0.911851,0.77491,167536200.0,302456100.0,0,-80620110.0,9639800.0,-53747440.0,1860704.0,53747440.0,80620110.0,0.156307,0.238376,0.916485,0.732749,2.636061e+16,9.093202e+16,modelo_rf
1,1.0,"(DecisionTreeRegressor(max_depth=9, max_featur...",48332780.0,79998590.0,0.143553,0.215558,0.873035,0.749467,201067000.0,319092200.0,1,-0.1626855,0.01411398,-0.1030125,0.002000749,48705140.0,77487710.0,0.139657,0.219502,0.879852,0.704569,194706500.0,316661400.0,modelo_rft
2,2.0,([DecisionTreeRegressor(criterion='friedman_ms...,14602580.0,69267770.0,0.043116,0.205104,0.997325,0.784279,29184900.0,296094300.0,2,-64980010.0,9909172.0,-12507820.0,466540.7,12507820.0,64980010.0,0.03814,0.221489,0.99788,0.721093,669363900000000.0,9.536301e+16,modelo_gbt
3,3.0,([DecisionTreeRegressor(criterion='friedman_ms...,12832400.0,72486440.0,0.037125,0.190701,0.993512,0.784236,45452100.0,296124100.0,3,-0.1344611,0.01503517,-0.02087851,0.0005742371,10754550.0,61437900.0,0.032584,0.196438,0.995192,0.733631,38942340.0,300540800.0,modelo_gbtt
4,4.0,"XGBRegressor(max_depth=9, n_jobs=-1, objective...",17400510.0,77883710.0,0.051813,0.212672,0.99546,0.786205,38020070.0,294769200.0,6,-65963370.0,7677518.0,-15209200.0,494883.4,15209200.0,65963370.0,0.045917,0.219669,0.996485,0.742726,1110039000000000.0,8.761787e+16,modelo_xgb
5,,,,,,,,,,,7,-65963370.0,7677518.0,-15209200.0,494888.6,15209200.0,65963370.0,0.045917,0.219669,0.996485,0.742726,1110039000000000.0,8.761787e+16,modelo_xgb
6,,,,,,,,,,,8,-65963370.0,7677518.0,-15209200.0,494883.4,15209200.0,65963370.0,0.045917,0.219669,0.996485,0.742726,1110039000000000.0,8.761787e+16,modelo_xgb


In [None]:
resultados.to_csv('resultados_train_test_general_basica_texto.csv',sep=";",decimal=",")

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_xgb=pd.DataFrame(modelo_xgb.cv_results_)
cross_validation_xgb['escenario']='modelo_xgb'

# cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
# cross_validation_rnn['escenario']='modelo_rnn'

# cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
# cross_validation_rnnt['escenario']='modelo_rnnt'

# resultado_cross_total = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_xgb,cross_validation_rnn,cross_validation_rnnt],
#                             ignore_index = True)
resultado_cross_total = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_xgb],ignore_index = True)

columna = pd.DataFrame(resultado_cross_total['params'])

resultado_cross_total.drop(['params'],axis=1, inplace=True)
resultado_cross_total = resultado_cross_total.drop_duplicates()
resultado_cross_total.reset_index(inplace=True, drop=False)

In [None]:
resultado_cross_total.to_csv('resultados_crossvalidacion_general_basica_texto.csv',sep=";",decimal=",")

--------------------------------------------------------------------------