<a href="https://colab.research.google.com/github/avanegasp/talleres-DS-Uc/blob/main/Copia_de_Temas_avanzados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://github.com/HarryVargas96/UdeCataluna/blob/main/logo_uc_grande.png?raw=true" alt="Encabezado" width="100%">

## 1. Dependencias

In [None]:
import joblib

# Librerías usuales
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#
import sklearn
from sklearn.model_selection import (train_test_split, GridSearchCV, RandomizedSearchCV,
                                     RepeatedStratifiedKFold, cross_validate)

# Assemble pipeline(s)
from sklearn import set_config
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder

# Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
set_config(display="diagram")

In [None]:
# Record the interpreter and core library versions used for this run
!python --version
print('NumPy', np.__version__)
print('Pandas', pd.__version__)
print('Sci-kit learn',sklearn.__version__)

Python 3.10.12
NumPy 1.23.5
Pandas 1.5.3
Sci-kit learn 1.2.2


## 2. Lectura de datos

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells save and
# reload fitted searches under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the raw listings CSV directly from the course repository
df = pd.read_csv(
    'https://github.com/HarryVargas96/UdeCataluna/blob/main/data/airbnb.csv?raw=true',
    sep=',',
)

## 3. Limpieza de datos

In [None]:
# Drop erroneous rows: keep only listings with a strictly positive price
df = df[df['price'] > 0].copy()
# Feature selection: keep the target plus the predictors used for modelling.
# The explicit .copy() makes the subset an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
df = df[
    ['price','neighbourhood','latitude','longitude', 'property_type'
    ,'room_type','bathrooms','bedrooms','beds','host_is_superhost','parking' ]
    ].copy()
# Recode the -1.0 parking value as 0.0. The result is assigned back instead of
# calling replace(..., inplace=True) on df['parking']: the in-place form
# mutates a temporary Series (chained assignment), which pandas warns about
# and which leaves df untouched when Copy-on-Write is enabled.
df['parking'] = df['parking'].replace({-1.0 : 0.0 })

In [None]:
# Cast host_is_superhost and parking to object dtype so they are handled as
# categorical rather than numeric columns downstream
df = df.astype({'host_is_superhost': 'object', 'parking': 'object'})

In [None]:
# Summarise column dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30173 entries, 0 to 30178
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   price              30173 non-null  int64   
 1   neighbourhood      30164 non-null  object  
 2   latitude           30173 non-null  float64 
 3   longitude          30173 non-null  float64 
 4   property_type      30173 non-null  object  
 5   room_type          30173 non-null  object  
 6   bathrooms          30173 non-null  float64 
 7   bedrooms           30173 non-null  int64   
 8   beds               30173 non-null  int64   
 9   host_is_superhost  30164 non-null  category
 10  parking            30173 non-null  category
dtypes: category(2), float64(3), int64(3), object(3)
memory usage: 2.4+ MB


## 4. Partición prueba y entrenamiento

In [None]:
# Separate the target (price) from the predictors
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## 5. Pipelines preprocesamiento

In [None]:
# Names of the numeric feature columns
numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
# Names of the categorical feature columns (object dtype)
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Inspect which columns were detected as numeric
numeric_cols

['latitude', 'longitude', 'bathrooms', 'bedrooms', 'beds']

In [None]:
# Inspect which columns were detected as categorical
cat_cols

['neighbourhood', 'property_type', 'room_type', 'host_is_superhost', 'parking']

In [None]:
# Numeric preprocessing: impute missing values with the median, then standardise
pipeline_numerico = Pipeline(steps=[
    ('imputador', SimpleImputer(strategy='median')),
    ('escalador', StandardScaler()),
])

pipeline_numerico

In [None]:
# Quick check of the numeric pipeline on two columns.
# Note: fit_transform fits pipeline_numerico itself on this two-column subset.
pipeline_numerico.fit_transform(X_train[['latitude','longitude']])

array([[ 0.06222692,  4.69670681],
       [-0.8680187 ,  0.89020859],
       [ 1.8564551 ,  1.10574337],
       ...,
       [-0.33663282, -0.14603375],
       [ 0.9115657 ,  0.10210293],
       [-0.65806362,  0.61082841]])

In [None]:
# Categorical preprocessing: impute with the most frequent value, then
# dummy-encode while dropping the first level of each feature.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, the same encoding as the dropped
# first level - confirm this is acceptable for the model.
pipeline_categorico = Pipeline(steps=[
    ('imputador', SimpleImputer(strategy='most_frequent')),
    ('codificador', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

pipeline_categorico

In [None]:
# Combine both sub-pipelines: each column group goes through its own
# transformer and the outputs are concatenated

preprocesamiento = ColumnTransformer(transformers=[
    ('variables_numericas', pipeline_numerico, numeric_cols),
    ('variables_categoricas', pipeline_categorico, cat_cols),
])
preprocesamiento

## 6. Pipeline Modelamiento

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Full modelling pipeline: preprocessing followed by a regressor. The
# 'regresor' step holds a default KNN here and is swapped by the search grids.
model_pipe = Pipeline(steps=[
    ('preprocesamiento', preprocesamiento),
    ('regresor', KNeighborsRegressor()),
])

model_pipe

### GridSearchCV

In [None]:
# Preview of 10 log-spaced values between 1e-10 and 1e10 (display only; the
# result is not assigned to any variable)
np.logspace(-10,10, num = 10)

array([1.00000000e-10, 1.66810054e-08, 2.78255940e-06, 4.64158883e-04,
       7.74263683e-02, 1.29154967e+01, 2.15443469e+03, 3.59381366e+05,
       5.99484250e+07, 1.00000000e+10])

In [None]:
# Candidate models and their hyperparameter grids. Each dict replaces the
# pipeline's 'regresor' step and tunes that estimator's own parameters.
param_grid = [
    # k-nearest neighbours: n_neighbors = 1..30 (30 candidates)
    {
        'regresor': [KNeighborsRegressor()],
        'regresor__n_neighbors': np.arange(1, 31),
    },
    # Decision tree: max_depth = 1..30 (30 candidates)
    {
        'regresor': [DecisionTreeRegressor(random_state=0)],
        'regresor__max_depth': np.arange(1, 31),
    },
    # Random forest: 4 x 4 x 3 x 1 = 48 candidates
    {
        'regresor': [RandomForestRegressor(n_jobs=-1, random_state=0)],
        'regresor__n_estimators': np.arange(100, 401, 100),  # 100, 200, 300, 400
        'regresor__max_depth': np.arange(1, 11, 3),          # 1, 4, 7, 10
        'regresor__max_features': np.array([1, 5, 10]),
        'regresor__min_samples_leaf': [5],
    },
    # Linear regression with default settings (1 candidate)
    {
        'regresor': [LinearRegression()],
    },
]  # 109 candidates in total

# Metrics recorded for every candidate during cross-validation
scores = ['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2']

In [None]:
# Exhaustive search over param_grid: 109 candidates x 5 folds = 545 fits.
# All metrics in `scores` are recorded; the best pipeline by negative RMSE is
# refit at the end.
grid_search = GridSearchCV(
    estimator=model_pipe,
    param_grid=param_grid,
    scoring=scores,
    refit='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=3,
)
grid_search

La siguiente celda tardó 28 minutos en ejecutarse.

In [None]:
%%time
# Run the full grid search (long-running), then persist the fitted search
# object to Google Drive so it can be reloaded without refitting.
grid_search.fit(X_train,y_train)
joblib.dump(grid_search,
            '/content/drive/MyDrive/Diplomado big data/model_grid.joblib')

In [None]:
# Reload the previously saved grid search instead of refitting.
# NOTE: joblib.load unpickles the file - only load files you trust.
grid_search = joblib.load('/content/drive/MyDrive/Diplomado big data/model_grid.joblib')

In [None]:
# Randomized counterpart of the grid search: evaluate only n_iter candidates
# sampled from param_grid instead of all of them.
random_grid_search = RandomizedSearchCV(
    model_pipe,
    param_grid,
    cv=5,  # 3 candidates x 5 folds = 15 fits
    scoring=scores,
    refit='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=3,
    n_iter=3,
    random_state=0,  # fix the sampled candidates so reruns are reproducible
    )
random_grid_search

In [None]:
%%time
# Fit the randomized search on the training split and time the cell
random_grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: user 1 s, sys: 83.7 ms, total: 1.09 s
Wall time: 21.1 s


In [None]:
# List the attributes of the fitted search object
dir(random_grid_search)
# Attributes of interest:
#  'best_estimator_',
#  'best_params_',
#  'best_score_',
#  'classes_',
#  'cv',
#  'cv_results_'

In [None]:
# Raw cross-validation results (a dict of arrays, one entry per candidate)
random_grid_search.cv_results_

In [None]:
# Tabulate the cross-validation results, one row per sampled candidate
results = pd.DataFrame(random_grid_search.cv_results_)

In [None]:
# Show up to 500 columns so the wide results table is not truncated
# (this changes the pandas display option for the rest of the session)
pd.set_option('display.max_columns', 500)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regresor__n_estimators,param_regresor__min_samples_leaf,param_regresor__max_features,param_regresor__max_depth,param_regresor,params,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,split3_test_neg_root_mean_squared_error,split4_test_neg_root_mean_squared_error,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error,split0_test_neg_mean_absolute_error,split1_test_neg_mean_absolute_error,split2_test_neg_mean_absolute_error,split3_test_neg_mean_absolute_error,split4_test_neg_mean_absolute_error,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
0,3.248019,0.139111,0.306036,0.042577,400.0,5.0,5.0,10,"RandomForestRegressor(n_jobs=-1, random_state=0)","{'regresor__n_estimators': 400, 'regresor__min...",-79.151628,-76.964191,-78.791901,-79.7363,-77.15916,-78.360636,1.104329,2,-58.502332,-57.166423,-58.116618,-58.898313,-57.162909,-57.969319,0.701964,3,0.283589,0.292568,0.281565,0.290615,0.302228,0.290113,0.00733,2
1,2.202196,0.655153,0.054397,0.019793,,,,24,"DecisionTreeRegressor(max_depth=14, random_sta...","{'regresor__max_depth': 24, 'regresor': Decisi...",-78.592521,-78.137231,-78.789622,-79.552119,-78.767257,-78.76775,0.45694,3,-49.801526,-50.61824,-51.017233,-50.503099,-49.941341,-50.376288,0.448322,2,0.293674,0.270839,0.281607,0.293888,0.27284,0.28257,0.009845,3
2,0.735218,0.26675,0.031403,0.006013,,,,14,"DecisionTreeRegressor(max_depth=14, random_sta...","{'regresor__max_depth': 14, 'regresor': Decisi...",-70.34609,-70.590718,-71.301207,-70.833859,-68.733324,-70.36104,0.872915,1,-43.988788,-44.571859,-44.727981,-44.279177,-42.982466,-44.110054,0.618039,1,0.434122,0.404883,0.411674,0.440176,0.446301,0.427431,0.016248,1


In [None]:
# Display the fitted randomized search object
random_grid_search

In [None]:
# Predict test-set prices with the refit best pipeline from the randomized search
random_grid_search.predict(X_test)



array([118.73333333,  63.84534271, 176.85714286, ...,  76.50202429,
       122.65517241, 377.53333333])

In [None]:
# Best mean cross-validated score for the refit metric
# (negative RMSE, so values closer to zero are better)
grid_search.best_score_

-60.34965974456511

In [None]:
# Hyperparameters of the regressor inside the best pipeline
grid_search.best_estimator_['regresor'].get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 7,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'splitter': 'best'}

In [None]:
# Overview of the columns recorded for the grid-search candidates
pd.DataFrame(grid_search.cv_results_).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 35 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   mean_fit_time                            109 non-null    float64
 1   std_fit_time                             109 non-null    float64
 2   mean_score_time                          109 non-null    float64
 3   std_score_time                           109 non-null    float64
 4   param_regresor                           109 non-null    object 
 5   param_regresor__n_neighbors              30 non-null     object 
 6   param_regresor__max_depth                78 non-null     object 
 7   param_regresor__max_features             48 non-null     object 
 8   param_regresor__min_samples_leaf         48 non-null     object 
 9   param_regresor__n_estimators             48 non-null     object 
 10  params                                   109 non-n

In [None]:
# Top 10 candidates by mean negative RMSE (largest value = lowest error)
pd.DataFrame(grid_search.cv_results_).nlargest(10,'mean_test_neg_root_mean_squared_error')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regresor,param_regresor__n_neighbors,param_regresor__max_depth,param_regresor__max_features,param_regresor__min_samples_leaf,param_regresor__n_estimators,...,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
36,0.205031,0.007296,0.027261,0.001213,"DecisionTreeRegressor(max_depth=7, random_stat...",,7.0,,,,...,0.479057,1,0.564383,0.574213,0.575815,0.598274,0.581285,0.578794,0.011164,1
20,0.093472,0.01602,15.440704,1.187727,KNeighborsRegressor(),21.0,,,,,...,0.571323,9,0.564045,0.56518,0.562648,0.591043,0.598429,0.576269,0.015279,2
19,0.087847,0.007705,14.494968,0.165982,KNeighborsRegressor(),20.0,,,,,...,0.600627,10,0.565235,0.565524,0.561936,0.590483,0.597777,0.576191,0.014881,3
15,0.139526,0.03396,15.179213,1.516705,KNeighborsRegressor(),16.0,,,,,...,0.660137,5,0.566265,0.566918,0.560692,0.587861,0.599128,0.576173,0.014745,4
21,0.11582,0.040407,14.173107,0.320685,KNeighborsRegressor(),22.0,,,,,...,0.549013,12,0.564478,0.564294,0.562585,0.590932,0.598039,0.576065,0.015221,5
18,0.083013,0.001134,14.543803,0.205711,KNeighborsRegressor(),19.0,,,,,...,0.66646,11,0.56467,0.565648,0.561591,0.588665,0.599386,0.575992,0.015169,6
16,0.140394,0.039536,14.258273,0.29865,KNeighborsRegressor(),17.0,,,,,...,0.68767,8,0.564978,0.565557,0.55947,0.589866,0.599573,0.575889,0.015822,7
14,0.145024,0.032776,14.03927,0.156281,KNeighborsRegressor(),15.0,,,,,...,0.675684,7,0.565526,0.56604,0.560312,0.587934,0.599239,0.57581,0.015082,8
17,0.093734,0.019132,14.470613,0.06748,KNeighborsRegressor(),18.0,,,,,...,0.693659,15,0.565492,0.563811,0.559146,0.58953,0.60004,0.575604,0.016145,9
28,0.082988,0.000629,14.512605,0.101326,KNeighborsRegressor(),29.0,,,,,...,0.422409,22,0.563467,0.562789,0.565956,0.59012,0.594212,0.575309,0.013865,10


In [None]:
# Test-set predictions from the best pipeline found by the grid search
y_predict = grid_search.best_estimator_.predict(X_test)



In [None]:
# Feature importances of the best regressor. This attribute only exists for
# estimators that expose feature_importances_ (e.g. tree-based models).
# NOTE(review): positions presumably follow the preprocessor's output columns
# (numeric first, then one-hot columns) - confirm before interpreting.
grid_search.best_estimator_['regresor'].feature_importances_

array([1.03132217e-01, 1.51937022e-01, 5.80773386e-02, 1.34687699e-01,
       1.10509870e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

### Random Search CV

In [None]:
# Search space for XGBoost inside the pipeline. Keys follow scikit-learn's
# nested-parameter convention '<step name>__<parameter>': the XGBRegressor is
# itself the 'regresor' step, so its hyperparameters are addressed as
# 'regresor__<param>'. An extra 'xgb__' segment would hand XGBoost unknown
# parameters named 'xgb__*', which it reports as "not used" and ignores, so
# every sampled candidate would train the same default model.
param_grid_xgb = [
              {'regresor': [XGBRegressor(random_state = 0)],
               'regresor__learning_rate': [0.1],
               'regresor__max_depth': [7, 10, 15, 20],
               'regresor__min_child_weight': [10, 15, 20, 25],
               'regresor__colsample_bytree': [0.8, 0.9, 1],
               'regresor__n_estimators': [300, 400, 500, 600],
               'regresor__reg_alpha': [0.5, 0.2, 1],
               'regresor__reg_lambda': [2, 3, 5],
               'regresor__gamma': [1, 2, 3]  # 5184 combinations in total

              }]

In [None]:
# Randomized search over the XGBoost space: 10 sampled candidates x 5 folds.
# The fixed seed makes the sampled candidates reproducible; the best pipeline
# by negative RMSE is refit at the end.
grid_search_random = RandomizedSearchCV(
    estimator=model_pipe,
    param_distributions=param_grid_xgb,
    n_iter=10,
    scoring=scores,
    refit='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=0,
)
grid_search_random

La siguiente celda se demora 13 minutos en ejecutarse.

In [None]:
%%time
# Run the randomized XGBoost search, then persist the fitted search object to
# Google Drive so it can be reloaded without refitting.
grid_search_random.fit(X_train,y_train)
joblib.dump(grid_search_random,
            '/content/drive/MyDrive/Diplomado big data/model_random.joblib')

Parameters: { "xgb__colsample_bytree", "xgb__gamma", "xgb__learning_rate", "xgb__max_depth", "xgb__min_child_weight", "xgb__n_estimators", "xgb__reg_alpha", "xgb__reg_lambda" } are not used.

CPU times: user 3.93 s, sys: 99.2 ms, total: 4.03 s
Wall time: 1min 1s


['/content/drive/MyDrive/Diplomado big data/model_random.joblib']

In [None]:
# Reload the previously saved randomized search instead of refitting.
# NOTE: joblib.load unpickles the file - only load files you trust.
grid_search_random = joblib.load('/content/drive/MyDrive/Diplomado big data/model_random.joblib')

In [None]:
# Show the search object's own settings. get_params is a method and must be
# called; without parentheses the cell only displays the bound-method repr.
# deep=False limits the output to the search's direct parameters instead of
# expanding every nested pipeline parameter.
grid_search_random.get_params(deep=False)

<bound method BaseEstimator.get_params of RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocesamiento',
                                              ColumnTransformer(transformers=[('numerica',
                                                                               Pipeline(steps=[('imputador',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('escalador',
                                                                                                StandardScaler())]),
                                                                               ['latitude',
                                                                                'longitude',
                                                                                'bathrooms',
                                

In [None]:
# Best mean cross-validated score for the refit metric
# (negative RMSE, so values closer to zero are better)
grid_search_random.best_score_

-58.25446611235854

In [None]:
# Tabulate the cross-validation results, one row per sampled candidate
pd.DataFrame(grid_search_random.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regresor__xgb__reg_lambda,param_regresor__xgb__reg_alpha,param_regresor__xgb__n_estimators,param_regresor__xgb__min_child_weight,param_regresor__xgb__max_depth,param_regresor__xgb__learning_rate,...,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2
0,2.57084,0.511899,0.064739,0.02308,5,0.2,600,25,15,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
1,1.941262,0.005662,0.047774,0.00323,2,1.0,400,10,15,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
2,2.520787,0.62107,0.065453,0.024592,2,1.0,600,15,20,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
3,1.969217,0.042414,0.04422,0.000822,2,1.0,500,20,15,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
4,2.158622,0.418407,0.047483,0.00665,5,1.0,600,10,15,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
5,2.333217,0.519604,0.048765,0.010274,5,1.0,600,20,10,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
6,1.959156,0.037619,0.046879,0.004995,3,1.0,500,10,20,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
7,2.516839,0.526957,0.06242,0.022144,5,1.0,400,15,15,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
8,1.943271,0.016087,0.043661,0.001145,2,0.2,400,10,7,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
9,2.326292,0.468734,0.052117,0.017535,2,0.2,500,25,7,0.1,...,0.471075,1,0.602252,0.586442,0.606108,0.622827,0.619649,0.607456,0.01308,1
