In [1]:
import pandas as pd
import numpy as np
import os
from ipynb.fs.full.utils import save_df, load_df
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from numpy import savetxt

In [2]:
def load_transformation(path):
    return load_df(path)

In [4]:
path = 'C:/Users/diego172/Documents/Diego/ITAM/Clases/Mineria/Proyecto_1/Estructura_1_Prueba/outputs/transformation_df.pkl'

In [5]:
df = load_transformation(path)

In [7]:
c5_inputs = df.drop(['latitud', 'longitud', 'codigo_cierre','fecha_creacion','incidente_c4'], axis=1)

In [9]:
# sen cos transformation
seconds_in_day = 24*60*60

c5_inputs['sin_time'] = np.sin(2*np.pi*(c5_inputs.hora_creacion.dt.hour*60*60+c5_inputs.hora_creacion.dt.minute*60+c5_inputs.hora_creacion.dt.second)/seconds_in_day)
c5_inputs['cos_time'] = np.cos(2*np.pi*(c5_inputs.hora_creacion.dt.hour*60*60+c5_inputs.hora_creacion.dt.minute*60+c5_inputs.hora_creacion.dt.second)/seconds_in_day)


In [10]:
c5_inputs = c5_inputs.drop(['hora_creacion',], axis=1)

In [11]:
c5_inputs

Unnamed: 0,dia_semana,delegacion_inicio,clas_con_f_alarma,tipo_entrada,mes,label,sin_time,cos_time
0,Sábado,VENUSTIANO CARRANZA,EMERGENCIA,BOTÓN DE AUXILIO,1,0,-0.362167,0.932113
1,Sábado,CUAJIMALPA,URGENCIAS MEDICAS,BOTÓN DE AUXILIO,1,0,-0.297305,0.954782
2,Domingo,TLALPAN,EMERGENCIA,LLAMADA DEL 066,1,1,0.572921,-0.819611
3,Domingo,MAGDALENA CONTRERAS,EMERGENCIA,LLAMADA DEL 066,1,1,-0.338122,0.941102
4,Domingo,MIGUEL HIDALGO,EMERGENCIA,LLAMADA DEL 066,1,0,0.915750,0.401748
...,...,...,...,...,...,...,...,...
1383133,Jueves,GUSTAVO A. MADERO,URGENCIAS MEDICAS,BOTÓN DE AUXILIO,10,0,-0.900856,-0.434118
1383134,Jueves,VENUSTIANO CARRANZA,URGENCIAS MEDICAS,BOTÓN DE AUXILIO,10,0,-0.922566,-0.385839
1383135,Jueves,AZCAPOTZALCO,EMERGENCIA,LLAMADA DEL 911,10,0,-0.001891,-0.999998
1383136,Jueves,VENUSTIANO CARRANZA,URGENCIAS MEDICAS,LLAMADA DEL 911,10,0,-0.006181,-0.999981


In [12]:
transformers = [('one_hot', OneHotEncoder(), ['dia_semana', 'delegacion_inicio','clas_con_f_alarma','tipo_entrada','mes']),
               ('impute_sin_time', SimpleImputer(strategy="median"), ['sin_time']),
               ('impute_cos_time', SimpleImputer(strategy="median"), ['cos_time'])]

In [13]:
col_trans = ColumnTransformer(transformers, remainder="drop", n_jobs=-1, verbose=True)

In [14]:
col_trans.fit(c5_inputs)

ColumnTransformer(n_jobs=-1,
                  transformers=[('one_hot', OneHotEncoder(),
                                 ['dia_semana', 'delegacion_inicio',
                                  'clas_con_f_alarma', 'tipo_entrada', 'mes']),
                                ('impute_sin_time',
                                 SimpleImputer(strategy='median'),
                                 ['sin_time']),
                                ('impute_cos_time',
                                 SimpleImputer(strategy='median'),
                                 ['cos_time'])],
                  verbose=True)

In [15]:
c5_input_vars = col_trans.transform(c5_inputs)

In [16]:
variance_threshold = VarianceThreshold(threshold=0.1)
c5_input_variance=variance_threshold.fit_transform(c5_input_vars)
#variance_threshold.transform(c5_input_vars)
#variance_threshold.variances_

In [19]:
c5_input_variance

<1382537x14 sparse matrix of type '<class 'numpy.float64'>'
	with 6944642 stored elements in Compressed Sparse Row format>

In [20]:
X = c5_input_variance
y = df.label.values.reshape(c5_input_variance.shape[0],)


In [30]:
import time 

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# ocuparemos un RF
classifier = RandomForestClassifier(oob_score=True, random_state=1234)
# separando en train, test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# definicion de los hiperparametros que queremos probar
hyper_param_grid = {'n_estimators': [50,100], 
                    'max_depth': [1, 5],
                    'min_samples_split': [2, 5]}

# ocupemos grid search!
gs = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'precision',
                           cv = 5, 
                           n_jobs = -1)
start_time = time.time()
gs.fit(X, y)
print("Tiempo en ejecutar: ", time.time() - start_time)

Tiempo en ejecutar:  575.5030028820038


In [29]:
gs.cv_results_

{'mean_fit_time': array([106.57745485, 106.39317584, 222.3269485 , 208.70399804]),
 'std_fit_time': array([ 1.05778603,  0.95160789, 15.58546814,  0.94943242]),
 'mean_score_time': array([4.23092461, 4.2154386 , 3.79888592, 3.38418288]),
 'std_score_time': array([0.12164144, 0.16556672, 0.28646182, 0.5854106 ]),
 'param_max_depth': masked_array(data=[1, 1, 5, 5],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 5, 2, 5],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100, 100, 100, 100],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 100},
  {'max_depth': 1, 'min_samples_split': 5, 'n_estimators': 100},
  {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100},
  {

In [31]:
gs.best_params_

{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}

In [32]:
gs.best_score_

0.9386191619506148

In [33]:
gs.best_estimator_

RandomForestClassifier(max_depth=5, oob_score=True, random_state=1234)

In [34]:
gs.best_estimator_.oob_score_

0.8015510615629093