1. Agregar más parámetros a la búsqueda.
2. Devolver los parámetros de grid_search para poder generar el reporte.

In [1]:
# Custom functions
# ==============================================================================
from funciones import CargarPandasDatasetCategoricos

# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import pprint
import time

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from sklearn import tree #La versión que tengo es 0.24.1 y está disponible apartir de la 0.21

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import f1_score, precision_recall_fscore_support, plot_confusion_matrix
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
def grid(param_grid, X, y):
    """Grid-search a random forest and return the winner's feature importances.

    Runs ``GridSearchCV`` over a ``RandomForestClassifier`` with 5-fold
    cross-validation scored by F1, prints the best mean score together with
    the best parameter combination, and returns the feature importances of
    the refitted best estimator.

    Parameters
    ----------
    param_grid : dict, list of dict, or other iterable of dict
        Search space. A dict or list is handed to ``GridSearchCV`` as is.
        Anything else (for example a ``ParameterGrid`` whose entries map
        parameter names to lists of candidate values) is materialised into
        a plain dict / list of dicts first.
    X : feature matrix, forwarded unchanged to ``GridSearchCV.fit``.
    y : binary target, forwarded unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    The ``feature_importances_`` attribute of ``best_estimator_``.

    Raises
    ------
    Any exception raised while fitting a candidate, because the search is
    configured with ``error_score="raise"``.
    """
    # GridSearchCV documents only "dict or list of dicts" for param_grid.
    # Other iterables of dicts used to work by accident of iteration and are
    # (as far as the sklearn docs indicate) rejected by releases that validate
    # constructor arguments, so normalise them here.
    if not isinstance(param_grid, (dict, list)):
        if hasattr(param_grid, 'items'):
            param_grid = dict(param_grid)
        else:
            param_grid = list(param_grid)

    # Named `search` rather than `grid` so the local does not shadow this
    # function's own name.
    search = GridSearchCV(
        RandomForestClassifier(),
        param_grid,
        cv=5,
        scoring='f1',
        verbose=1,
        error_score="raise",
    )
    search.fit(X, y)

    # Report the best mean cross-validated score and its parameters.
    print(f'\nf1: {search.best_score_}\nparametros:')
    pprint.pprint(search.best_params_, width=13)
    print('\n')

    return search.best_estimator_.feature_importances_

In [3]:
def guardarImportanciasBestRF(param_grid, X, y, ruta_archivo):
    """Run the grid search and save the best model's feature importances.

    The output file has two comma-separated lines: the column names of ``X``
    followed by the matching importances returned by ``grid``. No trailing
    newline is written after the second line.

    Parameters
    ----------
    param_grid : search space forwarded to ``grid``.
    X : feature table; must expose ``.columns`` as an iterable of strings.
    y : target forwarded to ``grid``.
    ruta_archivo : path of the text file to (over)write.

    Raises
    ------
    Whatever ``grid`` raises; in that case ``ruta_archivo`` is left untouched.
    """
    # Run the (potentially hours-long) search BEFORE opening the file, so a
    # failed search does not truncate a report saved by an earlier run.
    importancias = grid(param_grid, X, y)

    # The context manager closes the handle even if a write fails.
    with open(ruta_archivo, "w") as textfile:
        textfile.write(','.join(X.columns) + "\n")
        textfile.write(','.join(map(str, importancias)))

## Cargar datos

In [4]:
# Load the ENDIREH survey extract through the project helper.
# Presumably returns a pandas DataFrame with categorical columns, judging by
# the helper's name — TODO confirm against funciones.py.
endireh = CargarPandasDatasetCategoricos('datasets/endireh.csv')

In [5]:
# List the available columns; P9_8 is the one used as the target below.
endireh.columns

Index(['CVE_ENT', 'CVE_MUN', 'REGION', 'DOMINIO', 'T_INSTRUM', 'PISOS', 'P1_2',
       'P1_2_A', 'FOCOS', 'P1_4_1', 'P1_4_2', 'P1_4_3', 'P1_4_4', 'P1_4_5',
       'P1_4_6', 'P1_4_7', 'P1_4_8', 'P1_4_9', 'AGUA', 'DRENAJE', 'P1_7',
       'P1_9', 'P1_10_1', 'P1_10_2', 'P1_10_3', 'P1_10_4', 'PAREN', 'EDAD',
       'RES_MADRE', 'RES_PADRE', 'ESCOLARIDAD', 'GRA', 'ALFABETISMO',
       'ASISTENCIA_ESC', 'PERT_INDIGENA', 'LENG_INDIGENA', 'LENG_ESPAÑOL',
       'TRABAJO', 'P2_14', 'P2_15', 'P9_1', 'P9_3', 'NACIO_VIV', 'NACIO_MUERT',
       'ABORTO', 'P9_5', 'P9_6', 'P9_7', 'P9_8_11', 'P9_8'],
      dtype='object')

In [8]:
# NOTE(review): the stored output below is a PAIR of shapes,
# (15647, 50) and (17485, 50), which a single `.shape` cannot produce.
# The row counts equal the REGION value_counts totals shown later for
# atencion_no_autorizada and abuso_y_violencia, so this cell was probably
# edited after it was run — confirm and restore the original expression.
endireh.shape

((15647, 50), (17485, 50))

## Preprocesamiento

### Codificar cada tipo de violencia, y también 'ambos', como 1.

In [7]:
# Binarise the target in place: every P9_8 value greater than 0 becomes 1.
# NOTE(review): this mutates `endireh` directly, and the cell's execution
# number (In [7]) is lower than the previous cell's (In [8]), so the notebook
# was not run top to bottom.
endireh.loc[endireh["P9_8"]>0, "P9_8"] = 1

### Obtener la variable objetivo _y_.

In [9]:
# Target vector: an independent copy of the P9_8 column.
# NOTE(review): `atencion_no_autorizada` is not defined in any visible cell,
# so a fresh kernel raises NameError here. This `y` is also not what the
# national searches below receive (they pass y_atencion_no_autorizada and
# y_abuso_y_violencia), and the per-region loop later overwrites it.
y = atencion_no_autorizada['P9_8'].copy()

## Encontrar los mejores parámetros para nacional

In [10]:
# Print the local wall-clock time (HH:MM:SS) before the long-running searches.
print("Current Time is :", time.strftime("%H:%M:%S", time.localtime()))

Current Time is : 20:44:35


In [11]:
# Search space for the random-forest grid search, written as the plain
# "parameter name -> list of candidate values" dict that GridSearchCV
# documents. The previous ParameterGrid wrapper with double-nested lists only
# worked because GridSearchCV happened to iterate it into this same dict.
# Candidate count is unchanged: 3 * 2 * 12 * 25 * 2 = 3600.
param_grid = {
    # NOTE(review): per the RandomForestClassifier docs 'auto' is the same as
    # 'sqrt', so a third of the candidates are duplicates; kept so the search
    # matches the recorded outputs.
    'max_features'      : ['auto', 'sqrt', 'log2'],
    'criterion'         : ['gini', 'entropy'],
    'min_samples_split' : list(range(285, 580, 25)),
    'min_samples_leaf'  : list(range(150, 200, 2)),
    'bootstrap'         : [False, True],
    # Single-valued entries: held fixed rather than searched.
    'n_estimators'      : [150],
    #'max_depth'         : [6],
    'random_state'      : [5],
    'n_jobs'            : [-1],
}


In [12]:
# National search on the "atencion no autorizada" table, using every column
# except REGION and the target P9_8 as features.
# NOTE(review): `y_atencion_no_autorizada` is not defined in any visible cell.
# NOTE(review): `inplace=False` is already the default of DataFrame.drop.
# NOTE(review): the recorded run reports f1: 0.0 and returns the first
# combination of the grid, which suggests every candidate tied at zero —
# check the class balance and the large min_samples_* values before trusting
# the saved importances.
%time guardarImportanciasBestRF(param_grid, atencion_no_autorizada.drop(columns=['REGION', 'P9_8'], inplace=False), y_atencion_no_autorizada, 'FI/Atencion_Nacional_FI.txt')

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits

f1: 0.0
parametros:
{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_leaf': 150,
 'min_samples_split': 285,
 'n_estimators': 150,
 'n_jobs': -1,
 'random_state': 5}


CPU times: user 2h 1min 48s, sys: 6min 52s, total: 2h 8min 41s
Wall time: 2h 48min 57s


In [13]:
# National search on the "abuso y violencia" table, using every column except
# REGION and the target P9_8 as features.
# NOTE(review): `abuso_y_violencia` and `y_abuso_y_violencia` are not defined
# in any visible cell.
# NOTE(review): the recorded run reports f1: 0.0 with the first combination
# of the grid, same as the previous search — verify before using the result.
%time guardarImportanciasBestRF(param_grid, abuso_y_violencia.drop(columns=['REGION', 'P9_8'], inplace=False), y_abuso_y_violencia, 'FI/Abuso_Nacional_FI.txt')

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits

f1: 0.0
parametros:
{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_leaf': 150,
 'min_samples_split': 285,
 'n_estimators': 150,
 'n_jobs': -1,
 'random_state': 5}


CPU times: user 2h 9min 28s, sys: 7min 17s, total: 2h 16min 46s
Wall time: 2h 58min 22s


## Encontrar los mejores parámetros por región

In [14]:
# Rows per REGION. The recorded counts range from 843 to 3040, while the grid
# searches min_samples_split up to 560 and min_samples_leaf from 150, which is
# large relative to the smallest regions.
atencion_no_autorizada['REGION'].value_counts()

4    3040
1    2631
3    2560
0    2443
6    1601
7    1567
5     962
2     843
Name: REGION, dtype: int64

In [15]:
# Rows per REGION for the second table (recorded counts: 930 to 3428).
abuso_y_violencia['REGION'].value_counts()

4    3428
1    2988
3    2843
0    2708
6    1779
7    1728
5    1081
2     930
Name: REGION, dtype: int64

In [16]:
# Remove the state / municipality codes and group each table by REGION.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
# NOTE(review): the national searches above ran before this drop, so they
# used CVE_MUN and CVE_ENT as features while the regional ones do not.
atencion_no_autorizada.drop(columns=['CVE_MUN', 'CVE_ENT'], inplace=True, errors='ignore')
gk_atencion = atencion_no_autorizada.groupby('REGION')
abuso_y_violencia.drop(columns=['CVE_MUN', 'CVE_ENT'], inplace=True, errors='ignore')
gk_abuso = abuso_y_violencia.groupby('REGION')

In [17]:
%%time
# One grid search per region and per table, saving each best model's feature
# importances to its own file. Region keys are the strings '0'..'7'.
# Each regional frame still carries its (constant) REGION column; only the
# target P9_8 is removed from the features.
for grupo in map(str, range(8)):
    trabajos = (
        (f'\nAtencion no autorizada: {grupo}', gk_atencion, f'FI/Atencion_Region_{grupo}_FI.txt'),
        (f'\nAbuso y violencia: {grupo}', gk_abuso, f'FI/Abuso_Region_{grupo}_FI.txt'),
    )
    for encabezado, agrupado, ruta in trabajos:
        print(encabezado)
        X = agrupado.get_group(grupo)
        y = X['P9_8'].copy()
        guardarImportanciasBestRF(param_grid, X.drop(columns=['P9_8']), y, ruta)


Atencion no autorizada: 0
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits

f1: 0.0
parametros:
{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_leaf': 150,
 'min_samples_split': 285,
 'n_estimators': 150,
 'n_jobs': -1,
 'random_state': 5}



Abuso y violencia: 0
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits

f1: 0.0
parametros:
{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_leaf': 150,
 'min_samples_split': 285,
 'n_estimators': 150,
 'n_jobs': -1,
 'random_state': 5}



Atencion no autorizada: 1
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits

f1: 0.0
parametros:
{'bootstrap': False,
 'criterion': 'gini',
 'max_features': 'auto',
 'min_samples_leaf': 150,
 'min_samples_split': 285,
 'n_estimators': 150,
 'n_jobs': -1,
 'random_state': 5}



Abuso y violencia: 1
Fitting 5 folds for each of 3600 candidates, totalling 18000 fits

f1: 0.0
parametros:
{'bootstrap': 

In [18]:
# Print the local wall-clock time (HH:MM:SS) once the regional searches end.
print("Current Time is :", time.strftime("%H:%M:%S", time.localtime()))

Current Time is : 06:38:07
