In [28]:
#%load_ext pycodestyle_magic
#%pycodestyle_on

import pandas as pd
import numpy as np
import pickle
import time
import datetime
import re

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [29]:
# define categorical, numerical and date columns
CAT_COLS = ["dia_semana", "codigo_cierre", "año_cierre", "mes_cierre", "mes", "delegacion_inicio",
            "incidente_c4", "clas_con_f_alarma", "tipo_entrada", "delegacion_cierre", "hora_creacion",
           "hora_cierre"]

DATE_COLS = ["fecha_creacion", "fecha_cierre"]

NUM_COLS = ["latitud", "longitud"]#esto irá en el script ingestion .py
def ingest_file(file_name):
    """
    Function to retrieve and return the accidents dataset.
    Parameters:
    -----------
    file_name: str
               Path to the file.
    Returns:
    --------
    df: pandas dataframe
    """
    df = pd.read_csv(file_name)
    return df


def drop_cols(df):
    """
    Function to drop unnnecesary columns in the dataset.
    """
    df.drop(columns = ['folio', 'geopoint', 'mes', 'mes_cierre', 'hora_cierre', 'año_cierre'], inplace = True)
    return df



def fill_na(df):
    """
    Function to fill null values in a dataframe.
    """
    #aquí podemos ir agregando más cosas cuando descubramos 
    #cómo imputar valores faltantes para latitud y longitud
    df.fillna({
        'delegacion_inicio': 'No Disponible',
        'delegacion_cierre': 'No Disponible'
              }, inplace = True)
    return df


def categoric_transformation(col,df):
    df[col] = df[col].astype("category")
    return df 

def create_categorical(cols, df):
    """
    Function to transform and prepare the categorical features in the dataset.
    """
    #transform to appropriate data type
    for col in cols: 
        df = categoric_transformation(col, df)
     
    return df


def date_transformation(col,df):
    """
    Function to prepare and transform date-type columns. 
    """
    df[col] = pd.to_datetime(df[col], dayfirst=True)
    return df

def create_date_cols(cols, df):
    for col in cols:
        df = date_transformation(col, df)
    return df 



def generate_label(df):
    """
    Function to create a new column indicating whether there was
    a false alarm or not. 
    Parameters:
    -----------
    df: pandas dataframe
    
    Returns:
    --------
    df: pandas dataframe
    """
    #transformamos la columna para solo quedarnos con la letra del código
    df["codigo_cierre"] = df["codigo_cierre"].apply(lambda x: x[1])
    df['label'] = np.where(
        (df.codigo_cierre == 'F') | (df.codigo_cierre == 'N'), 1, 0)
    return df 


def clean_hora_creacion(df):
    """
    Function to transform hours with incorrect format to timedelta format. 
    """
    horas_raw = df.hora_creacion.values.tolist()
    horas_clean = [datetime.timedelta(days=float(e)) if e.startswith("0.") else e for e in horas_raw]
    df["hora_creacion"] = horas_clean
    return df 


def create_simple_hour(df):
    """
    Function to extract the hour from the column "hora_creacion"
    Parameters:
    -----------
    df: pandas dataframe
    
    Returns:
    ---------
    df: pandas dataframe with a new column indicating the hour. 
    """
    #la función se podria adaptar para devolver minuto o segundo pero no lo considero necesario
    pattern = '\d+' #encuentra uno o más dígitos
    horas_raw = df.hora_creacion.astype(str).values #son así: '22:35:04', '22:50:49', '09:40:11'
    n = len(horas_raw)
    horas_clean = [0]*n #es más rápido reasignar valores que hacer .append()
    for i in range(n):
        hora_raw = horas_raw[i]
        hora_clean = re.match(pattern, hora_raw)[0] #solo queremos la hora, esto devuelve un objeto
        horas_clean[i] = hora_clean
    
    df["hora_simple"] = horas_clean
    return df 


def add_date_columns(df):
    """
    Esta función es muy importante puesto que nos ayudará a crear el mes, día y año de creación
    del registro. De esta manera podemos prescindir de las fechas de cierre, que no tendríamos en tiempo
    real en un modelo. 
    Parameters:
    -----------
    df: pandas dataframe
    
    Returns:
    ---------
    df: pandas dataframe with 4 new columns
    """
    mapping_meses = {1: "Enero", 2: "Febrero", 3: "Marzo", 4: "Abril", 5: "Mayo",
                       6: "Junio", 7: "Julio", 8: "Agosto", 9: "Septiembre", 10: "Octubre",
                       11: "Noviembre", 12: "Diciembre"}
    
    
    df["año_creacion"] = df.fecha_creacion.dt.year
    df["mes_creacion"] = df.fecha_creacion.dt.month
    df["dia_creacion"] = df.fecha_creacion.dt.day
    df["mes_creacion_str"] = df.mes_creacion.map(mapping_meses)
    df["año_creacion"] = df["año_creacion"].astype(str)
    return df 


def create_time_blocks(df):
    """
    Function to group the hour of the day into 3-hour blocks.
    Parameters:
    -----------
    df: pandas dataframe
    
    Returns:
    ---------
    df: pandas dataframe with a new column indicating the time-block.
    """
    horas_int = set(df.hora_simple.astype(int).values) #estaba como categórico
    f = lambda x: 12 if x == 0 else x
    mapping_hours = {}
    for hora in horas_int:
        grupo = (hora // 3) * 3
        if grupo < 12: 
            nombre_grupo = str(f(grupo)) + "-" + str(grupo + 2) + " a.m."
        else:
            hora_tarde = grupo % 12
            nombre_grupo = str(f(hora_tarde)) + "-" + str(hora_tarde + 2) + " p.m."
        mapping_hours[hora] = nombre_grupo
    
    df["espacio_del_dia"] = df["hora_simple"].astype(int).map(mapping_hours)
    return df
    

    

    
def basic_preprocessing(path):
    """
    Function to summarize all the preprocessing done to the data.
    Parameters:
    -----------
    path: str
          Path to your file
    
    Returns:
    ---------
    df: pandas dataframe
    """
    df = ingest_file(path) 
    df = generate_label(df)
    df = fill_na(df) 
    df = clean_hora_creacion(df)
    df = create_categorical(CAT_COLS, df) #transform to appropriate data types
    df = create_date_cols(DATE_COLS, df)
    df = add_date_columns(df)
    df = create_simple_hour(df)
    df = create_time_blocks(df)
    df = drop_cols(df)
    
    return df

In [60]:
# Hacemos un preprocesamiento básico
incidentes_viales_df = basic_preprocessing('incidentes-viales-c5.csv')

In [61]:
 # Eliminamos las columnas que no queremos
 incidentes = incidentes_viales_df.drop(['codigo_cierre',
                                           'fecha_creacion', 'fecha_cierre',
                                           'hora_creacion', 'clas_con_f_alarma',
                                           'año_creacion', 'dia_semana', 'mes_creacion_str',
                                           'delegacion_cierre'
                                           ], axis=1)

In [62]:
# Eliminamos los renglones NaN
incidentes = incidentes[incidentes['incidente_c4'].notna()]



In [None]:
# Establecemos nuestras X y Y
#X = incidentes[['dia_creacion', 'delegacion_inicio', 'incidente_c4',
#                'tipo_entrada', 'espacio_del_dia', 'mes_creacion',
#                'hora_simple']]
#y = incidentes['label']

## Feature generation


In [65]:
incidentes.hora_simple = incidentes.hora_simple.astype(int)

In [66]:
# Transformaciones cíclicas

HOURS = 24
MONTHS = 12
DAYS = 7

incidentes['sin_hr'] = np.sin(2*np.pi*incidentes.hora_simple/HOURS)
incidentes['cos_hr'] = np.cos(2*np.pi*incidentes.hora_simple/HOURS)

incidentes['sin_month'] = np.sin(2*np.pi*incidentes.mes_creacion/MONTHS)
incidentes['cos_month'] = np.cos(2*np.pi*incidentes.mes_creacion/MONTHS)

incidentes['sin_day'] = np.sin(2*np.pi*incidentes.dia_creacion/DAYS)
incidentes['cos_day'] = np.cos(2*np.pi*incidentes.dia_creacion/DAYS)

incidentes

Unnamed: 0,delegacion_inicio,incidente_c4,latitud,longitud,tipo_entrada,label,mes_creacion,dia_creacion,hora_simple,espacio_del_dia,sin_hr,cos_hr,sin_month,cos_month,sin_day,cos_day
0,VENUSTIANO CARRANZA,accidente-choque sin lesionados,19.422113,-99.084643,BOTÓN DE AUXILIO,0,1,23,22,9-11 p.m.,-5.000000e-01,0.866025,0.500000,0.866025,0.974928,-0.222521
1,CUAJIMALPA,accidente-choque con lesionados,19.358320,-99.297641,BOTÓN DE AUXILIO,0,1,23,22,9-11 p.m.,-5.000000e-01,0.866025,0.500000,0.866025,0.974928,-0.222521
2,TLALPAN,accidente-choque sin lesionados,19.217070,-99.219070,LLAMADA DEL 066,1,1,24,9,9-11 a.m.,7.071068e-01,-0.707107,0.500000,0.866025,0.433884,-0.900969
3,MAGDALENA CONTRERAS,accidente-choque sin lesionados,19.320580,-99.241010,LLAMADA DEL 066,1,1,24,22,9-11 p.m.,-5.000000e-01,0.866025,0.500000,0.866025,0.433884,-0.900969
4,MIGUEL HIDALGO,accidente-choque sin lesionados,19.452900,-99.215870,LLAMADA DEL 066,0,1,24,4,3-5 a.m.,8.660254e-01,0.500000,0.500000,0.866025,0.433884,-0.900969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1383133,GUSTAVO A. MADERO,lesionado-atropellado,19.550230,-99.151710,LLAMADA DEL 911,0,10,31,11,9-11 a.m.,2.588190e-01,-0.965926,-0.866025,0.500000,0.433884,-0.900969
1383134,CUAUHTEMOC,accidente-ciclista,19.443780,-99.136960,LLAMADA DEL 911,0,10,31,11,9-11 a.m.,2.588190e-01,-0.965926,-0.866025,0.500000,0.433884,-0.900969
1383135,COYOACAN,accidente-choque con lesionados,19.346310,-99.144900,LLAMADA DEL 911,0,10,31,12,12-2 p.m.,1.224647e-16,-1.000000,-0.866025,0.500000,0.433884,-0.900969
1383136,IZTAPALAPA,accidente-choque sin lesionados,19.367560,-99.056450,LLAMADA DEL 911,1,10,31,15,3-5 p.m.,-7.071068e-01,-0.707107,-0.866025,0.500000,0.433884,-0.900969



### Pre-procesamiento con OneHotEncoder

In [68]:
transformers = [('one_hot', OneHotEncoder(), ['delegacion_inicio', 'incidente_c4',
                                               'tipo_entrada', 'espacio_del_dia'])]

col_trans = ColumnTransformer(transformers, remainder="drop", n_jobs=-1, verbose=True)

In [69]:
  col_trans.fit(incidentes)

ColumnTransformer(n_jobs=-1, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('one_hot',
                                 OneHotEncoder(categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               sparse=True),
                                 ['delegacion_inicio', 'incidente_c4',
                                  'tipo_entrada', 'espacio_del_dia'])],
                  verbose=True)

In [70]:
incidente_input_vars = col_trans.transform(incidentes)

In [71]:
incidente_input_vars.shape

(1383138, 60)

In [73]:
X = incidente_input_vars
y = incidentes.label.values.reshape(incidente_input_vars.shape[0],)

In [74]:
np.random.seed(1993)

## Feature selection

In [75]:
variance_threshold = VarianceThreshold(threshold=0.07)
variance_threshold.fit(incidente_input_vars)

VarianceThreshold(threshold=0.07)

In [76]:
variance_threshold.transform(incidente_input_vars)

<1383138x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4441001 stored elements in Compressed Sparse Row format>

In [77]:
variance_threshold.variances_

array([6.83507347e-02, 4.20408739e-02, 6.97353427e-02, 7.32931485e-02,
       1.89410199e-02, 9.17748871e-02, 9.36266865e-02, 4.05505916e-02,
       1.35569539e-01, 1.42943125e-02, 7.67517022e-02, 5.02532276e-03,
       1.14219949e-04, 2.35786241e-02, 5.91581834e-02, 6.35880865e-02,
       3.65665256e-02, 2.16897626e-06, 1.75888331e-01, 1.68461965e-03,
       2.47515873e-01, 1.96123515e-03, 3.97630699e-05, 2.24123010e-05,
       3.86131149e-02, 1.77900320e-03, 4.03739193e-03, 8.09819018e-04,
       3.54141387e-04, 6.15611151e-04, 2.24391510e-02, 7.82389038e-04,
       1.42587354e-03, 1.10605793e-04, 5.07283948e-04, 4.35784544e-03,
       1.20355062e-01, 3.97630699e-05, 7.22993134e-07, 2.68881304e-04,
       1.44598522e-06, 3.90264149e-04, 1.66285776e-05, 5.85590566e-05,
       5.36171834e-02, 2.36291730e-03, 6.80914207e-03, 2.22794267e-01,
       2.48807168e-01, 5.30522411e-02, 3.38436906e-03, 4.64227888e-03,
       6.38177623e-02, 1.25450057e-01, 3.81468199e-02, 1.46773390e-01,
      

# GridSearch


In [None]:
# ocuparemos un RF
classifier = RandomForestClassifier(oob_score=True, random_state=1993)
# separando en train, test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# definicion de los hiperparametros que queremos probar
hyper_param_grid = {'n_estimators': [200, 500], 
                    'max_depth': [5, 8, 10],
                    'min_samples_split': [2, 5]}

# ocupemos grid search!
gs = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'precision',
                           cv = 3, # temporales y no aleatorios
                           n_jobs = -1)
start_time = time.time()
gs.fit(X, y)
print("Tiempo en ejecutar: ", time.time() - start_time)

In [None]:
# Obtenemos los mejores parámetros
gs.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_split': 2,
 'n_estimators': 200}

In [None]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=1993,
                       verbose=0, warm_start=False)

In [None]:
gs.best_estimator_.oob_score_

0.7909763601937962

## Features importantes

In [None]:
# Obtenemos los estimadores de los features importantes
feature_imp = pd.Series(gs.best_estimator_.feature_importances_,index=col_trans.get_feature_names()).sort_values(ascending=False)

In [None]:
feature_imp

## Prediction

In [None]:
predicted_labels = gs.predict(X_test)

In [None]:
predicted_scores = gs.predict_proba(X_test)

In [None]:
predicted_scores

array([[0.88651998, 0.11348002],
       [0.74214685, 0.25785315],
       [0.82098888, 0.17901112],
       ...,
       [0.81857563, 0.18142437],
       [0.74479317, 0.25520683],
       [0.73859786, 0.26140214]])