<a href="https://colab.research.google.com/github/alexandergribenchenko/DS_LATAM_Test/blob/main/LATAM_N_04_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

## B. Dataframe raw

In [2]:
path_github = 'https://raw.githubusercontent.com/alexandergribenchenko/DS_LATAM_Test/main/dataset_SCL.csv'             

In [3]:
df_raw = pd.read_csv(path_github, dtype=object)
df_raw

Unnamed: 0,Fecha-I,Vlo-I,Ori-I,Des-I,Emp-I,Fecha-O,Vlo-O,Ori-O,Des-O,Emp-O,DIA,MES,AÑO,DIANOM,TIPOVUELO,OPERA,SIGLAORI,SIGLADES
0,2017-01-01 23:30:00,226,SCEL,KMIA,AAL,2017-01-01 23:33:00,226,SCEL,KMIA,AAL,1,1,2017,Domingo,I,American Airlines,Santiago,Miami
1,2017-01-02 23:30:00,226,SCEL,KMIA,AAL,2017-01-02 23:39:00,226,SCEL,KMIA,AAL,2,1,2017,Lunes,I,American Airlines,Santiago,Miami
2,2017-01-03 23:30:00,226,SCEL,KMIA,AAL,2017-01-03 23:39:00,226,SCEL,KMIA,AAL,3,1,2017,Martes,I,American Airlines,Santiago,Miami
3,2017-01-04 23:30:00,226,SCEL,KMIA,AAL,2017-01-04 23:33:00,226,SCEL,KMIA,AAL,4,1,2017,Miercoles,I,American Airlines,Santiago,Miami
4,2017-01-05 23:30:00,226,SCEL,KMIA,AAL,2017-01-05 23:28:00,226,SCEL,KMIA,AAL,5,1,2017,Jueves,I,American Airlines,Santiago,Miami
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68201,2017-12-22 14:55:00,400,SCEL,SPJC,JAT,2017-12-22 15:41:00,400.0,SCEL,SPJC,JAT,22,12,2017,Viernes,I,JetSmart SPA,Santiago,Lima
68202,2017-12-25 14:55:00,400,SCEL,SPJC,JAT,2017-12-25 15:11:00,400.0,SCEL,SPJC,JAT,25,12,2017,Lunes,I,JetSmart SPA,Santiago,Lima
68203,2017-12-27 14:55:00,400,SCEL,SPJC,JAT,2017-12-27 15:35:00,400.0,SCEL,SPJC,JAT,27,12,2017,Miercoles,I,JetSmart SPA,Santiago,Lima
68204,2017-12-29 14:55:00,400,SCEL,SPJC,JAT,2017-12-29 15:08:00,400.0,SCEL,SPJC,JAT,29,12,2017,Viernes,I,JetSmart SPA,Santiago,Lima


# 03. Transformadores

## 03.01. FeatureSelector

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_FeatureSelector):
        self.feature_names = params_FeatureSelector['feature_names'] 
    
    def fit(self, X, y = None):
        return self 
    
    def transform(self, X, y = None):
      X_output = X[self.feature_names]
      return X_output

## 03.02. DateType

In [5]:
class DateType(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_FeatureSelector):
        self.cols_dates = params_DateType['cols_dates'] 
    
    def fit(self, X, y = None):
        return self 
    
    def transform(self, X, y = None):
      X_output = X.copy()
      for col in self.cols_dates:
        X_output[col] = pd.to_datetime(X_output[col])
      return X_output

## 03.03. TemporadaAlta

In [6]:
class TemporadaAlta(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_TemporadaAlta):
        self.columna_fecha = params_TemporadaAlta['columna_fecha'] 
    
    def fit(self, X, y = None):
        return self

    def fun_temporada_alta(self, fecha):
      mes = fecha.month
      dia = fecha.day
      if (mes == 12 and dia >= 15) or \
      (mes in [1, 2]) or \
      (mes == 3 and dia <= 3) or \
      (mes == 7 and (dia >= 15 and dia <= 31)) or \
      (mes == 9 and (dia >= 11 and dia <= 30)):
          return 1
      else:
          return 0
    
    def transform(self, X, y = None):
      X_output = X.copy()
      # X_output[self.columna_fecha] = pd.to_datetime(X_output[self.columna_fecha])
      X_output['temporada_alta'] = X_output[self.columna_fecha].apply(self.fun_temporada_alta)
      return X_output

## 03.04. DifMin

In [7]:
class DifMin(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_DifMin):
        self.empty = params_DifMin 
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
      X_output = X.copy()
      X_output['dif_min'] = (X_output['Fecha-O']-X_output['Fecha-I'])/pd.Timedelta(minutes=1)
      return X_output

## 03.05. Atraso15

In [8]:
class Atraso15(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_Atraso15):
        self.empty = params_Atraso15 
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
      X_output = X.copy()
      X_output['atraso_15'] = (X_output['dif_min'] > 15).astype(int)
      return X_output

## 03.05. PeriodoDia

In [9]:
class PeriodoDia(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_PeriodoDia):
        self.columna_fecha = params_PeriodoDia['columna_fecha'] 
    
    def fit(self, X, y = None):
        return self

    def fun_periodo_dia(self, fecha):
      hora = fecha.hour
      if hora >= 5 and hora <= 11:
          return 'mañana'
      elif hora >= 12 and hora <= 18:
          return 'tarde'
      else:
          return 'noche'
    
    def transform(self, X, y = None):
      X_output = X.copy()
      X_output['periodo_dia'] = X_output[self.columna_fecha].apply(self.fun_periodo_dia)
      return X_output

## 03.05. OrderOutput

In [10]:
class OrderOutput(BaseEstimator, TransformerMixin):
    
    def __init__(self, params_OrderOutput):
        self.feature_names = params_OrderOutput['feature_names'] 
    
    def fit(self, X, y = None):
        return self 
    
    def transform(self, X, y = None):
      X_output = X[self.feature_names]
      return X_output

# 04. Pipeline

## 04.01. Parametros

In [11]:
params_FeatureSelector = {}
params_FeatureSelector['feature_names']= ['Fecha-I', 
                                          'Des-I',
                                          'Emp-I', 
                                          'TIPOVUELO', 
                                          'Fecha-O',
                                         ]

params_DateType = {}
params_DateType['cols_dates']= ['Fecha-I', 'Fecha-O']

params_TemporadaAlta = {}
params_TemporadaAlta['columna_fecha']= 'Fecha-I'

params_DifMin = {}

params_Atraso15 = params_DifMin

params_PeriodoDia = params_TemporadaAlta

params_OrderOutput = {}
params_OrderOutput['feature_names']= ['Des-I',
                                      'Emp-I', 
                                      'TIPOVUELO',
                                      'temporada_alta',
                                      'periodo_dia',
                                      'atraso_15'	
                                      ]

In [12]:
Transformer_FeatureSelector = FeatureSelector(params_FeatureSelector)
Transformer_DateType = DateType(params_DateType)
Transformer_TemporadaAlta = TemporadaAlta(params_TemporadaAlta)
Transformer_DifMin = DifMin(params_DifMin)
Transformer_Atraso15 = Atraso15(params_Atraso15)
Transformer_PeriodoDia = PeriodoDia(params_PeriodoDia)
Transformer_OrderOutput = OrderOutput(params_OrderOutput)

In [13]:
pipeline_data_model = Pipeline(steps=[('NameFeatureSelector', Transformer_FeatureSelector),
                              ('NameDateType', Transformer_DateType),
                              ('NameTemporadaAlta', Transformer_TemporadaAlta),
                              ('NameDifMin', Transformer_DifMin),
                              ('NameDateAtraso15', Transformer_Atraso15),
                              ('NamePeriodoDia', Transformer_PeriodoDia),
                              ('NameOrderOutput', Transformer_OrderOutput)
                              ])

In [14]:
df_data_model = pipeline_data_model.transform(df_raw)
df_data_model

Unnamed: 0,Des-I,Emp-I,TIPOVUELO,temporada_alta,periodo_dia,atraso_15
0,KMIA,AAL,I,1,noche,0
1,KMIA,AAL,I,1,noche,0
2,KMIA,AAL,I,1,noche,0
3,KMIA,AAL,I,1,noche,0
4,KMIA,AAL,I,1,noche,0
...,...,...,...,...,...,...
68201,SPJC,JAT,I,1,tarde,1
68202,SPJC,JAT,I,1,tarde,1
68203,SPJC,JAT,I,1,tarde,1
68204,SPJC,JAT,I,1,tarde,0


# 05. Train - Test Split

In [21]:
X = df_data_model.drop('atraso_15', axis=1)
y = df_data_model.atraso_15

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [30]:
y.value_counts(normalize=True)

0    0.81506
1    0.18494
Name: atraso_15, dtype: float64

In [28]:
y_train.value_counts(normalize=True)

0    0.815055
1    0.184945
Name: atraso_15, dtype: float64

In [29]:
y_test.value_counts(normalize=True)

0    0.815072
1    0.184928
Name: atraso_15, dtype: float64

# 05. Pipeline modelo

In [37]:
X.columns

Index(['Des-I', 'Emp-I', 'TIPOVUELO', 'temporada_alta', 'periodo_dia'], dtype='object')

In [38]:
ohe_cols = ['Des-I', 'Emp-I', 'TIPOVUELO', 'temporada_alta', 'periodo_dia']

In [39]:
ohe_transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(), ohe_cols)
    ], remainder='passthrough')

In [40]:
pipeline = Pipeline(steps=[('ohe_transformer', ohe_transformer)])

In [63]:
X_prep = pipeline.fit_transform(X)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_prep, y, stratify=y, test_size=0.3, random_state=42)

In [65]:
logReg = LogisticRegression(class_weight = 'balanced')
model = logReg.fit(X_train, y_train)

In [69]:
y_pred = model.predict(X_test)

In [84]:
y_pred = model.predict_proba(X_test)

In [70]:
confusion_matrix(y_test, y_pred)

array([[10745,  5933],
       [ 1747,  2037]])

In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.64      0.74     16678
           1       0.26      0.54      0.35      3784

    accuracy                           0.62     20462
   macro avg       0.56      0.59      0.54     20462
weighted avg       0.75      0.62      0.66     20462



In [79]:
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt.fit(X_train, y_train)

In [80]:
y_pred = dt.predict(X_test)

In [85]:
y_pred = dt.predict_proba(X_test)

In [86]:
y_pred

array([[0.64319851, 0.35680149],
       [0.52360498, 0.47639502],
       [0.56823629, 0.43176371],
       ...,
       [0.74188228, 0.25811772],
       [1.        , 0.        ],
       [0.69995203, 0.30004797]])

In [81]:
confusion_matrix(y_test, y_pred)

array([[10704,  5974],
       [ 1641,  2143]])

In [82]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.64      0.74     16678
           1       0.26      0.57      0.36      3784

    accuracy                           0.63     20462
   macro avg       0.57      0.60      0.55     20462
weighted avg       0.76      0.63      0.67     20462



In [83]:
dt.feature_importances_

array([4.49321659e-03, 0.00000000e+00, 3.99293763e-04, 1.50794507e-02,
       1.11231023e-03, 1.68632263e-02, 4.80311191e-04, 3.44872784e-04,
       2.23116791e-04, 2.51518663e-03, 2.70021933e-03, 0.00000000e+00,
       0.00000000e+00, 7.48493299e-04, 1.72049837e-03, 5.96893224e-04,
       7.84909175e-02, 6.89222227e-03, 2.48744832e-03, 9.76609489e-03,
       4.52644416e-03, 9.45374276e-03, 2.99688436e-03, 4.75277893e-05,
       9.68952391e-05, 2.04596127e-04, 5.56096431e-05, 5.73080559e-04,
       6.88889204e-04, 1.39192075e-04, 7.86490094e-05, 3.64024991e-03,
       1.29485844e-02, 1.51810698e-02, 7.12190184e-03, 2.66218409e-03,
       6.84821249e-03, 1.05503298e-02, 7.77265413e-03, 1.08649259e-02,
       4.05326749e-03, 5.45369397e-03, 6.76380524e-03, 4.72933297e-03,
       2.10524507e-03, 3.52389731e-03, 5.55665739e-03, 6.03803581e-03,
       8.68264838e-03, 2.28866566e-03, 1.17550067e-03, 0.00000000e+00,
       0.00000000e+00, 8.63663237e-05, 1.65480900e-02, 4.63370905e-04,
      

In [None]:
import matplotlib.pyplot as plt

In [None]:
steps=[("standard_scaler",StandardScaler()),
      ("classifier",LogisticRegression())]

In [None]:
numeric_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

In [None]:
categorical_features = ['UniqueCarrier', 'Month']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

In [33]:
logReg = LogisticRegression()
model = logReg.fit(X_train, y_train)

ValueError: ignored

In [15]:
# df_input = df_raw[['Fecha-I', 'Des-I', 'Emp-I']]
# df_input