In [1]:
import numpy as np
import pandas as pd
from utils.prepare_dataset import prepare_dataset
from utils.prepare_dataset import FeaturesEngineering

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import  OneHotEncoder, MinMaxScaler, LabelEncoder


# Classifiers
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

import warnings 
# NOTE(review): with no category or module filter this hides every warning
# raised from here on, including deprecation notices from pandas and
# scikit-learn. Consider narrowing it to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

In [2]:
# Timestamp columns shared by the train and test CSV files.
DATE_COLUMNS = ['start_time', 'end_time']

# No longer passed to read_csv; kept only so that any other cell that still
# refers to `parser` keeps working.
parser = pd.to_datetime

# `parse_dates` alone converts the listed columns to datetimes. The previous
# `date_parser=` argument only wrapped pd.to_datetime and is deprecated since
# pandas 2.0, so it is dropped here.
df_train = pd.read_csv('data/train_set.csv', parse_dates=DATE_COLUMNS, low_memory=False)
df_test = pd.read_csv('data/test_set.csv', parse_dates=DATE_COLUMNS, low_memory=False)

In [13]:
# Run the project's `prepare_dataset` helper (imported from
# utils.prepare_dataset) on the raw training frame.
# NOTE(review): what the helper cleans or adds is not visible from this
# notebook; confirm it returns a new frame rather than mutating df_train.
df_prep = prepare_dataset(df_train)

In [14]:
# Target: the `passholder_type` column.
y = df_prep['passholder_type']

# Features: every remaining column except the target and `plan_duration`.
X = df_prep.drop(columns=['plan_duration', 'passholder_type'])

In [15]:
# Encode the target's class values as integer codes (0 .. n_classes - 1).
# `labels` holds the fitted encoder, so integer predictions can later be
# mapped back to the original class values with `labels.inverse_transform`.
labels = LabelEncoder()
y_labels = labels.fit_transform(y)

In [16]:
# Hold out 20% of the rows for the final evaluation.
# `random_state` pins the shuffle so the split, and every score computed from
# it, is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_labels, test_size=0.20, random_state=42
)

In [17]:
# Columns grouped by the preprocessing they receive.
num_features = ['duration', 'start_lat', 'start_lon', 'end_lat',
                'end_lon', 'month', 'day', 'hour']
cat_features = ['trip_route_category']

# Numeric columns are rescaled with MinMaxScaler; the categorical column is
# one-hot encoded, with categories unseen during fit encoded as all zeros.
num_transformer = MinMaxScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Only the columns listed above are passed on; ColumnTransformer drops the
# rest by default (remainder='drop').
column_transform = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

In [18]:
# Full modelling pipeline: project feature engineering, column-wise
# preprocessing, then a classifier. The XGBClassifier here is a placeholder
# that the grid below swaps out through the 'classifier' step.
pipe = Pipeline([
    ('features', FeaturesEngineering()),
    ('col_trans', column_transform),
    ('classifier', XGBClassifier()),
])

# One single-entry grid per candidate model, each with default
# hyperparameters, so the search compares the three classifiers as-is.
grid_param = [
    {'classifier': [candidate]}
    for candidate in (
        XGBClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
    )
]

In [19]:
# 3-fold cross-validated comparison of the candidate classifiers, using all
# available cores.
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
gridsearch.fit(X_train, y_train)

# `fit` returns the search object itself, so `best_model` is simply another
# name for the fitted `gridsearch`.
best_model = gridsearch

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ....................classifier=AdaBoostClassifier(); total time= 3.6min


KeyboardInterrupt: 

In [64]:
# Show the pipeline that was refit on the full training split with the
# winning candidate.
# NOTE(review): this depends on `best_model` from the grid-search cell, whose
# recorded run ended in KeyboardInterrupt; re-run that cell to completion
# before trusting this output.
best_model.best_estimator_

In [76]:
# Evaluate the refit best pipeline on the held-out split. GridSearchCV was
# created without `scoring`, so this is the estimator's default score
# (mean accuracy for classifiers).
best_model.score(X_test, y_test)

0.7307223995842305

In [81]:
# Predict on the unlabeled test set. The result is kept in `test_predictions`
# (integer class codes produced by the label encoder) instead of being thrown
# away.
# NOTE(review): df_test is passed in raw, whereas the training features went
# through prepare_dataset and had columns dropped first; confirm the pipeline
# accepts this frame as-is.
try:
    test_predictions = best_model.predict(df_test)

except Exception as e:
    # Stay best-effort so the notebook keeps running, but report the failure.
    # print() is used because warnings are globally silenced in this notebook.
    test_predictions = None
    print(f"Prediction on df_test failed: {type(e).__name__}: {e}")

### Optimización de Hiperparámetros

In [50]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
# Candidate values sampled by the randomized hyperparameter search.
learning_rate = [0.1, 0.3, 0.5]
max_depth = [3, 6, 9]
n_estimators = list(range(100, 501, 100))  # 100, 200, 300, 400, 500

In [67]:
# Search space keyed by pipeline parameter name: the `classifier__` prefix
# routes each value to the pipeline's 'classifier' step.
random_grid = dict([
    ('classifier__learning_rate', learning_rate),
    ('classifier__n_estimators', n_estimators),
    ('classifier__max_depth', max_depth),
])

In [68]:
# Randomized search over `random_grid`, starting from the pipeline selected
# by the grid search: 5 sampled candidates x 3 folds, scored by accuracy.
# `random_state` fixes which candidates are sampled so a re-run evaluates the
# same configurations.
# NOTE(review): the grid sets `max_depth` on the classifier step, which
# AdaBoostClassifier does not accept; this only works if the selected
# classifier is XGBClassifier or GradientBoostingClassifier.
gbr_random = RandomizedSearchCV(
    estimator=best_model.best_estimator_,
    param_distributions=random_grid,
    scoring='accuracy',
    n_iter=5,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

In [69]:
# Run the randomized search (15 fits in total).
# NOTE(review): in the recorded run, single fits took well over an hour each
# and the cell was interrupted; consider subsampling the training data or
# shrinking the grid before re-running.
gbr_random.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END classifier__learning_rate=0.1, classifier__max_depth=3, classifier__n_estimators=500; total time=78.2min
[CV] END classifier__learning_rate=0.1, classifier__max_depth=3, classifier__n_estimators=500; total time=74.8min


KeyboardInterrupt: 