In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame

In [18]:
# Read the training table and discard the 'id' and 'year' columns up front.
data = pd.read_csv('data/training_data.csv').drop(columns=['id', 'year'])

# Positional split of what is left: columns 0-3 become the feature matrix
# (cast to float) and column 4 becomes the target vector.
X = data.iloc[:, :4].astype(float)
y = data.iloc[:, 4]

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

# Two-step pipeline: standardise the features, then fit gradient-boosted trees.
# make_pipeline names each step after its lower-cased class name, which is why
# the classifier's parameters are addressed as 'gradientboostingclassifier__<param>'.
#
# random_state pins the classifier's internal randomness so that repeated runs
# produce the same model; 0 matches the seed already used for the train/test split.
pipeline = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=0),
)

In [21]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the classifier step of the pipeline. Keys follow the
# '<step name>__<parameter>' convention used by scikit-learn pipelines.
param_grid = {
    'gradientboostingclassifier__max_depth': [1, 2, 3],
    'gradientboostingclassifier__n_estimators': [10, 50, 100, 200],
    # Each value is listed exactly once: a repeated entry makes the search fit
    # and cross-validate the same candidates twice without widening the grid.
    'gradientboostingclassifier__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
}

# Exhaustive search with 10-fold cross-validation (3 * 4 * 5 = 60 candidates).
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
# Current releases always average scores across folds without weighting by
# fold size, which is the behaviour `iid=False` used to request.
grid = GridSearchCV(pipeline, param_grid=param_grid, cv=10)

In [22]:
# Run the cross-validated search on the training split only; the test split
# is not touched here.
grid.fit(X=X_train, y=y_train)

# Report the best cross-validation score together with the parameter
# combination that produced it.
print(f'Score: {grid.best_score_} and Params: {grid.best_params_}')

Score: 0.8540072463768116 and Params: {'gradientboostingclassifier__learning_rate': 0.01, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__n_estimators': 100}


In [23]:
# Score the fitted search object on the held-out test split. This is left as
# the bare last expression of the cell so the notebook displays the result.
# NOTE(review): no `scoring` is passed to GridSearchCV, so this presumably
# reports the classifier's default metric (mean accuracy) — confirm.
grid.score(X_test, y_test)

0.8852459016393442

In [25]:
import pickle

# Serialise the whole fitted search object (not just the best estimator) to
# 'model.p' in the current working directory.
# Pickle files can execute arbitrary code when loaded, so only load this file
# from a trusted location.
with open('model.p', mode='wb') as fh:
    pickle.dump(grid, fh)