In [None]:
# %load my_imports.ipy
# Stdlib
from functools import partial
from itertools import chain
from pprint import pprint as print

import numpy as np
import pandas as pd

# Pandas display options: show up to 102 columns, never truncate cell text,
# and use a display width of 120 characters.
pd.set_option('display.max_columns', 102)
try:
    # None is the supported way to disable column-width truncation; the legacy
    # value -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Fallback for pre-1.0 pandas, which validates this option as an integer.
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.width', 120)
import sklearn
# Record the library versions this notebook was run with.
# `print` is pprint here (see the imports above), so the string is shown quoted.
print(f'numpy {np.__version__} pandas {pd.__version__} sklearn {sklearn.__version__}')

# Visualization

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from jupyterthemes import jtplot

# White axis labels and tick marks — presumably chosen to stay readable on the
# dark 'monokai' notebook theme applied below.
custom_style = {
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
}
sns.set_style(rc=custom_style)
# mpl.rcParams and plt.rcParams are the same object, so a single assignment
# sets the default figure size for both access paths.
mpl.rcParams['figure.figsize'] = (10, 10)
# NOTE(review): jtplot.style runs last and may override rcParams set above — confirm.
jtplot.style('monokai')
# Custom stuff
from swozny_ml import *
from genetic.param_opt import tune_params_genetic


In [None]:
# %load classify.py
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score

from genetic.estimator_params import params
from genetic.param_opt import tune_params_genetic
from swozny_ml import benchmark_models, tune_params

# Config

In [None]:
# --- Configuration: everything a reader might want to tune ---
filename = 'data.csv'                      # CSV file loaded below
index = 'shot_id'                          # column used as the DataFrame index
target = 'shot_made_flag'                  # label column; rows where it is null are scored at the end
prediction_params = ['Angle', 'Distance']  # feature columns fed to the models
scoring = 'accuracy'                       # metric name handed to the tuning helpers
voting = 'soft'                            # voting mode passed to VotingClassifier

# --- Load the raw data and preview the first rows ---
data = pd.read_csv(filename, index_col=index)
print(data.head(n=5))

# Prepare data

In [None]:
# Rows with a known target form the labelled training set.
training_cond = data[target].notna()
# Select rows and columns in one .loc step instead of chained indexing.
X_train = data.loc[training_cond, prediction_params]
y_train = data.loc[training_cond, target]

# Benchmark different estimators

In [None]:
# Candidate estimators: every entry of `params` whose class name contains
# "Classifier", plus LogisticRegression (whose name does not match that filter).
classifiers = [estimator for estimator in params if "Classifier" in estimator.__name__]
classifiers.append(LogisticRegression)
# Use the notebook-wide `scoring` setting rather than a hard-coded metric so the
# benchmark stays consistent with the tuning step.
benchmark = benchmark_models(classifiers, X_train, y_train, scoring=scoring)

# Report estimator performances

In [None]:
# Ascending sort on 'Mean' puts the highest values last; keep the final five rows.
considered_algorithms = benchmark.sort_values(by='Mean').iloc[-5:]
# Plot the complete benchmark, then list the retained rows.
plot_benchmark(benchmark)
print(considered_algorithms)

# Tune the best estimators

In [None]:
# Explicit switch between the two tuning strategies (replaces a bare `if True:`).
# False keeps the previous behaviour: tune_params is used.
use_genetic_search = False

# Both strategies receive the retained algorithms and the shared `scoring` setting.
if use_genetic_search:
    tuned = tune_params_genetic(considered_algorithms['Algorithm'], X_train, y_train, scoring=scoring)
else:
    tuned = tune_params(considered_algorithms['Algorithm'], X_train, y_train, scoring=scoring)

# Calibrate the tuned estimators

In [None]:
# Wrap every tuned model in a CalibratedClassifierCV (default settings) and fit
# it on the training data; fit() returns the fitted wrapper itself.
calibrated = [
    CalibratedClassifierCV(tuned_model).fit(X_train, y_train)
    for tuned_model in tuned
]

# Show correlation between estimator predictions

In [None]:
# One column of training-set predictions per calibrated model, named after the
# class of the wrapped estimator. scikit-learn 1.2 renamed the wrapper's
# `base_estimator` attribute to `estimator` (the old name was removed in 1.4),
# so prefer `estimator` and fall back to `base_estimator` on older releases.
predictions = pd.concat(
    [
        pd.Series(
            model.predict(X_train),
            name=type(getattr(model, 'estimator', getattr(model, 'base_estimator', None))).__name__,
        )
        for model in calibrated
    ],
    axis=1,
)
# Pairwise correlation of the models' predictions, drawn as a heatmap.
sns.heatmap(predictions.corr())

# Create an ensemble

In [None]:
# Combine the calibrated models into a voting ensemble. Each member is labelled
# with the class name of the estimator it wraps. scikit-learn 1.2 renamed the
# wrapper's `base_estimator` attribute to `estimator` (old name removed in 1.4),
# so prefer `estimator` and fall back to `base_estimator` on older releases.
# NOTE(review): the name `survival_model` looks inherited from another notebook;
# it is kept because later cells reference it.
survival_model = VotingClassifier(
    [
        (type(getattr(model, 'estimator', getattr(model, 'base_estimator', None))).__name__, model)
        for model in calibrated
    ],
    voting=voting,
)
survival_model.fit(X_train, y_train)

# Cross-validate the ensemble

In [None]:
# Out-of-fold predictions for the training rows (default CV splitting).
y_pred = cross_val_predict(survival_model, X_train, y_train)
# Cross-validation scores of the ensemble (default scorer); the result is
# interpolated into the message as-is.
score = cross_val_score(survival_model, X_train, y_train)
print(f"Final cross validation score is {score}")

# Confusion matrix

In [None]:
# confusion_matrix takes (y_true, y_pred): rows are the true labels and columns
# the predicted ones. The arguments were previously swapped, which transposed
# the matrix. fmt='d' prints the counts as plain integers, and the axis labels
# make the orientation explicit.
sns.heatmap(confusion_matrix(y_train, y_pred), annot=True, fmt='d').set(
    xlabel='Predicted label', ylabel='True label')

# Generate output

In [None]:
# Rows without a known target are the ones to score.
test_cond = ~training_cond
X_test = data.loc[test_cond, prediction_params]

# Keep the probabilities under their own name so the cross-validated `y_pred`
# from the earlier cell is not overwritten with a different kind of object.
y_proba = survival_model.predict_proba(X_test)

# Build the output as a standalone Series instead of assigning a new column on
# a slice of `data` (which triggers SettingWithCopyWarning and mutates X_test).
# Column 1 is the probability of the second entry in the model's classes_ —
# presumably "shot made" (1); verify against survival_model.classes_.
submission = pd.Series(y_proba[:, 1], index=X_test.index, name=target)
submission.to_csv('pred_kobe.csv', header=True)