In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw CSV into `df`; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("../input/data.csv")
# Show (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summarise the numeric columns, then keep only the mean / min / max rows.
numeric_summary = df.describe(include=["number"])
numeric_summary.loc[["mean", "min", "max"]]

In [None]:
# Summary restricted to the object-dtype columns.
object_summary = df.describe(include=["object"])
object_summary

In [None]:
# First rows, columns 0-9 (the frame is previewed in three column slices).
df.head().iloc[:, :10]

In [None]:
# First rows, columns 10-19.
df.head().iloc[:, 10:20]

In [None]:
# First rows, columns 20 onward.
df.head().iloc[:, 20:]

In [None]:
# Number of distinct game ids vs. distinct game dates (as shape tuples).
game_id_shape = df["game_id"].unique().shape
game_date_shape = df["game_date"].unique().shape
game_id_shape, game_date_shape

In [None]:
# Distinct season labels present in the data.
df["season"].unique()

In [None]:
# Distinct team ids and team names.
team_ids = df["team_id"].unique()
team_names = df["team_name"].unique()
team_ids, team_names

In [None]:
# How many distinct game_event_id values exist (as a shape tuple).
df["game_event_id"].unique().shape

In [None]:
# Distinct action_type strings.
df["action_type"].unique()

In [None]:
# Shots with minutes_remaining == 0 and seconds_remaining < 10:
# mean of shot_made_flag and the number of non-null flags.
#
# The target column is selected *before* aggregating. Calling
# DataFrame.mean() on the whole frame tries to average the text columns as
# well, which raises TypeError on pandas >= 2.0 (and was wasted work before).
hurry_flags = df.loc[
    (df["minutes_remaining"] == 0) & (df["seconds_remaining"] < 10),
    "shot_made_flag",
]
hurry_flags.mean(), hurry_flags.count()

In [None]:
# --- Date / text derived features ------------------------------------------
# game_date is sliced positionally: characters 0-3 -> year, 5-6 -> month.
df["game_year"] = df["game_date"].str[0:4].astype(int)
df["game_month"] = df["game_date"].str[5:7].astype(int)

# Split action_type once and reuse the tokens.
action_words = df["action_type"].str.split(" ")
df["action_first_words"] = action_words.str[0]
# NOTE: this is the second-to-last token of action_type, not the final one.
df["action_last_words"] = action_words.str[-2]
df["season_start_year"] = df["season"].str.split("-").str[0].astype(int)

# --- Clock / venue features ------------------------------------------------
df["remaining"] = df["minutes_remaining"] * 60 + df["seconds_remaining"]
df["hurry_shot"] = (
    (df["minutes_remaining"] == 0) & (df["seconds_remaining"] < 10)
).astype(int)
# 1 when the matchup string contains no "@", otherwise 0.
df["home_game"] = (~df["matchup"].str.contains("@", regex=False)).astype(int)

# --- Geometry features -----------------------------------------------------
# Ten equal-width distance bins labelled 0..9.
df["distance_bin"] = pd.cut(df["shot_distance"], bins=10, labels=range(10))

# Signed angle in degrees between the shot location and the y axis, using
# |loc_y| so the result lies in [-90, 90].
# arctan2 replaces the previous `90 if loc_y == 0 else atan(loc_x / |loc_y|)`:
# that special case returned +90 for *every* shot with loc_y == 0, including
# shots with negative loc_x (should be -90) and shots at (0, 0) (now 0).
# For loc_y != 0 the values are unchanged. It is also vectorised, replacing a
# row-wise DataFrame.apply.
df["angle"] = np.degrees(np.arctan2(df["loc_x"], df["loc_y"].abs()))
# Seven equal-width angle bins labelled 0..6, stored as plain integers.
df["angle_bin"] = pd.cut(df["angle"], 7, labels=range(7)).astype(int)

# Drop the raw columns that are no longer used after the features above.
# Rebinding instead of inplace=True keeps the operation explicit.
df = df.drop(
    columns=["team_id", "team_name", "game_date", "game_event_id", "matchup"]
)

In [None]:
# Per-column count of missing values; display only columns that have any.
nullcount = df.isna().sum()
nullcount.loc[nullcount.gt(0)]

In [None]:
# Overall mean of shot_made_flag (missing flags are skipped by mean()).
df["shot_made_flag"].mean()

In [None]:
# Does the outcome of the previous shot relate to the outcome of this one?
# Pair every row with the row directly before it (shift(1)), keep pairs that
# share game_id and period, then average shot_made_flag per previous outcome.
shot_pairs = pd.DataFrame({
    "game_id": df["game_id"],
    "period": df["period"],
    "shot_made_flag": df["shot_made_flag"],
    "pre_game_id": df["game_id"].shift(1),
    "pre_period": df["period"].shift(1),
    "pre_shot_made_flag": df["shot_made_flag"].shift(1),
})
# The original called dropna() without keeping the result, so nothing was
# dropped. Assign it so rows with a missing current/previous value are
# really excluded.
shot_pairs = shot_pairs.dropna()

same_stretch = (
    (shot_pairs["game_id"] == shot_pairs["pre_game_id"])
    & (shot_pairs["period"] == shot_pairs["pre_period"])
)
shot_pairs[same_stretch].groupby("pre_shot_made_flag")["shot_made_flag"].mean()

In [None]:
# Independent copy of the feature frame; later cells write into df_enc
# without touching df.
df_enc = df.copy(deep=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype column of df_enc with integer labels so that it
# can take part in the correlation matrix.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for col, dtype in df_enc.dtypes.items():
    if dtype == object:
        # Values are cast to str first so mixed / missing entries encode too.
        df_enc[col] = LabelEncoder().fit_transform(df_enc[col].astype(str))

# Absolute correlation of every column with the target, strongest first.
corr = df_enc.corr()
corr_shot_made = pd.DataFrame(corr.shot_made_flag.abs().sort_values(ascending=False))
# Row 0 is skipped: it is expected to be shot_made_flag against itself.
corr_shot_made[1:15]

In [None]:
# Rows whose shot_made_flag is present (used for the plots that follow).
train = df[df["shot_made_flag"].notnull()]

In [None]:
# Left: mean shot_made_flag for every exact shot_distance value.
# Right: the same rate in 5-unit distance buckets, keeping only buckets with
# at least 10 labelled shots.
#
# The target column is selected before aggregating: groupby(...).mean() over
# the whole frame fails on the text columns with pandas >= 2.0.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

(
    train.groupby("shot_distance")["shot_made_flag"]
    .mean()
    .plot(ylim=(0, 1), ax=axes[0])
)

# Floor each distance to the lower multiple of 5 to form the buckets.
distance_bucket = train["shot_distance"] // 5 * 5
bucket_stats = train.groupby(distance_bucket)["shot_made_flag"].agg(["count", "mean"])
bucket_stats.columns = ["shot_count", "shot_mean"]
bucket_stats[bucket_stats.shot_count >= 10].plot.bar(
    y="shot_mean", ylim=(0, 1), ax=axes[1]
)

In [None]:
# One bar chart per categorical column: mean shot_made_flag per category,
# sorted from highest to lowest.
#
# The six copy-pasted lines are folded into a loop, and the target column is
# selected before mean() (a whole-frame groupby mean fails on text columns
# with pandas >= 2.0).
rate_columns = [
    "shot_zone_range",
    "shot_zone_basic",
    "shot_zone_area",
    "combined_shot_type",
    "action_first_words",
    "action_last_words",
]

fig, axes = plt.subplots(nrows=1, ncols=len(rate_columns), figsize=(18, 4))
for ax, column in zip(axes, rate_columns):
    (
        train.groupby(column)["shot_made_flag"]
        .mean()
        .sort_values(ascending=False)
        .plot.bar(ylim=(0, 1), ax=ax)
    )

In [None]:
# One bar chart per column: mean shot_made_flag per value, in index order
# (no sorting, so ordered values such as period stay in sequence).
#
# The target column is selected before mean(): a whole-frame groupby mean
# fails on text columns with pandas >= 2.0.
context_columns = ["shot_type", "home_game", "hurry_shot", "period", "angle_bin"]

fig, axes = plt.subplots(nrows=1, ncols=len(context_columns), figsize=(18, 4))
for ax, column in zip(axes, context_columns):
    train.groupby(column)["shot_made_flag"].mean().plot.bar(ylim=(0, 1), ax=ax)

In [None]:
# Mean shot_made_flag per opponent, highest first.
# The column is selected before mean() so the text columns are never
# averaged (that raises on pandas >= 2.0).
(
    train.groupby("opponent")["shot_made_flag"]
    .mean()
    .sort_values(ascending=False)
    .plot.bar(figsize=(18, 4))
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Rank one-hot encoded categorical values by random-forest importance.

# Names of the object-dtype columns of df.
categorial_cols = df.select_dtypes(include=["object"]).columns

# One-hot encode them; dummy columns are named "<column>#<value>".
# shot_made_flag is carried along untouched as the target.
df_dummy = pd.get_dummies(
    df[categorial_cols.tolist() + ["shot_made_flag"]],
    columns=categorial_cols.tolist(),
    prefix_sep="#",
)

# Fit only on rows that have a label.
df_dummy = df_dummy[df_dummy["shot_made_flag"].notnull()]
fitX = df_dummy.drop(columns=["shot_made_flag"])
fitY = df_dummy["shot_made_flag"]

# Fixed seed: without it the importances (and their ranking) change on every
# run, so the table below could not be reproduced.
model = RandomForestRegressor(random_state=0)
model.fit(fitX, fitY)

feature_imp = pd.DataFrame(
    model.feature_importances_, index=fitX.columns, columns=["importance"]
)
feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(20)
feat_imp_20

In [None]:
# Split the encoded frame by label availability: rows with a shot_made_flag
# form the training set, rows without one are the ones to predict.
has_label = df_enc["shot_made_flag"].notnull()
train = df_enc[has_label]
test = df_enc[~has_label]

In [None]:
# Columns fed to the model.
# Candidates currently left out: "period", "seconds_remaining".
col_name = [
    "shot_distance",
    "shot_zone_range",
    "loc_y",
    "shot_zone_basic",
    "shot_type",
    "shot_zone_area",
    "angle_bin",
    "combined_shot_type",
    "hurry_shot",
    "action_first_words",
    "action_last_words",
]

# Feature matrices are copied so later edits cannot touch train / test.
X_train = train.loc[:, col_name].copy()
X_test = test.loc[:, col_name].copy()
Y_train = train["shot_made_flag"]

X_train.shape, Y_train.shape, X_test.shape

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Bagged regression trees: 100 trees, each fitted on a 30% sample of the
# training rows; the averaged prediction is used as the submitted value.
#
# The base tree is passed positionally on purpose. The keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, so the positional form works on both sides of the
# rename.
# random_state fixes the bootstrap samples and tree seeds so the submission
# is reproducible.
clf = BaggingRegressor(
    DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.3,
    random_state=0,
)

clf.fit(X_train, Y_train)
result = clf.predict(X_test)

# ---------------------------------------------------------------------------
# Alternatives tried earlier, kept for reference only (not executed).
#
# from sklearn.kernel_ridge import KernelRidge
# clf = KernelRidge(alpha=1.0, kernel='polynomial', degree=2, coef0=2.5)
#
# from sklearn.svm import SVR
# clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
# clf = SVR(kernel='linear', C=1e3)
# clf = SVR(kernel='poly', C=1e3, degree=2)
#
# Bagging with feature sub-sampling:
# clf = BaggingRegressor(DecisionTreeRegressor(), n_estimators=100, max_samples=0.9, max_features=0.2)
#
# Annotated "0.65490" in the original notes (presumably a submission score):
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
# cols = ["shot_distance","shot_zone_range","shot_zone_basic","action_last_words"]
# clf.fit(X_train[cols], Y_train)
# result = clf.predict_proba(X_test[cols])[:, 1]
# ---------------------------------------------------------------------------

In [None]:
# Pair each unlabelled shot_id with its predicted value and write the result
# to submission.csv without the index column.
submission = test[["shot_id"]].assign(shot_made_flag=result)
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame as a final visual check of what was written.
submission