### Load the libraries

In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from joblib import dump

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.tree import DecisionTreeClassifier

### Load the data

In [31]:
df = pd.read_csv('../data/raw/train.csv')
#df.describe()
#df.info()

In [32]:
df = df.iloc[:, 2:]
df.head(5)
#df.shape

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


### Explore Data and Quality Check

In [4]:
df.isnull().values.any()
df.isnull().sum()

Id_old         0
Id             0
GP             0
MIN            0
PTS            0
FGM            0
FGA            0
FG%            0
3P Made        0
3PA            0
3P%            0
FTM            0
FTA            0
FT%            0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
TARGET_5Yrs    0
dtype: int64

### Prepare the data

In [33]:
df_cleaned = df.copy()
# Strip the column names if they contain spaces
df_cleaned.columns = df_cleaned.columns.str.strip()

# Extract the target column
target = df_cleaned.pop('TARGET_5Yrs')
#df_cleaned.head()

# Scaling
scaler = StandardScaler()
df_cleaned = scaler.fit_transform(df_cleaned)
df_cleaned

# Save the scaler 
dump(scaler, '../models/scaler.joblib')

# Split the original dataset into Data-Test set
X_data, X_test, y_data, y_test = train_test_split(df_cleaned, target, test_size = 0.2, random_state=8 )

# Split X-data again into Train-Validation set 
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size = 0.2, random_state=8)

# Save the splited data
np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val', X_val)
np.save('../data/processed/X_test', X_test)

np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val', y_val)
np.save('../data/processed/y_test', y_test)

### Baseline Model

In [34]:
## Get the mean value of Target 1 and zero
y_mean = y_train.mean() # 0.837109375

## Replace the Target value 1 with mean value
y_base = np.full((len(y_train), 1), y_mean)

## Check - How far the mean value is from the actual y -value 
print("MSE", mse(y_train, y_base)) # 0.1363572692871094
print("MAE", mae(y_train, y_base)) # 0.27271453857421873

MSE 0.1363572692871094
MAE 0.27271453857421873


### Build The Decision Tree Model

In [35]:
model = DecisionTreeClassifier(max_depth = 10)
model.fit(X_train,y_train )

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### Make Predictions and Check Model Accuracy on Training Data Set

In [36]:
y_train_prediction = model.predict(X_train)
y_train_prediction
confusion_matrix(y_train, y_train_prediction) # 
accuracy_score(y_train, y_train_prediction)

0.901171875

### Make Predictions and Check Model Accuracy on Valiation Data Set

In [37]:
y_val_prediction = model.predict(X_val)
y_val_prediction
confusion_matrix(y_val, y_val_prediction)
accuracy_score(y_val, y_val_prediction)

0.78828125

### Load Test Data Set

In [38]:
df_test = pd.read_csv('../data/raw/test.csv')
df_test.head()

df_test = df_test.iloc[:, 2:]
df_test.head(5)
df_test.shape

(3799, 19)

### Test the result

In [39]:
df_test_cleaned = df_test.copy()

# Strip the column names if they contain spaces
df_test_cleaned.columns = df_test_cleaned.columns.str.strip()

scaler = StandardScaler()
df_test_cleaned = scaler.fit_transform(df_test_cleaned)

df_test_cleaned.shape
y_test_prediction = model.predict(df_test_cleaned)

y_test_prediction

final_prediction_test = pd.DataFrame({'Id': range(0,3799), 'TARGET_5Yrs': [p for p in y_test_prediction]})

final_prediction_test.to_csv("../reports/tin_submission_01.csv", index=False)

In [41]:
final_prediction_test.groupby('TARGET_5Yrs').sum()

Unnamed: 0_level_0,Id
TARGET_5Yrs,Unnamed: 1_level_1
0,564361
1,6649940
