## (2) Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as scs
import seaborn as sns
import plotly.express as px

In [2]:
# Load bam_data.csv (path is relative to the kernel's working directory)
# and preview the first five rows to confirm the columns parsed as expected.
df = pd.read_csv(filepath_or_buffer="bam_data.csv")
df.head(n=5)

Unnamed: 0,approach_vertical,vertical_jump,three_quarter_court_sprint,four_way_agility,reaction_shuttle,bamscore,wingspan,reach,height,weight,body_comp,hand_length,hand_width,bam_score_rank
0,33.5,28.5,3.376,11.471,3.669,2003.0,72.75,94.0,70.0,174.4,9.8,7.5,8.25,4
1,30.5,21.5,3.486,12.114,3.355,1865.0,82.0,104.5,79.5,188.4,21.9,7.5,8.75,3
2,37.0,31.0,3.23,12.036,3.562,2005.0,81.5,99.0,74.0,196.5,13.9,9.0,9.5,4
3,29.0,23.0,3.37,12.509,3.173,1902.0,79.5,101.0,77.5,205.0,10.6,8.25,9.25,4
4,31.0,26.0,3.389,12.724,3.316,1903.0,77.0,101.5,78.0,180.0,15.4,8.0,10.0,4


In [3]:
# Summary statistics for every numeric column, used here as a data-quality check.
df.describe()
# weight and body_comp still have a minimum of 0.0, so zero placeholders remain in those two columns

Unnamed: 0,approach_vertical,vertical_jump,three_quarter_court_sprint,four_way_agility,reaction_shuttle,bamscore,wingspan,reach,height,weight,body_comp,hand_length,hand_width,bam_score_rank
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,31.829615,25.860157,3.467047,12.247189,3.505243,1890.976326,78.089797,98.71418,75.094195,180.73582,14.979496,8.049381,8.876189,3.483475
std,3.423395,3.065653,0.335502,0.652726,0.273783,134.866028,5.143227,5.824608,5.128197,28.002496,6.052069,0.53321,0.638452,0.982099
min,19.0,14.0,2.95,10.359,2.914,1343.0,27.0,7.5,37.875,0.0,0.0,4.25,4.5,1.0
25%,30.0,24.0,3.3395,11.806,3.348,1811.0,75.5,96.0,72.75,165.2,10.4,7.75,8.5,3.0
50%,31.829615,25.860157,3.424,12.244,3.492,1899.0,78.0,98.71418,75.0,179.5,14.979496,8.0,8.876189,4.0
75%,34.0,28.0,3.5375,12.658,3.634,1981.0,80.5,102.0,77.25,195.0,18.9,8.5,9.25,4.0
max,43.5,38.0,9.954,14.775,6.759,2298.0,150.0,115.0,190.7,303.4,34.5,9.75,11.0,5.0


## 1) Split Train/Test Data
#### - Split train/test data
#### - Clean rest of data/nulls
#### - decision tree
#### - normalize/standardize data with outliers [0,1] - also use min max scalers
#### - random forest - use ensemble code
#### - iterate model
#### - Find best model

In [4]:
# https://scikit-learn.org/stable/modules/tree.html#classification
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py
# https://towardsdatascience.com/data-science-mistakes-to-avoid-data-leakage-e447f88aae1c
### - FIXED DATA LEAKAGE

## 1) Split train/test data

In [5]:
# from sklearn.svm import SVC
#from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_classification
#from sklearn.model_selection import train_test_split
#from sklearn.pipeline import Pipeline
#from sklearn import tree

#X = df.drop(['bamscore', 'bam_score_rank'],axis=1) #drop bamscorerank too once I added so no data leakage
#y = df['bam_score_rank']
# y is bamscorerank, x is everything else
# now xtrain/split will take those two variables and create two datasets from it

In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

# Target: the bam_score_rank class label.
y = df['bam_score_rank']

# Candidate feature frame. bamscore is removed here (presumably because the rank
# is derived from it -- confirm). X itself still carries bam_score_rank so that
# the later cell which drops that column from X keeps working unchanged.
X = df.drop('bamscore', axis=1)

# Split on a view of X that excludes the target column. Previously the label
# (bam_score_rank) was left inside X_train / X_test, so any model fit on them
# could read the answer straight from its inputs (target leakage).
# random_state is fixed so the 70/30 row assignment is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns='bam_score_rank'),
    y,
    test_size=0.3,
    random_state=0,
)

In [7]:
#clf = tree.DecisionTreeClassifier()
#clf = clf.fit(X_train, y_train)
# Create classifier and fit model to data

In [8]:
#clf.score(X_test, y_test)

## 2) Normalize + Scale Data to adjust for outliers
#### - going to use min/max normalization to make all points between 0-1
#### - meaning, make data all within 0-1 range. 5.5 on 0-10 scale would be .55 on normalized scale
#### - Fit base classification model

### 4) Feature Importance

In [10]:
# https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py

In [11]:
# 5) Iterative modeling process
### What models are appropriate
### Compare Models
### Find which performance metrics to use and adjust to make the model better.

In [12]:
### Assumptions to test in model:
#### - Combine tests are more important than physical measurements. Weight combine tests double.
#### - Compare: tests only, measurements only, 1:1 tests/measurements, and the hypothesis of 2:1 tests/measurements

In [13]:
#feature_cols = X.columns
#clf.tree_.compute_feature_importances(normalize=False)
#feat_imp_dict = dict(zip(feature_cols, clf.feature_importances_))
#feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
#feat_imp.rename(columns = {0:'FeatureImportance'}, inplace = True)
#feat_imp.sort_values(by=['FeatureImportance'], ascending=False).head()

In [14]:
# Remove the target column from the feature matrix so it cannot be selected as
# a predictor. errors='ignore' makes this cell safe to re-run: X is rebound
# from itself, so without it a second execution raises KeyError because the
# column is already gone.
X = X.drop(columns='bam_score_rank', errors='ignore')

In [15]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

In [16]:
# Univariate feature selection: score each column of X against y with the
# chi-squared statistic and keep the 6 highest-scoring columns.
# NOTE(review): chi2 requires non-negative inputs and is designed for
# count/frequency-like features; the selector is also fit on all rows rather
# than only the training split -- confirm both are intended.
skb = SelectKBest(score_func=chi2, k=6)
fit = skb.fit(X, y)          # fitted selector, reused below for the support mask
features = fit.transform(X)  # X reduced to the selected columns
# Alternative (not used): X_new = SelectKBest(chi2, k=20).fit_transform(X, y)

In [17]:
# Boolean mask over X's columns: True where the fitted selector kept the feature.
mask = fit.get_support()
# Show the selected columns with their original names.
X.loc[:, mask]

Unnamed: 0,approach_vertical,vertical_jump,four_way_agility,reach,weight,body_comp
0,33.500000,28.500000,11.471000,94.00000,174.40000,9.800000
1,30.500000,21.500000,12.114000,104.50000,188.40000,21.900000
2,37.000000,31.000000,12.036000,99.00000,196.50000,13.900000
3,29.000000,23.000000,12.509000,101.00000,205.00000,10.600000
4,31.000000,26.000000,12.724000,101.50000,180.00000,15.400000
...,...,...,...,...,...,...
1054,36.000000,31.000000,12.654000,88.00000,147.40000,7.900000
1055,31.500000,26.500000,11.136000,91.50000,172.10000,23.400000
1056,31.829615,25.860157,12.247189,98.71418,180.73582,14.979496
1057,31.829615,25.860157,12.247189,98.71418,180.73582,14.979496


In [18]:
#df['forway']=df['4-way_agility']

In [19]:
import statsmodels.formula.api as smf

# Ordinary least squares: regress bamscore on three of the combine tests.
formula = (
    "bamscore ~ "
    "approach_vertical + vertical_jump + four_way_agility"
)
lm = smf.ols(formula=formula, data=df).fit()

# Print the full regression table (coefficients, R-squared, p-values).
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:               bamscore   R-squared:                       0.667
Model:                            OLS   Adj. R-squared:                  0.667
Method:                 Least Squares   F-statistic:                     705.8
Date:                Sat, 23 Apr 2022   Prob (F-statistic):          1.27e-251
Time:                        19:39:18   Log-Likelihood:                -6112.8
No. Observations:                1059   AIC:                         1.223e+04
Df Residuals:                    1055   BIC:                         1.225e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          2476.2726     59.99

In [20]:
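# R-squared: the summary above reports 0.667, so these three tests explain about two-thirds of the variance in bamscore
# (this note originally quoted an R-squared of .453 and called the model not very accurate -- that figure no longer matches the output)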

In [21]:
import statsmodels.formula.api as smf

# Ordinary least squares: regress bamscore on the six columns that the
# chi-squared feature selection kept.
formula = (
    "bamscore ~ "
    "approach_vertical + vertical_jump + reach + weight + body_comp + four_way_agility"
)
lm = smf.ols(formula=formula, data=df).fit()

# Print the full regression table (coefficients, R-squared, p-values).
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:               bamscore   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.672
Method:                 Least Squares   F-statistic:                     362.2
Date:                Sat, 23 Apr 2022   Prob (F-statistic):          7.29e-252
Time:                        19:39:18   Log-Likelihood:                -6102.5
No. Observations:                1059   AIC:                         1.222e+04
Df Residuals:                    1052   BIC:                         1.225e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          2615.1424     69.86

In [22]:
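# body_comp and weight have p-values > 0.05, so they are not statistically significant at the 5% level
# I already wanted to take them out because they are hard to measure; now we know we can take them out because they do not add significant explanatory power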

In [23]:
#df.corr()["bamscore"]