# Analysis

In [None]:
import pandas as pd

# Read in data and drop href and age columns (useless for analysis)
# Load the raw data; Href and Age carry no analytical value, so they are discarded immediately.
df = pd.read_csv('FinalData.csv').drop(columns=['Href', 'Age'])
df['PER'].describe()

# Mean PER is 13.29 
# Median PER of data is 13.2
# Top 25% PER above 14.9
# Max PER of data is 65.6 (OUTLIER) (INVESTIGATE!!)

# Highest PER in the data: Stanley Umude reached 65.6 in a total of 2 NBA minutes,
# which is unrealistic, so his row is excluded.
StanleyUmude = df.PER.idxmax()
df = df.drop(index=StanleyUmude)

# New max PER is 27.9, which is much more realistic - but the minimum PER is negative.
df.PER.describe()

# Alondes Williams: 5 NBA minutes, PER of -20.9 (unrealistic) - excluded.
AlondesWilliams = df.PER.idxmin()
df = df.drop(index=AlondesWilliams)

# Darius Johnson-Odom: 21 NBA minutes, PER of -18.4 (unrealistic) - excluded.
DariusJO = df.PER.idxmin()
df = df.drop(index=DariusJO)

# Kevin Murphy: 52 NBA minutes, PER of -4.5 (unrealistic) - excluded.
KevinMurphy = df.PER.idxmin()
df = df.drop(index=KevinMurphy)

# Chima Moneke: 8 NBA minutes, PER of -3.5 (unrealistic) - excluded.
ChimaMoneke = df.PER.idxmin()
df = df.drop(index=ChimaMoneke)

# The new minimum is Will Conroy, who has played over 100 minutes of NBA basketball,
# so his data is kept.
newmin = df.PER.idxmin()
df.loc[newmin]

# New min is Will Conroy who has played over 100 minutes of NBA basketball
# His data will be included

df = df.reset_index(drop=True)
df.describe()

# College minutes need a closer look: repeatedly take the player with the fewest
# minutes, inspect the row, and remove it.

# Jason Collins - 15 minutes played in college, not enough.
JasonCollins = df.Minutes.idxmin()
df.loc[JasonCollins]
df = df.drop(index=JasonCollins)

# Greg Stiemsma - only played 27 minutes.
GregStemsma = df.Minutes.idxmin()
df.loc[GregStemsma]
df = df.drop(index=GregStemsma)

# Kurt Thomas - 42 minutes, not enough.
KurtThomas = df.Minutes.idxmin()
df.loc[KurtThomas]
df = df.drop(index=KurtThomas)

# Chris Johnson - 43 minutes, not enough.
ChrisJohnson = df.Minutes.idxmin()
df.loc[ChrisJohnson]
df = df.drop(index=ChrisJohnson)

# Edmond Sumner - 43 minutes, not enough.
EdmondSumner = df.Minutes.idxmin()
df.loc[EdmondSumner]
df = df.drop(index=EdmondSumner)

# Michael Porter Jr. - only 53 college minutes (due to suffering a back injury); excluded.
MPJ = df.Minutes.idxmin()
df.loc[MPJ]
df = df.drop(index=MPJ)
df.describe()

# There are still outliers: keep removing the current minimum, one player at a time.
Shamet = df.Minutes.idxmin()
df = df.drop(index=Shamet)

JackWhite = df.Minutes.idxmin()
df = df.drop(index=JackWhite)

JamesWiseman = df.Minutes.idxmin()
df = df.drop(index=JamesWiseman)

BrianHoward = df.Minutes.idxmin()
df = df.drop(index=BrianHoward)

DiVincenzo = df.Minutes.idxmin()
df = df.drop(index=DiVincenzo)

BradLohaus = df.Minutes.idxmin()
df = df.drop(index=BradLohaus)

York = df.Minutes.idxmin()
df = df.drop(index=York)

Nazr = df.Minutes.idxmin()
df = df.drop(index=Nazr)

AG = df.Minutes.idxmin()
df = df.drop(index=AG)

SG = df.Minutes.idxmin()
df = df.drop(index=SG)

df.describe()

# Now everyone that is included in the data played a minimum of 90 minutes in college, which is sufficient

# Fixing final NaNs

# Rows with no recorded college minutes are removed first ...
NAminutes = df['Minutes'].isna()
df = df.loc[~NAminutes]

# ... then rows with no recorded steals.
NAsteals = df['Steals'].isna()
df = df.loc[~NAsteals]

df.isna().any()

# Drop the name column, give the percentage / three-point columns identifier-safe names,
# renumber the rows and persist the cleaned data set.
df = (
    df.drop(columns=['Names'])
      .rename(columns={'Fg%':'Fgp', '3pm':'Tpm', '3pa':'Tpa', '3p%':'Tpp', 'Ft%':'Ftp'})
      .reset_index(drop=True)
)
df.to_csv('FinalDataNoOutliers.csv', index=False, encoding='utf-8')


In [None]:
import scipy.stats as stats

# Normal Distribution

# Sample skewness of PER (0 would indicate a symmetric distribution)
stats.skew(df['PER'])

In [None]:
# Reload the cleaned data and compute pairwise (Pearson, the pandas default) correlations
df = pd.read_csv('FinalDataNoOutliers.csv')
corr_matrix = df.corr(method='pearson')

import seaborn as sn
import matplotlib.pyplot as plt

# Annotated correlation heatmap on an explicitly created 20x20 axes
fig, ax = plt.subplots(figsize=(20, 20))
sn.heatmap(data=corr_matrix, annot=True, linewidth=.5, ax=ax)

# Field Goal Percentage has the highest correlation with PER at 0.4
#     Second : Blocks (0.36)
#     Third : Rebounds (0.35)
# On the other hand, Three Pointers made and Three Pointers attempted has the lowest correlations with PER at -0.1 and -0.12 respectively

# Other interesting correlations:
#     Points has a 0.85 correlation with minutes, 0.85 correlation to fta, 0.97 correlation to fga, and 0.98 correlation to fgm (all logical)
#     Negatively correlated with PER:
#         Three Pointers Made
#         Three Pointers Attempted
#         Three Point Percentage (logical considering the first two)
#         and Free throw percentage
#         This means that for each increase in these variables, we should expect a decrease in PER
#     Strongest Negative Correlations:
#         Three Pointers Attempted and Field Goal Percentage have a correlation of negative 0.49
#         Three Pointers Made and Field Goal Percentage have a correlation of negative 0.43
#         Blocks and Three pointers Attempted have a correlation of negative 0.32

In [None]:
# Change out the numbers to check data entries by PER level

PER_CEILING = 9  # adjust to inspect the players below a different PER level

df = pd.read_csv('FinalDataNoOutliers.csv')
df['PER'].describe()
df.loc[df['PER'] < PER_CEILING].reset_index()

Define tiers for levels of success:

Player who won't stick around in the NBA: PER less than 9

Fringe roster player: PER of 9 - 11

Will get a roster spot: PER of 11 - 13

Rotation Player: PER of 13 - 15

Good Role Player: PER of 15 - 17

All-Star: PER of 17 - 22 

Superstar: PER of 22 or more

In [None]:
# Bin PER for easier analysis

# Assign every player to a PER tier. Intervals are left-closed (lo <= PER < hi),
# matching the tier definitions; PER values outside [-10, 100) are left unbinned (NaN).
PER_BIN_EDGES = [-10, 9, 11, 13, 15, 17, 22, 100]
PER_BIN_LABELS = ['x<9', '9-11', '11-13', '13-15', '15-17', '17-22', '22+']

BinOrder = pd.CategoricalDtype(categories=PER_BIN_LABELS, ordered=True)
df['Bin'] = pd.cut(df.PER, bins=PER_BIN_EDGES, labels=PER_BIN_LABELS, right=False).astype(BinOrder)

# Average college points per tier. observed=False keeps every tier on the x-axis
# even if a tier happens to contain no players.
Grouped = df.groupby('Bin', observed=False)['Points'].mean()
Grouped.plot.bar()
plt.ylabel('Average Points (college)')
plt.xlabel('PER Bin')
plt.title(label='Which PER Bin Scores the Most?')
Grouped = pd.DataFrame(Grouped)
Grouped.Points = round(Grouped.Points, 2)
# The index holds tier labels, so the i-th bar's value is read by position with .iloc;
# plain Grouped.Points[i] relies on a deprecated integer-as-position fallback.
for i in range(len(Grouped)):
    plt.text(i, Grouped.Points.iloc[i], Grouped.Points.iloc[i], ha='center', va='bottom')

Very clear trend - Points increase at each level of PER with a massive leap in the final bin

In [None]:
# Average college blocks per PER tier (observed=False keeps empty tiers on the axis).
Grouped2 = df.groupby('Bin', observed=False)['Blocks'].mean()
Grouped2.plot.bar()
plt.ylabel('Average Blocks (college)')
plt.xlabel('PER Bin')
plt.title(label='Superstars Dominate with Blocks')
Grouped2 = pd.DataFrame(Grouped2)
Grouped2.Blocks = round(Grouped2.Blocks, 2)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped2)):
    plt.text(i, Grouped2.Blocks.iloc[i], Grouped2.Blocks.iloc[i], ha='center', va='bottom')

Trend is very apparent between PER and blocks - increases steadily

Players who become 'Superstars' on average get far more blocks than other players (38.66% increase from the second closest bin)

In [None]:
# Average college minutes per PER tier (observed=False keeps empty tiers on the axis).
Grouped3 = df.groupby('Bin', observed=False)['Minutes'].mean()
Grouped3.plot.bar()
plt.ylabel('Average Minutes (college)')
plt.xlabel('PER Bin')
Grouped3 = pd.DataFrame(Grouped3)
Grouped3.Minutes = round(Grouped3.Minutes, 2)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped3)):
    plt.text(i, Grouped3.Minutes.iloc[i], Grouped3.Minutes.iloc[i], ha='center', va='bottom')

Positive trend - weak

In [None]:
# Average college assists per PER tier (observed=False keeps empty tiers on the axis).
Grouped4 = df.groupby('Bin', observed=False)['Assists'].mean()
Grouped4.plot.bar()
plt.ylabel('Average Assists (college)')
plt.xlabel('PER Bin')
plt.title(label='Positional Differences Skew the Picture with Assists')
Grouped4 = pd.DataFrame(Grouped4)
Grouped4.Assists = round(Grouped4.Assists, 2)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped4)):
    plt.text(i, Grouped4.Assists.iloc[i], Grouped4.Assists.iloc[i], ha='center', va='bottom')

No clear trend - likely due to positional needs

In [None]:
# Average college steals per PER tier (observed=False keeps empty tiers on the axis).
Grouped5 = df.groupby('Bin', observed=False)['Steals'].mean()
Grouped5.plot.bar()
plt.ylabel('Average Steals (college)')
plt.xlabel('PER Bin')
plt.title(label='Superstars take Massive Leap with Steals')
Grouped5 = pd.DataFrame(Grouped5)
Grouped5.Steals = round(Grouped5.Steals, 2)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped5)):
    plt.text(i, Grouped5.Steals.iloc[i], Grouped5.Steals.iloc[i], ha='center', va='bottom')

Upward trend between steals and PER

Massive leap at our highest bin of PERs

In [None]:
# Average college rebounds per PER tier (observed=False keeps empty tiers on the axis).
# Note: this rebinds Grouped5, which the steals cell also uses.
Grouped5 = df.groupby('Bin', observed=False)['Rebounds'].mean()
Grouped5.plot.bar()
plt.ylabel('Average Rebounds (college)')
plt.xlabel('PER Bin')
plt.title(label='Steady Upward Trend with Rebounds')
Grouped5 = pd.DataFrame(Grouped5)
Grouped5.Rebounds = round(Grouped5.Rebounds, 2)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped5)):
    plt.text(i, Grouped5.Rebounds.iloc[i], Grouped5.Rebounds.iloc[i], ha='center', va='bottom')

In [None]:
# Average college field-goal percentage per PER tier (observed=False keeps empty tiers on the axis).
Grouped6 = df.groupby('Bin', observed=False)['Fgp'].mean()
Grouped6.plot.bar()
plt.ylabel('Average Field-Goal Percentage (college)')
plt.xlabel('PER Bin')
plt.title(label='Field-Goal Percentage Increases with PER')
Grouped6 = pd.DataFrame(Grouped6)
Grouped6.Fgp = round(Grouped6.Fgp, 3)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped6)):
    plt.text(i, Grouped6.Fgp.iloc[i], Grouped6.Fgp.iloc[i], ha='center', va='bottom')

Upward trend between field-goal percentage and PER

Better players make more shots

In [None]:
# Average college three-point percentage per PER tier (observed=False keeps empty tiers on the axis).
# Note: this rebinds Grouped6, which the field-goal-percentage cell also uses.
Grouped6 = df.groupby('Bin', observed=False)['Tpp'].mean()
Grouped6.plot.bar()
plt.ylabel('Average 3-Point Percentage (college)')
plt.xlabel('PER Bin')
plt.title(label='Unclear Trend between PER and Three-Point Percentage')
Grouped6 = pd.DataFrame(Grouped6)
Grouped6['Tpp'] = round(Grouped6['Tpp'], 3)
# Index is made of tier labels: read the i-th value by position (.iloc) instead of the
# deprecated integer-as-position fallback of Series[i].
for i in range(len(Grouped6)):
    plt.text(i, Grouped6['Tpp'].iloc[i], Grouped6['Tpp'].iloc[i], ha='center', va='bottom')

Strongest Negative correlation with PER

Clear negative trend in data until final bin

Superstars have the highest average three-point percentage of any bin

# Check for multi-collinearity

In [None]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix with every candidate predictor, then one VIF per design-matrix column
vif_formula = 'PER ~ Games+Minutes+Points+Rebounds+Assists+Steals+Blocks+Fgm+Fga+Fgp+Tpm+Tpa+Tpp+Ftm+Fta+Ftp'
y, X = dmatrices(vif_formula, data=df, return_type='dataframe')

vif_df = pd.DataFrame({
    'variable': X.columns,
    'VIF': [variance_inflation_factor(X.values, col) for col in range(X.shape[1])],
})

vif_df


Minutes seems to be a slight issue (VIF > 5)

Redundant variables - fgm, fga, tpm, tpa, ftm, fta

# Fixing multi-collinearity

In [None]:
# Same VIF table, restricted to the predictors kept after removing the redundant ones
reduced_formula = 'PER ~ Games+Points+Rebounds+Assists+Steals+Blocks+Fgp+Tpp+Ftp'
y, X = dmatrices(reduced_formula, data=df, return_type='dataframe')

vif_df = pd.DataFrame({
    'variable': X.columns,
    'VIF': [variance_inflation_factor(X.values, col) for col in range(X.shape[1])],
})

vif_df

Removing variables from data lowers VIFs

No VIF over 5 -- GOOD

In [None]:
# Separate the target from the features: pop() returns the PER column and removes it
# from df in place, so re-running this cell without reloading df raises a KeyError.
ActualPER = df.pop('PER')
ActualPER.max()

In [None]:
# Remove the collinear columns and the helper 'Bin' column
df = df.drop(columns=['Minutes', 'Fgm', 'Fga', 'Fta', 'Tpm', 'Tpa', 'Ftm', 'Bin'])

In [None]:
# Display the remaining columns of df after PER and the redundant columns were removed
df

In [None]:
# Load the prospect stats, remember the player names, then keep only the columns that
# have a counterpart in the training data and give them the training column names.
prospects = pd.read_csv('ProspectStats.csv')
prospect_names = prospects['Player']
prospects = (
    prospects
    .drop(columns=['Player', 'Team', 'MIN', 'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'ORB', 'DRB'])
    .rename(columns={'GP':'Games', 'PTS':'Points', 'REB':'Rebounds', 'AST':'Assists', 'STL':'Steals', 'BLK':'Blocks', 'FG%':'Fgp', '3P%':'Tpp', 'FT%':'Ftp'})
)
prospects

In [None]:
# Put the prospect columns in the same order as the training features in df.
# Selecting by name instead of hard-coded positions keeps the cell idempotent: a positional
# shuffle permutes the columns again on every re-run and would feed the models mis-ordered
# features.
# NOTE(review): assumes df holds exactly the model's feature columns at this point - confirm.
prospects = prospects[df.columns]
prospects

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the players for evaluation. A fixed random_state makes the split, and
# therefore every metric computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(df, ActualPER, test_size=0.25, random_state=42)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Grid-search the two structural options of ordinary least squares
l = LinearRegression()
lr_param_grid = {'fit_intercept': [True, False], 'positive': [True, False]}
lr = GridSearchCV(estimator=l, param_grid=lr_param_grid)
lr.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best-ranked candidate.
# Uses descriptive names so the built-in id() is not shadowed by a DataFrame.
cv_results = pd.DataFrame(lr.cv_results_)
best_row = cv_results['rank_test_score'].idxmin()
cv_results.loc[best_row, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': lr.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Linear Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

R squared of 0.2624 - third highest

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first; the old row position is kept in an 'index' column
p_pred = pd.DataFrame({'pPredict': lr.predict(prospects)})
pd.set_option('display.max_rows', 102)
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
    .reset_index()
)
df_prospectpred

# Neural Network MLP Regressor

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search the L2 penalty (alpha) and the optimisation tolerance of the MLP.
# random_state pins the weight initialisation so the search result is reproducible.
# NOTE(review): x_train is passed unscaled; consider standardising the features - confirm.
M = MLPRegressor(random_state=42)
MLP = GridSearchCV(M, {'alpha': [.00001, .0001, .001, .01, .1], 'tol': [.00001, .0001, .001, .01, .1]})
MLP.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best-ranked candidate.
# Uses descriptive names so the built-in id() is not shadowed by a DataFrame.
cv_results = pd.DataFrame(MLP.cv_results_)
best_row = cv_results['rank_test_score'].idxmin()
cv_results.loc[best_row, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': MLP.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Neural Network Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

## Root mean squared error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean absolute error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': MLP.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search depth, split/leaf size and cost-complexity pruning of a single regression tree.
# min_samples_split and min_samples_leaf do not accept None, so every candidate using it
# failed to fit; the "unconstrained" option is expressed with the documented defaults
# (2 and 1) instead. random_state makes the fitted trees reproducible.
t = DecisionTreeRegressor(random_state=42)
tree = GridSearchCV(t, {
    'max_depth': [1, 2, 3, None],
    'min_samples_split': [2, 6, 7, 8, 9],
    'min_samples_leaf': [1, 7, 8, 9, 10],
    'ccp_alpha': [.0005, .005, .05, .1],
})
tree.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best-ranked candidate.
# Uses descriptive names so the built-in id() is not shadowed by a DataFrame.
cv_results = pd.DataFrame(tree.cv_results_)
best_row = cv_results['rank_test_score'].idxmin()
cv_results.loc[best_row, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': tree.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Decision Tree')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

R squared of .1163 

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

Horrible fit

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': tree.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Grid-search forest size, depth and split/leaf sizes.
# random_state fixes the bootstrap samples so results are reproducible; n_jobs=-1 spreads
# the 320-candidate search over all cores (it does not change the scores).
r = RandomForestRegressor(random_state=42)
rf = GridSearchCV(
    r,
    {'n_estimators': [50, 100, 200, 300], 'max_depth': [None, 75, 100, 200], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 3, 5, 7, 9]},
    n_jobs=-1,
)
rf.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best-ranked candidate.
# Uses descriptive names so the built-in id() is not shadowed by a DataFrame.
cv_results = pd.DataFrame(rf.cv_results_)
best_row = cv_results['rank_test_score'].idxmin()
cv_results.loc[best_row, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': rf.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Random Forest Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

Issue with random forest: two distinct groups

One group that is underpredicted and one that is overpredicted

Random Forest Model doesn't seem to predict very high or very low PERs (Most predicted values are between 10 and 17)

## R Squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

R squared of 0.2247 - better than decision tree, but not in the top 5

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': rf.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# KNN

In [None]:
from sklearn import neighbors, model_selection
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search the neighbourhood size and the Minkowski exponent p.
# leaf_size is left at its default: it only tunes the speed/memory of the neighbour-search
# tree, not the predictions, so searching over it multiplied the number of fits by six
# without affecting any score.
# NOTE(review): x_train is passed unscaled; distance-based models usually need standardised features - confirm.
k = KNeighborsRegressor()
knn = GridSearchCV(k, {'n_neighbors': [3, 5, 10, 40], 'p': [1, 2, 5, 7, 10]})
knn.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best-ranked candidate.
# Uses descriptive names so the built-in id() is not shadowed by a DataFrame.
cv_results = pd.DataFrame(knn.cv_results_)
best_row = cv_results['rank_test_score'].idxmin()
cv_results.loc[best_row, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': knn.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('KNN Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R Squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

R squared of .1503 - not great

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': knn.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# SVR

In [None]:
from sklearn import svm
from sklearn.svm import SVR

# Grid-search the RBF kernel width (gamma), the stopping tolerance and the penalty C.
# 'degree' is not searched: it is only used by the polynomial kernel and is ignored by the
# default RBF kernel, so including it quadrupled the number of fits without changing any score.
# NOTE(review): x_train is passed unscaled; SVR is sensitive to feature scale - confirm.
s = SVR()
svr = GridSearchCV(s, {'gamma': [.005, .01, .05, .1], 'tol': [.0001, .001, .01, .1], 'C': [1, 3, 5, 7, 9]})
svr.fit(x_train, y_train)

In [None]:
# Show the hyper-parameters of the best-ranked candidate.
# Uses descriptive names so the built-in id() is not shadowed by a DataFrame.
cv_results = pd.DataFrame(svr.cv_results_)
best_row = cv_results['rank_test_score'].idxmin()
cv_results.loc[best_row, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': svr.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('SVR Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

Horrible - .0162

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': svr.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# Gradient Boosting (scikit-learn GradientBoostingRegressor, not XGBoost)

In [None]:
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search scikit-learn's gradient boosting (the variable keeps its historical name `xgb`).
# 'alpha' and 'tol' are not searched: alpha only applies to the huber/quantile losses and tol
# only to early stopping (n_iter_no_change), neither of which is enabled here, so they
# quadrupled the number of fits without changing any score.
# random_state makes the fitted ensembles reproducible.
x = GradientBoostingRegressor(random_state=42)
xgb = GridSearchCV(x, {
    'learning_rate': [.05, .1, .3],
    'n_estimators': [10, 50, 100],
    'min_samples_split': [2, 3],
    'min_weight_fraction_leaf': [0, .05, .1, .2],
    'max_depth': [5, 6],
})
xgb.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best-ranked candidate
res = pd.DataFrame.from_dict(xgb.cv_results_)
minres = res.rank_test_score.idxmin()
res.loc[minres, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': xgb.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set.
# The title names the model that was actually fitted (scikit-learn gradient boosting);
# the previous 'XGBoost Regression' label referred to a library this notebook does not use.
plt.figure(figsize=(15,10))
plt.scatter(y_test, y_pred, label = 'Predicted vs Actual PERs')
plt.legend()
plt.grid()
plt.title('Gradient Boosting Regression')
plt.xlabel('Actual PER')
plt.ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

fourth best - .2417

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': xgb.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# Bayesian Ridge Regression

In [None]:
from sklearn import linear_model
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV

# Grid-search the convergence tolerance and the four Gamma-prior hyper-parameters,
# all of which share the same candidate values.
prior_values = [.000001, .00001, .0001, .001, .01, .1]
ridge_param_grid = {
    'tol': [.0001, .001, .01, .1],
    'alpha_1': prior_values,
    'alpha_2': prior_values,
    'lambda_1': prior_values,
    'lambda_2': prior_values,
}
rid = BayesianRidge()
ridge = GridSearchCV(estimator=rid, param_grid=ridge_param_grid)
ridge.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best-ranked candidate
res = pd.DataFrame.from_dict(ridge.cv_results_)
minres = res.rank_test_score.idxmin()
res.loc[minres, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': ridge.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Bayesian Ridge Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

fifth best - .2366

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first
p_pred = pd.DataFrame({'pPredict': ridge.predict(prospects)})
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
)
df_prospectpred

# Lasso Regression

In [None]:
from sklearn import linear_model
from sklearn.linear_model import Lasso

# Grid-search the L1 penalty strength and the optimisation tolerance
lasso_param_grid = {
    'alpha': [.0001, .001, .01, .1, 1.0, 2.0],
    'tol': [.0001, .001, .01, .1],
}
las = Lasso()
lasso = GridSearchCV(estimator=las, param_grid=lasso_param_grid)
lasso.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best-ranked candidate
res = pd.DataFrame.from_dict(lasso.cv_results_)
minres = res.rank_test_score.idxmin()
res.loc[minres, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': lasso.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Lasso Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

second best - .2726

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first; the old row position is kept in an 'index' column
p_pred = pd.DataFrame({'pPredict': lasso.predict(prospects)})
pd.set_option('display.max_rows', 102)
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
    .reset_index()
)
df_prospectpred

# Elastic Net Regression

In [None]:
from sklearn import linear_model
from sklearn.linear_model import ElasticNet

# Grid-search the penalty strength, the L1/L2 mix and the optimisation tolerance
en_param_grid = {
    'alpha': [.0001, .001, .01, .1],
    'l1_ratio': [.01, .1, .5, .9],
    'tol': [.0001, .001, .01, .1, .5, .9],
}
e = ElasticNet()
en = GridSearchCV(estimator=e, param_grid=en_param_grid)
en.fit(x_train, y_train)

In [None]:
# Hyper-parameters of the best-ranked candidate
res = pd.DataFrame.from_dict(en.cv_results_)
minres = res.rank_test_score.idxmin()
res.loc[minres, 'params']

In [None]:
# Test-set predictions as a one-column frame
y_pred = pd.DataFrame({'yPredict': en.predict(x_test)})
y_pred

## Scatter Plot

In [None]:
# Predicted vs actual PER on the test set
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(y_test, y_pred, label='Predicted vs Actual PERs')
ax.legend()
ax.grid()
ax.set_title('Elastic Net Regression')
ax.set_xlabel('Actual PER')
ax.set_ylabel('Predicted PER')
plt.show()

## R squared

In [None]:
from sklearn import metrics

# Coefficient of determination on the test set
metrics.r2_score(y_true=y_test, y_pred=y_pred)

BEST - .2729

## Root Mean Squared Error

In [None]:
import math

# Root of the mean squared error on the test set
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

## Mean Absolute Error

In [None]:
# Mean absolute error on the test set
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

## Prospect Prediction

In [None]:
# Predicted PER for every prospect, best first; the old row position is kept in an 'index' column
p_pred = pd.DataFrame({'pPredict': en.predict(prospects)})
pd.set_option('display.max_rows', 102)
df_prospectpred = (
    pd.concat([prospect_names, p_pred], axis=1)
    .sort_values('pPredict', ascending=False)
    .reset_index()
)
df_prospectpred