In [69]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

## Question
- **Target variable:** `wins`
- **Goal:** model wins based on counting stats

In [2]:
# Load the raw Big East season table from the project's data folder
csv_path = './data/bigeastncaabasketball.csv'
big_east_df = pd.read_csv(csv_path)

In [3]:
# Shape: (rows, columns) of the freshly loaded table
big_east_df.shape

(496, 37)

In [4]:
# Quick look at the first rows of the raw table
big_east_df.head()

Unnamed: 0,id,year,rank,school,games,wins,losses,win_percentage,conference_wins,conference_losses,...,offensive_rebounds,total_rebounds,assists,steals,blocks,turnovers,personal_fouls,points,opponent_points,simple_rating
0,1,2021,1,Villanova,38,30,8,0.789,16,4,...,10.3,34.8,11.9,6.0,2.2,9.9,14.9,71.7,62.7,19.31
1,2,2021,2,Providence,33,27,6,0.818,14,3,...,10.5,37.6,13.2,5.0,3.7,11.4,16.0,71.5,66.2,13.08
2,3,2021,3,UConn,33,23,10,0.697,13,6,...,13.8,40.4,14.0,5.9,6.4,11.8,16.8,74.8,65.3,16.4
3,4,2021,4,Creighton,35,23,12,0.657,12,7,...,9.6,38.1,13.3,5.5,4.3,14.1,13.6,69.2,66.4,11.34
4,5,2021,5,Marquette,32,19,13,0.594,11,8,...,7.8,34.8,16.0,7.8,5.2,12.4,17.4,74.0,71.6,11.36


## EDA

In [5]:
# List every column name available in the raw table
big_east_df.columns

Index(['id', 'year', 'rank', 'school', 'games', 'wins', 'losses',
       'win_percentage', 'conference_wins', 'conference_losses', 'home_wins',
       'home_losses', 'away_wins', 'away_losses', 'offensive_rating',
       'defensive_rating', 'net_rating', 'field_goals', 'field_goal_attempts',
       'field_goal_percentage', '3_pointers', '3_pointer_attempts',
       '3_pointer_percentage', 'effective_field_goal_percentage',
       'free_throws', 'free_throw_attempts', 'free_throw_percentage',
       'offensive_rebounds', 'total_rebounds', 'assists', 'steals', 'blocks',
       'turnovers', 'personal_fouls', 'points', 'opponent_points',
       'simple_rating'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes (shows which columns have missing values)
big_east_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 37 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               496 non-null    int64  
 1   year                             496 non-null    int64  
 2   rank                             496 non-null    int64  
 3   school                           496 non-null    object 
 4   games                            496 non-null    int64  
 5   wins                             496 non-null    int64  
 6   losses                           496 non-null    int64  
 7   win_percentage                   496 non-null    float64
 8   conference_wins                  496 non-null    int64  
 9   conference_losses                496 non-null    int64  
 10  home_wins                        288 non-null    float64
 11  home_losses                      288 non-null    float64
 12  away_wins             

In [7]:
# Columns removed before modeling so that only `wins` and the retained
# per-game stats remain.
# NOTE(review): the reason each column is excluded is not recorded in the
# notebook; the groupings below are only a reading aid - confirm with the author.
columns_to_drop = [
    # row identifiers / season metadata
    'id', 'year', 'rank', 'school', 'games',
    # win/loss records other than the target
    'losses', 'win_percentage',
    'conference_wins', 'conference_losses',
    'home_wins', 'home_losses',
    'away_wins', 'away_losses',
    # remaining exclusions, kept in their original order
    'simple_rating', 'personal_fouls', 'opponent_points',
    'offensive_rating', 'defensive_rating', 'net_rating',
    'offensive_rebounds',
]

In [8]:
# Remove the columns listed in columns_to_drop.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: labels already removed by an earlier run are
# skipped rather than raising KeyError.
big_east_df = big_east_df.drop(columns=columns_to_drop, errors='ignore')

In [9]:
# Confirm which columns remain after the drop
big_east_df.head()

Unnamed: 0,wins,field_goals,field_goal_attempts,field_goal_percentage,3_pointers,3_pointer_attempts,3_pointer_percentage,effective_field_goal_percentage,free_throws,free_throw_attempts,free_throw_percentage,total_rebounds,assists,steals,blocks,turnovers,points
0,30,24.3,56.1,0.432,9.3,26.0,0.359,0.515,13.9,16.7,0.83,34.8,11.9,6.0,2.2,9.9,71.7
1,27,24.5,56.3,0.435,7.4,21.4,0.344,0.5,15.2,20.8,0.728,37.6,13.2,5.0,3.7,11.4,71.5
2,23,26.7,61.2,0.435,7.4,21.1,0.352,0.496,14.1,18.7,0.753,40.4,14.0,5.9,6.4,11.8,74.8
3,23,25.6,57.2,0.448,6.7,21.8,0.308,0.506,11.3,15.3,0.738,38.1,13.3,5.5,4.3,14.1,69.2
4,19,26.9,59.8,0.45,8.6,25.0,0.344,0.521,11.6,15.7,0.741,34.8,16.0,7.8,5.2,12.4,74.0


In [10]:
# Check for Nulls and Datatypes

In [11]:
# Non-null counts and dtypes for the remaining columns
big_east_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   wins                             496 non-null    int64  
 1   field_goals                      496 non-null    float64
 2   field_goal_attempts              496 non-null    float64
 3   field_goal_percentage            496 non-null    float64
 4   3_pointers                       438 non-null    float64
 5   3_pointer_attempts               438 non-null    float64
 6   3_pointer_percentage             438 non-null    float64
 7   effective_field_goal_percentage  438 non-null    float64
 8   free_throws                      496 non-null    float64
 9   free_throw_attempts              496 non-null    float64
 10  free_throw_percentage            496 non-null    float64
 11  total_rebounds                   496 non-null    float64
 12  assists               

In [12]:
# Keep only the rows that have a value in every remaining column
big_east_df = big_east_df.dropna()

In [14]:
# Recheck: after dropna every column should report the same non-null count
big_east_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437 entries, 0 to 471
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   wins                             437 non-null    int64  
 1   field_goals                      437 non-null    float64
 2   field_goal_attempts              437 non-null    float64
 3   field_goal_percentage            437 non-null    float64
 4   3_pointers                       437 non-null    float64
 5   3_pointer_attempts               437 non-null    float64
 6   3_pointer_percentage             437 non-null    float64
 7   effective_field_goal_percentage  437 non-null    float64
 8   free_throws                      437 non-null    float64
 9   free_throw_attempts              437 non-null    float64
 10  free_throw_percentage            437 non-null    float64
 11  total_rebounds                   437 non-null    float64
 12  assists               

## Train Test Split, Scale

In [17]:
# Separate the feature matrix from the target column (`wins`)
X = big_east_df.drop(columns='wins')
y = big_east_df['wins']

In [18]:
# Hold out a test set; the split proportions are left at scikit-learn's
# defaults and random_state pins the shuffle so the split is reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [37]:
# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both splits so the test set never influences it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Linear Regression Model

In [39]:
# Build the ordinary least squares model and fit it on the scaled training
# data in one step (fit returns the fitted estimator itself).
lr_model = LinearRegression().fit(X_train_scaled, y_train)

In [41]:
# Predicted wins from the linear regression model for both splits
y_train_prediction, y_test_prediction = (
    lr_model.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)

In [44]:
# Evaluate the linear regression model using RMSE and R2.
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
train_rmse = mean_squared_error(y_train, y_train_prediction) ** 0.5
test_rmse = mean_squared_error(y_test, y_test_prediction) ** 0.5

# R2: share of the variance in wins explained by the model on each split
train_r2 = r2_score(y_train, y_train_prediction)
test_r2 = r2_score(y_test, y_test_prediction)

In [46]:
# Report train vs. test RMSE and R-squared for the linear regression model.
# Values are printed at full float precision, train and test separated by a tab.
print(f"""
RMSE
Train: {train_rmse} \t Test: {test_rmse}

R-squared
Train: {train_r2} \t Test: {test_r2}
""")


RMSE
Train: 3.3667184112128936 	 Test: 3.4593678495927223

R-squared
Train: 0.7001536156012553 	 Test: 0.7111216838784009



## Ridge Model

In [64]:
# Build the Ridge (L2-penalised) model with the 'sag' solver and fit it on the
# scaled training data; random_state fixes the solver's data shuffling.
ridge_model = Ridge(
    alpha=1,
    solver='sag',
    random_state=42,
).fit(X_train_scaled, y_train)

In [65]:
# Predicted wins from the Ridge model for both splits
y_train_prediction_r, y_test_prediction_r = (
    ridge_model.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)

In [66]:
# Evaluate the Ridge model using RMSE and R2.
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
train_rmse_r = mean_squared_error(y_train, y_train_prediction_r) ** 0.5
test_rmse_r = mean_squared_error(y_test, y_test_prediction_r) ** 0.5

# R2: share of the variance in wins explained by the model on each split
train_r2_r = r2_score(y_train, y_train_prediction_r)
test_r2_r = r2_score(y_test, y_test_prediction_r)

In [67]:
# Report train vs. test RMSE and R-squared for the Ridge model.
# Values are printed at full float precision, train and test separated by a tab.
print(f"""
RMSE
Train: {train_rmse_r} \t Test: {test_rmse_r}

R-squared
Train: {train_r2_r} \t Test: {test_r2_r}
""")


RMSE
Train: 3.3770215756520177 	 Test: 3.430993645286054

R-squared
Train: 0.6983155683281917 	 Test: 0.7158410878418255



## Lasso

In [71]:
# Build the Lasso (L1-penalised) model and fit it on the scaled training data.
# No alpha is passed, so scikit-learn's default regularisation strength is used.
# NOTE(review): this model's printed scores are well below the linear and
# Ridge cells; the untuned alpha is a likely cause - consider tuning it.
lasso_model = Lasso(random_state=42).fit(X_train_scaled, y_train)

In [72]:
# Predicted wins from the Lasso model for both splits
y_train_prediction_l, y_test_prediction_l = (
    lasso_model.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)

In [73]:
# Evaluate the Lasso model using RMSE and R2.
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
train_rmse_l = mean_squared_error(y_train, y_train_prediction_l) ** 0.5
test_rmse_l = mean_squared_error(y_test, y_test_prediction_l) ** 0.5

# R2: share of the variance in wins explained by the model on each split
train_r2_l = r2_score(y_train, y_train_prediction_l)
test_r2_l = r2_score(y_test, y_test_prediction_l)

In [74]:
# Report train vs. test RMSE and R-squared for the Lasso model.
# Values are printed at full float precision, train and test separated by a tab.
print(f"""
RMSE
Train: {train_rmse_l} \t Test: {test_rmse_l}

R-squared
Train: {train_r2_l} \t Test: {test_r2_l}
""")


RMSE
Train: 4.560127135201716 	 Test: 4.891443445023222

R-squared
Train: 0.44990318534510554 	 Test: 0.42244210180342423

