In [19]:
import pandas as pd
from scipy.stats import linregress
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [2]:
clean_data_df=pd.read_csv("2011-22_clean_NCAA_stats.csv")

In [3]:
# Set independent and dependent variables.
ast_per_game = clean_data_df['Assists Per. Game Rank']
ast_to_ratio = clean_data_df['Assist Turnover Ratio Rank']
blocks_per_game = clean_data_df['BlocksPerGame Rank']
fieldgoalprc = clean_data_df['Field Goal % Rank']
fieldgoalprcdef = clean_data_df['Field Goal % Defense Rank']
fouls_per_game = clean_data_df['Fouls Per Game Rank']
ftprc = clean_data_df['Free Throw % Rank']
reb_marg = clean_data_df['Rebound Margin Rank']
scoring_def = clean_data_df['Scoring Defense Rank']
scoring_margin = clean_data_df['Scoring Margin Rank']
scoring_off = clean_data_df['Scoring Offense Rank']
stls_per_game = clean_data_df['Steals Per Game Rank']
threept_fg_def = clean_data_df['Three Pt FG Defense Rank']
threept_fg_prc = clean_data_df['Three Pt FG % Rank']
threept_fg_per_game = clean_data_df['Three Pt FG Per Game Rank']
to_margin = clean_data_df['Turnover Margin Rank']
to_per_game = clean_data_df['Turnover Per Game Rank']
wl = clean_data_df['Win-Loss Rank']
tournament_wins = clean_data_df['Tournament Wins']
clean_data_df['Three Pt FG % Rank'] = clean_data_df['Three Pt FG % Rank'].fillna(0).astype("int64") 
clean_data_df['Turnover Margin Rank'] = clean_data_df['Turnover Margin Rank'].fillna(0).astype("int64") 

In [4]:
X=clean_data_df.drop(['Conference','Tournament Wins','Team'],axis=1)
y=clean_data_df['Tournament Wins'].values

In [5]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 8, 1: 8, 2: 8, 3: 8, 4: 8, 5: 8, 6: 8})

In [8]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1, solver='liblinear')

In [9]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.4214963119072708

In [10]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.45      0.97      0.62      0.66      0.42       854
          1       0.08      0.12      0.91      0.10      0.34      0.10        56
          2       0.03      0.33      0.74      0.05      0.50      0.24        21
          3       0.00      0.00      0.91      0.00      0.00      0.00        10
          4       0.00      0.00      0.96      0.00      0.00      0.00         4
          5       0.00      0.00      0.91      0.00      0.00      0.00         1
          6       0.00      0.00      0.98      0.00      0.00      0.00         3

avg / total       0.90      0.42      0.96      0.57      0.63      0.38       949



In [13]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled2, y_resampled2 = cc.fit_resample(X_train, y_train)

In [21]:
model2 = LogisticRegression(solver='liblinear', random_state=1)
model2.fit(X_resampled2, y_resampled2)

LogisticRegression(random_state=1, solver='liblinear')

In [22]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5753424657534246

In [23]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.63      0.80      0.76      0.71      0.50       854
          1       0.04      0.07      0.90      0.05      0.25      0.06        56
          2       0.05      0.14      0.94      0.08      0.37      0.12        21
          3       0.00      0.00      0.93      0.00      0.00      0.00        10
          4       0.00      0.00      0.95      0.00      0.00      0.00         4
          5       0.00      0.00      0.90      0.00      0.00      0.00         1
          6       0.00      0.00      0.97      0.00      0.00      0.00         3

avg / total       0.87      0.58      0.81      0.69      0.66      0.45       949



In [24]:
smote_enn = SMOTEENN(random_state=0)
X_resampled3, y_resampled3 = smote_enn.fit_resample(X_train, y_train)

In [26]:
model3 = LogisticRegression(solver='liblinear', random_state=1)
model3.fit(X_resampled3, y_resampled3)

LogisticRegression(random_state=1, solver='liblinear')

In [28]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5753424657534246

In [29]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.63      0.80      0.76      0.71      0.50       854
          1       0.04      0.07      0.90      0.05      0.25      0.06        56
          2       0.05      0.14      0.94      0.08      0.37      0.12        21
          3       0.00      0.00      0.93      0.00      0.00      0.00        10
          4       0.00      0.00      0.95      0.00      0.00      0.00         4
          5       0.00      0.00      0.90      0.00      0.00      0.00         1
          6       0.00      0.00      0.97      0.00      0.00      0.00         3

avg / total       0.87      0.58      0.81      0.69      0.66      0.45       949

