In [1]:
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from pathlib import Path  

In [2]:
# Read the merged team-level dataset from disk and preview the first rows.
mm_teams_merged_df = pd.read_csv(
    "../Module 20 NCAA stats/mm_teams_merged.csv",
)
mm_teams_merged_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Conference_Pac-12,Conference_Patriot,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_WAC,Conference_WCC
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Akron,278,177,162,70,119,192,272,64,22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alabama,66,185,36,184,200,304,132,76,320,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arizona,1,17,8,4,10,319,95,9,137,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arkansas,116,110,71,218,58,323,32,89,145,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Build the feature matrix: everything except the team identifier and the
# target column ("Tournament Wins").
X = mm_teams_merged_df.drop(["Team", "Tournament Wins"], axis=1)
X.head()

Unnamed: 0,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,Scoring Margin Rank,...,Conference_Pac-12,Conference_Patriot,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_WAC,Conference_WCC
0,44,141,343,156,90,349,100,51,200,77,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,278,177,162,70,119,192,272,64,22,46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,66,185,36,184,200,304,132,76,320,150,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,17,8,4,10,319,95,9,137,3,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,110,71,218,58,323,32,89,145,53,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Build the target vector: the "Tournament Wins" column of the merged frame.
y = mm_teams_merged_df.loc[:, "Tournament Wins"]
y.head()

0    0
1    0
2    0
3    2
4    3
Name: Tournament Wins, dtype: int64

### TODO: compare results with and without `stratify`

In [5]:
# Partition features and target into train and test sets.
# `stratify=y` is passed so the split is stratified on the target classes;
# the fixed `random_state` makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
    stratify=y,
)

In [6]:
# Standardize the features. The scaler is fitted on the training partition
# only and then applied to both partitions, so no statistics from the test
# set are used during fitting.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to each partition.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [7]:
# Inspect the scaled training matrix (the result of X_scaler.transform above).
X_train_scaled

array([[-1.23395418, -1.11984732,  0.10871923, ..., -0.11855895,
        -0.1268723 , -0.18088625],
       [-0.27193664,  0.08646749,  0.16320832, ..., -0.11855895,
        -0.1268723 , -0.18088625],
       [ 1.3543311 ,  0.03620437,  1.7869833 , ..., -0.11855895,
        -0.1268723 , -0.18088625],
       ...,
       [-0.40936772, -0.99418952,  0.65361016, ..., -0.11855895,
        -0.1268723 , -0.18088625],
       [-0.68422987, -0.73030816,  0.07602577, ..., -0.11855895,
        -0.1268723 , -0.18088625],
       [ 0.51829205,  0.11159905,  0.19590178, ..., -0.11855895,
        -0.1268723 , -0.18088625]])

### Apply random oversampling

In [8]:
from collections import Counter

In [9]:
# Random oversampling of the scaled training data.
# NOTE(review): X_resampled / y_resampled are not referenced by any later cell
# in this notebook; the models below are fitted on X_train_scaled / y_train.
# Confirm whether the resampled data was meant to be used for training.
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

# Per-class sample counts after resampling.
Counter(y_resampled)

Counter({0: 219, 1: 219, 4: 219, 2: 219, 6: 219, 3: 219, 5: 219})

### Apply model

Gradient Boosting Classifier

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Sweep several learning rates and report training vs. validation accuracy
# for each, so the rates can be compared side by side.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    )
    # y_train is a 1-D Series, so it is passed as-is. The previous
    # `y_train.ravel` handed fit() the bound method itself (it was never
    # called), which is not a valid target array.
    classifier.fit(X_train_scaled, y_train)

    # Reporting lives inside the loop so every learning rate prints its own
    # scores; as a separate, indented cell it could not run at all.
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(X_train_scaled, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(X_test_scaled, y_test)))

In [None]:
# Fit a single gradient-boosting model with learning_rate=0.5 and predict the
# test partition. fit() returns the estimator itself, so the prediction call
# is chained directly onto it.
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=5,
    max_depth=3,
    random_state=0,
)

predictions = classifier.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# Accuracy of the predictions on the test partition
# (accuracy_score is imported in the first cell).
acc_score = accuracy_score(y_test, predictions)
print("Accuracy Score : {}".format(acc_score))

In [11]:
# Fitting the model.
# `model` was never defined anywhere in this notebook, so on a fresh kernel
# the original `model = model.fit(...)` raised NameError. It is now created
# explicitly, with the same hyperparameters as the single-classifier cell in
# this section.
# NOTE(review): the outputs recorded below came from a `model` that only
# existed in kernel state; its type and settings are unknown, so re-running
# may not reproduce those exact numbers. Confirm the intended estimator.
model = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=5,
    max_depth=3,
    random_state=0,
)
model = model.fit(X_train_scaled, y_train)

In [12]:
# Predict the target for the scaled test partition with the fitted model.
predictions = model.predict(X_test_scaled)

In [13]:
# Calculating the confusion matrix.
# The class labels are derived from the data instead of hard-coding seven
# "Actual N" / "Predicted N" names: if a class were absent from both y_test
# and predictions, a fixed 7-label index would no longer match the matrix
# shape and the DataFrame constructor would raise.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6
Actual 0,39,24,1,5,2,0,2
Actual 1,22,17,8,3,1,0,0
Actual 2,8,4,3,3,2,0,2
Actual 3,4,2,4,1,0,1,0
Actual 4,1,1,1,1,1,0,0
Actual 5,0,1,2,0,0,0,0
Actual 6,2,0,0,1,0,0,0


In [14]:
# Calculating the accuracy score of the predictions against y_test.
acc_score = accuracy_score(y_test, predictions)

In [15]:
# Summarize the evaluation: confusion matrix table, overall accuracy, and the
# per-class precision/recall/F1 report.
print("Confusion Matrix")
display(cm_df)
print("Accuracy Score : {}".format(acc_score))
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6
Actual 0,39,24,1,5,2,0,2
Actual 1,22,17,8,3,1,0,0
Actual 2,8,4,3,3,2,0,2
Actual 3,4,2,4,1,0,1,0
Actual 4,1,1,1,1,1,0,0
Actual 5,0,1,2,0,0,0,0
Actual 6,2,0,0,1,0,0,0


Accuracy Score : 0.3609467455621302
Classification Report
              precision    recall  f1-score   support

           0       0.51      0.53      0.52        73
           1       0.35      0.33      0.34        51
           2       0.16      0.14      0.15        22
           3       0.07      0.08      0.08        12
           4       0.17      0.20      0.18         5
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         3

    accuracy                           0.36       169
   macro avg       0.18      0.18      0.18       169
weighted avg       0.36      0.36      0.36       169

