In [1]:
import pandas as pd
import matplotlib as plt
from sklearn.datasets import make_blobs
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from pathlib import Path  
from sklearn.ensemble import RandomForestClassifier

In [2]:
master_clean_df = pd.read_csv("../Module 20 NCAA stats/NCAA_master_clean.csv")
master_clean_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Scoring Offense Rank,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,Conference
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,45,26,56,190,266,44,296,75,0,Southland
1,Abilene Christian,52,76,338,194,204,350,52,284,126,...,44,2,186,262,229,1,178,60,0,WAC
2,Air Force,206,265,190,203,318,163,339,342,78,...,347,284,57,210,195,235,272,259,0,Mountain West
3,Akron,278,177,162,70,119,192,272,64,22,...,187,251,228,78,131,162,63,51,0,MAC
4,Alabama A&M,347,346,164,349,27,73,179,266,107,...,343,113,93,348,349,180,314,255,0,SWAC


In [3]:
master_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3793 entries, 0 to 3792
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        3793 non-null   object
 1   Assists Per. Game Rank      3793 non-null   int64 
 2   Assist Turnover Ratio Rank  3793 non-null   int64 
 3   BlocksPerGame Rank          3793 non-null   int64 
 4   Field Goal % Rank           3793 non-null   int64 
 5   Field Goal % Defense Rank   3793 non-null   int64 
 6   Fouls Per Game Rank         3793 non-null   int64 
 7   Free Throw % Rank           3793 non-null   int64 
 8   Rebound Margin Rank         3793 non-null   int64 
 9   Scoring Defense Rank        3793 non-null   int64 
 10  Scoring Margin Rank         3793 non-null   int64 
 11  Scoring Offense Rank        3793 non-null   int64 
 12  Steals Per Game Rank        3793 non-null   int64 
 13  Three Pt FG Defense Rank    3793 non-null   int6

In [4]:
# Generate our categorical variable list
object_column_list= master_clean_df.dtypes[master_clean_df.dtypes == "object"].index.tolist()

In [5]:
object_column_list

['Team', 'Conference']

In [6]:
# Check the number of unique values in each column
master_clean_df[object_column_list].nunique()

Team          356
Conference     39
dtype: int64

In [7]:
# from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(master_clean_df["Conference"].values.reshape(-1,1)))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(["Conference"])
encode_df.head()

Unnamed: 0,Conference_AAC,Conference_ACC,Conference_ASUN,Conference_America East,Conference_Atlantic 10,Conference_Big 12,Conference_Big East,Conference_Big Sky,Conference_Big South,Conference_Big Ten,...,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_UTRGV,Conference_Utah Valley,Conference_WAC,Conference_WCC
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Merge the two DataFrames together and drop the Country column
master_clean_merged_df = master_clean_df.merge(encode_df,left_index=True,right_index=True).drop("Conference",1)
master_clean_merged_df.head()

  


Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_UTRGV,Conference_Utah Valley,Conference_WAC,Conference_WCC
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Abilene Christian,52,76,338,194,204,350,52,284,126,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Air Force,206,265,190,203,318,163,339,342,78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Akron,278,177,162,70,119,192,272,64,22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alabama A&M,347,346,164,349,27,73,179,266,107,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Define the features set.
X = master_clean_merged_df.drop(columns= ["Team", "Tournament Wins"])
X.head()

Unnamed: 0,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,Scoring Margin Rank,...,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_UTRGV,Conference_Utah Valley,Conference_WAC,Conference_WCC
0,44,141,343,156,90,349,100,51,200,77,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52,76,338,194,204,350,52,284,126,33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,206,265,190,203,318,163,339,342,78,314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,278,177,162,70,119,192,272,64,22,46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,347,346,164,349,27,73,179,266,107,299,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3793 entries, 0 to 3792
Data columns (total 57 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Assists Per. Game Rank      3793 non-null   int64  
 1   Assist Turnover Ratio Rank  3793 non-null   int64  
 2   BlocksPerGame Rank          3793 non-null   int64  
 3   Field Goal % Rank           3793 non-null   int64  
 4   Field Goal % Defense Rank   3793 non-null   int64  
 5   Fouls Per Game Rank         3793 non-null   int64  
 6   Free Throw % Rank           3793 non-null   int64  
 7   Rebound Margin Rank         3793 non-null   int64  
 8   Scoring Defense Rank        3793 non-null   int64  
 9   Scoring Margin Rank         3793 non-null   int64  
 10  Scoring Offense Rank        3793 non-null   int64  
 11  Steals Per Game Rank        3793 non-null   int64  
 12  Three Pt FG Defense Rank    3793 non-null   int64  
 13  Three Pt FG % Rank          3793 

In [12]:
# Define the target set.
y = master_clean_merged_df["Tournament Wins"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Tournament Wins, dtype: int64

In [13]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
X_train_scaled

array([[ 1.49453707,  1.60540635, -0.96622317, ..., -0.01875476,
        -0.1553279 , -0.17338273],
       [-0.59452587, -0.27693769, -1.59782951, ..., -0.01875476,
        -0.1553279 , -0.17338273],
       [ 1.05473434, -0.54727433,  0.02630107, ..., -0.01875476,
        -0.1553279 , -0.17338273],
       ...,
       [ 0.52497197, -0.3069751 , -0.94617218, ..., -0.01875476,
        -0.1553279 , -0.17338273],
       [ 1.23465364,  1.32505723,  0.6779584 , ..., -0.01875476,
        -0.1553279 , -0.17338273],
       [-0.18470969,  0.72430914, -1.16673312, ..., -0.01875476,
        -0.1553279 , -0.17338273]])

In [16]:
# different step
# Create a random forest classifier. keep n_estimators between 64-128; 
# changing to 500 changes results very little
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [17]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [18]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [19]:
predictions.shape

(949,)

In [20]:
master_clean_df.shape

(3793, 21)

In [21]:
X_test.shape

(949, 57)

In [22]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [23]:
X_test['Prediction'] = predictions
X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,Scoring Margin Rank,...,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_UTRGV,Conference_Utah Valley,Conference_WAC,Conference_WCC,Prediction
1266,275,278,276,297,289,106,209,243,24,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2404,132,173,20,33,12,244,170,127,15,39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
510,188,81,297,193,250,145,89,270,291,215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
409,290,331,105,324,301,144,131,319,338,334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2841,68,106,172,235,335,253,199,329,320,313,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [24]:
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay


In [25]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)
# confusion_matrix(y_test, predictions).ravel()
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4", "Actual 5", "Actual 6"], columns=["Predicted 0", "Predicted 1", "Predicted 2","Predicted 3", "Predicted 4","Predicted 5", "Predicted 6"])

cm_df
# cm
# true_negative, false_positive, false_negative, true_positive

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6
Actual 0,853,0,1,0,0,0,0
Actual 1,55,1,0,0,0,0,0
Actual 2,19,2,0,0,0,0,0
Actual 3,9,0,0,1,0,0,0
Actual 4,4,0,0,0,0,0,0
Actual 5,1,0,0,0,0,0,0
Actual 6,3,0,0,0,0,0,0


In [26]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [27]:
# Displaying results
print("Confusion Matrix")
# display(cm_df)
display(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


array([[853,   0,   1,   0,   0,   0,   0],
       [ 55,   1,   0,   0,   0,   0,   0],
       [ 19,   2,   0,   0,   0,   0,   0],
       [  9,   0,   0,   1,   0,   0,   0],
       [  4,   0,   0,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0,   0],
       [  3,   0,   0,   0,   0,   0,   0]], dtype=int64)

Accuracy Score : 0.9009483667017913
Classification Report
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       854
           1       0.33      0.02      0.03        56
           2       0.00      0.00      0.00        21
           3       1.00      0.10      0.18        10
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3

    accuracy                           0.90       949
   macro avg       0.32      0.16      0.17       949
weighted avg       0.84      0.90      0.86       949



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# new step
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([3.94469492e-02, 6.07766155e-02, 5.72179112e-02, 4.41557509e-02,
       5.85847849e-02, 4.29954440e-02, 3.98803432e-02, 5.42597245e-02,
       4.48884458e-02, 8.40550832e-02, 4.50804317e-02, 4.14516944e-02,
       4.23534509e-02, 3.59199271e-02, 3.81255030e-02, 4.33666496e-02,
       4.19441424e-02, 8.07443616e-02, 4.80954048e-03, 1.07998022e-02,
       1.01689224e-03, 8.07702224e-04, 3.82724966e-03, 1.29589165e-02,
       7.38248346e-03, 2.48831667e-04, 5.13256987e-04, 2.17273630e-02,
       9.53318087e-04, 9.87394056e-04, 7.84954851e-04, 0.00000000e+00,
       0.00000000e+00, 1.00944034e-03, 1.52215954e-07, 1.37338076e-03,
       0.00000000e+00, 6.09545597e-04, 7.43187594e-04, 1.61051057e-03,
       1.21370959e-03, 1.36842174e-03, 1.86048556e-03, 0.00000000e+00,
       6.19454320e-04, 1.05121004e-02, 1.73299559e-04, 1.07987524e-02,
       1.62087750e-03, 3.14598226e-04, 4.52683309e-04, 3.87253970e-04,
       6.47124767e-04, 0.00000000e+00, 4.63138756e-07, 5.47282045e-04,
      

In [29]:
# new step
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.08405508324041108, 'Scoring Margin Rank'),
 (0.08074436155305346, 'Win-Loss Rank'),
 (0.060776615545311725, 'Assist Turnover Ratio Rank'),
 (0.05858478488425158, 'Field Goal % Defense Rank'),
 (0.05721791119794502, 'BlocksPerGame Rank'),
 (0.054259724516192105, 'Rebound Margin Rank'),
 (0.04508043165632425, 'Scoring Offense Rank'),
 (0.044888445827964134, 'Scoring Defense Rank'),
 (0.044155750887833814, 'Field Goal % Rank'),
 (0.043366649624348455, 'Turnover Margin Rank'),
 (0.042995444003722016, 'Fouls Per Game Rank'),
 (0.04235345094446517, 'Three Pt FG Defense Rank'),
 (0.0419441424496612, 'Turnover Per Game Rank'),
 (0.04145169438357474, 'Steals Per Game Rank'),
 (0.039880343202077354, 'Free Throw % Rank'),
 (0.039446949195979894, 'Assists Per. Game Rank'),
 (0.03812550295173962, 'Three Pt FG Per Game Rank'),
 (0.035919927118220875, 'Three Pt FG % Rank'),
 (0.02172736301746577, 'Conference_Big Ten'),
 (0.012958916480219414, 'Conference_Big 12'),
 (0.010799802196060158, 'Confere

### clean file so that only tournament teams are represented, drop all other teams; prevent skewing by large amounts of 0s

In [55]:
tournament_teams_df = pd.read_csv("../Module 20 NCAA stats/tournament_teams_only.csv")
tournament_teams_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,Scoring Margin Rank,Scoring Offense Rank,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins
0,A&M-Corpus Christi (Southland),44,141,343,156,90,349,100,51,200,77,45,26,56,190,266,44,296,75,0.0
1,Abilene Christian (WAC),52,76,338,194,204,350,52,284,126,33,44,2,186,262,229,1,178,60,
2,Air Force (Mountain West),206,265,190,203,318,163,339,342,78,314,347,284,57,210,195,235,272,259,
3,Akron (MAC),278,177,162,70,119,192,272,64,22,46,187,251,228,78,131,162,63,51,0.0
4,Alabama A&M (SWAC),347,346,164,349,27,73,179,266,107,299,343,113,93,348,349,180,314,255,


In [56]:
tournament_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        3794 non-null   object
 1   Assists Per. Game Rank      3794 non-null   object
 2   Assist Turnover Ratio Rank  3794 non-null   object
 3   BlocksPerGame Rank          3794 non-null   object
 4   Field Goal % Rank           3794 non-null   object
 5   Field Goal % Defense Rank   3794 non-null   object
 6   Fouls Per Game Rank         3794 non-null   object
 7   Free Throw % Rank           3794 non-null   object
 8   Rebound Margin Rank         3794 non-null   object
 9   Scoring Defense Rank        3794 non-null   object
 10  Scoring Margin Rank         3794 non-null   object
 11  Scoring Offense Rank        3794 non-null   object
 12  Steals Per Game Rank        3794 non-null   object
 13  Three Pt FG Defense Rank    3794 non-null   obje

In [57]:
# Extract conference name out of Team Column: (\((\w*?\s*-*\w*?)\)$)
tournament_teams_df[["index",'Conference']] = tournament_teams_df['Team'].str.extract(r'(\((\w*?\s*-*\w*?)\)$)')
tournament_teams_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,index,Conference
0,A&M-Corpus Christi (Southland),44,141,343,156,90,349,100,51,200,...,26,56,190,266,44,296,75,0.0,(Southland),Southland
1,Abilene Christian (WAC),52,76,338,194,204,350,52,284,126,...,2,186,262,229,1,178,60,,(WAC),WAC
2,Air Force (Mountain West),206,265,190,203,318,163,339,342,78,...,284,57,210,195,235,272,259,,(Mountain West),Mountain West
3,Akron (MAC),278,177,162,70,119,192,272,64,22,...,251,228,78,131,162,63,51,0.0,(MAC),MAC
4,Alabama A&M (SWAC),347,346,164,349,27,73,179,266,107,...,113,93,348,349,180,314,255,,(SWAC),SWAC


In [58]:
# delete conference name out of Team Column: (\s\(\w*?\s*-*\w*?\)$)
tournament_teams_df['Team'] = tournament_teams_df['Team'].str.replace((r'(\s\(\w*?\s*-*\w*?\)$)'), "")
tournament_teams_df.head()

  


Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,index,Conference
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,26,56,190,266,44,296,75,0.0,(Southland),Southland
1,Abilene Christian,52,76,338,194,204,350,52,284,126,...,2,186,262,229,1,178,60,,(WAC),WAC
2,Air Force,206,265,190,203,318,163,339,342,78,...,284,57,210,195,235,272,259,,(Mountain West),Mountain West
3,Akron,278,177,162,70,119,192,272,64,22,...,251,228,78,131,162,63,51,0.0,(MAC),MAC
4,Alabama A&M,347,346,164,349,27,73,179,266,107,...,113,93,348,349,180,314,255,,(SWAC),SWAC


In [59]:
tournament_teams_df = tournament_teams_df.drop(columns = ["index"])
tournament_teams_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Scoring Offense Rank,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,Conference
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,45,26,56,190,266,44,296,75,0.0,Southland
1,Abilene Christian,52,76,338,194,204,350,52,284,126,...,44,2,186,262,229,1,178,60,,WAC
2,Air Force,206,265,190,203,318,163,339,342,78,...,347,284,57,210,195,235,272,259,,Mountain West
3,Akron,278,177,162,70,119,192,272,64,22,...,187,251,228,78,131,162,63,51,0.0,MAC
4,Alabama A&M,347,346,164,349,27,73,179,266,107,...,343,113,93,348,349,180,314,255,,SWAC


In [60]:
# teams_path = Path("../Module 20 NCAA stats/tournament_teams.csv")
# tournament_teams_df.to_csv(teams_path, index = False)  

In [61]:
mm_teams_df = pd.read_csv("../Module 20 NCAA stats/tournament_teams.csv")
mm_teams_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Scoring Offense Rank,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,Conference
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,45,26,56,190,266,44,296,75,0.0,Southland
1,Abilene Christian,52,76,338,194,204,350,52,284,126,...,44,2,186,262,229,1,178,60,,WAC
2,Air Force,206,265,190,203,318,163,339,342,78,...,347,284,57,210,195,235,272,259,,Mountain West
3,Akron,278,177,162,70,119,192,272,64,22,...,187,251,228,78,131,162,63,51,0.0,MAC
4,Alabama A&M,347,346,164,349,27,73,179,266,107,...,343,113,93,348,349,180,314,255,,SWAC


In [62]:
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        3794 non-null   object
 1   Assists Per. Game Rank      3794 non-null   object
 2   Assist Turnover Ratio Rank  3794 non-null   object
 3   BlocksPerGame Rank          3794 non-null   object
 4   Field Goal % Rank           3794 non-null   object
 5   Field Goal % Defense Rank   3794 non-null   object
 6   Fouls Per Game Rank         3794 non-null   object
 7   Free Throw % Rank           3794 non-null   object
 8   Rebound Margin Rank         3794 non-null   object
 9   Scoring Defense Rank        3794 non-null   object
 10  Scoring Margin Rank         3794 non-null   object
 11  Scoring Offense Rank        3794 non-null   object
 12  Steals Per Game Rank        3794 non-null   object
 13  Three Pt FG Defense Rank    3794 non-null   obje

In [63]:
 column_names = list(mm_teams_df.columns.values.tolist())

In [64]:
column_names

['Team',
 'Assists Per. Game Rank',
 'Assist Turnover Ratio Rank',
 'BlocksPerGame Rank',
 'Field Goal % Rank',
 'Field Goal % Defense Rank',
 'Fouls Per Game Rank',
 'Free Throw % Rank',
 'Rebound Margin Rank',
 'Scoring Defense Rank',
 'Scoring Margin Rank',
 'Scoring Offense Rank',
 'Steals Per Game Rank',
 'Three Pt FG Defense Rank',
 'Three Pt FG % Rank',
 'Three Pt FG Per Game Rank',
 'Turnover Margin Rank',
 'Turnover Per Game Rank',
 'Win-Loss Rank',
 'Tournament Wins',
 'Conference']

In [65]:
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3805 entries, 0 to 3804
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        3794 non-null   object
 1   Assists Per. Game Rank      3794 non-null   object
 2   Assist Turnover Ratio Rank  3794 non-null   object
 3   BlocksPerGame Rank          3794 non-null   object
 4   Field Goal % Rank           3794 non-null   object
 5   Field Goal % Defense Rank   3794 non-null   object
 6   Fouls Per Game Rank         3794 non-null   object
 7   Free Throw % Rank           3794 non-null   object
 8   Rebound Margin Rank         3794 non-null   object
 9   Scoring Defense Rank        3794 non-null   object
 10  Scoring Margin Rank         3794 non-null   object
 11  Scoring Offense Rank        3794 non-null   object
 12  Steals Per Game Rank        3794 non-null   object
 13  Three Pt FG Defense Rank    3794 non-null   obje

In [66]:
mm_teams_df = mm_teams_df.dropna(subset=['Tournament Wins'])
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 3795
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        675 non-null    object
 1   Assists Per. Game Rank      675 non-null    object
 2   Assist Turnover Ratio Rank  675 non-null    object
 3   BlocksPerGame Rank          675 non-null    object
 4   Field Goal % Rank           675 non-null    object
 5   Field Goal % Defense Rank   675 non-null    object
 6   Fouls Per Game Rank         675 non-null    object
 7   Free Throw % Rank           675 non-null    object
 8   Rebound Margin Rank         675 non-null    object
 9   Scoring Defense Rank        675 non-null    object
 10  Scoring Margin Rank         675 non-null    object
 11  Scoring Offense Rank        675 non-null    object
 12  Steals Per Game Rank        675 non-null    object
 13  Three Pt FG Defense Rank    675 non-null    objec

In [67]:
mm_teams_df["Turnover Margin Rank"] = mm_teams_df["Turnover Margin Rank"].fillna(0)
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 3795
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        675 non-null    object
 1   Assists Per. Game Rank      675 non-null    object
 2   Assist Turnover Ratio Rank  675 non-null    object
 3   BlocksPerGame Rank          675 non-null    object
 4   Field Goal % Rank           675 non-null    object
 5   Field Goal % Defense Rank   675 non-null    object
 6   Fouls Per Game Rank         675 non-null    object
 7   Free Throw % Rank           675 non-null    object
 8   Rebound Margin Rank         675 non-null    object
 9   Scoring Defense Rank        675 non-null    object
 10  Scoring Margin Rank         675 non-null    object
 11  Scoring Offense Rank        675 non-null    object
 12  Steals Per Game Rank        675 non-null    object
 13  Three Pt FG Defense Rank    675 non-null    objec

In [68]:
mm_teams_df["Conference"] = mm_teams_df["Conference"].fillna("None")
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 3795
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        675 non-null    object
 1   Assists Per. Game Rank      675 non-null    object
 2   Assist Turnover Ratio Rank  675 non-null    object
 3   BlocksPerGame Rank          675 non-null    object
 4   Field Goal % Rank           675 non-null    object
 5   Field Goal % Defense Rank   675 non-null    object
 6   Fouls Per Game Rank         675 non-null    object
 7   Free Throw % Rank           675 non-null    object
 8   Rebound Margin Rank         675 non-null    object
 9   Scoring Defense Rank        675 non-null    object
 10  Scoring Margin Rank         675 non-null    object
 11  Scoring Offense Rank        675 non-null    object
 12  Steals Per Game Rank        675 non-null    object
 13  Three Pt FG Defense Rank    675 non-null    objec

In [69]:
column_names

['Team',
 'Assists Per. Game Rank',
 'Assist Turnover Ratio Rank',
 'BlocksPerGame Rank',
 'Field Goal % Rank',
 'Field Goal % Defense Rank',
 'Fouls Per Game Rank',
 'Free Throw % Rank',
 'Rebound Margin Rank',
 'Scoring Defense Rank',
 'Scoring Margin Rank',
 'Scoring Offense Rank',
 'Steals Per Game Rank',
 'Three Pt FG Defense Rank',
 'Three Pt FG % Rank',
 'Three Pt FG Per Game Rank',
 'Turnover Margin Rank',
 'Turnover Per Game Rank',
 'Win-Loss Rank',
 'Tournament Wins',
 'Conference']

In [70]:
columns_to_int = ['Assists Per. Game Rank',
 'Assist Turnover Ratio Rank',
 'BlocksPerGame Rank',
 'Field Goal % Rank',
 'Field Goal % Defense Rank',
 'Fouls Per Game Rank',
 'Free Throw % Rank',
 'Rebound Margin Rank',
 'Scoring Defense Rank',
 'Scoring Margin Rank',
 'Scoring Offense Rank',
 'Steals Per Game Rank',
 'Three Pt FG Defense Rank',
 'Three Pt FG % Rank',
 'Three Pt FG Per Game Rank',
 'Turnover Margin Rank',
 'Turnover Per Game Rank',
 'Win-Loss Rank',
 'Tournament Wins']


In [71]:
columns_to_int

['Assists Per. Game Rank',
 'Assist Turnover Ratio Rank',
 'BlocksPerGame Rank',
 'Field Goal % Rank',
 'Field Goal % Defense Rank',
 'Fouls Per Game Rank',
 'Free Throw % Rank',
 'Rebound Margin Rank',
 'Scoring Defense Rank',
 'Scoring Margin Rank',
 'Scoring Offense Rank',
 'Steals Per Game Rank',
 'Three Pt FG Defense Rank',
 'Three Pt FG % Rank',
 'Three Pt FG Per Game Rank',
 'Turnover Margin Rank',
 'Turnover Per Game Rank',
 'Win-Loss Rank',
 'Tournament Wins']

In [47]:
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 675 entries, 0 to 3795
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        675 non-null    object
 1   Assists Per. Game Rank      675 non-null    object
 2   Assist Turnover Ratio Rank  675 non-null    object
 3   BlocksPerGame Rank          675 non-null    object
 4   Field Goal % Rank           675 non-null    object
 5   Field Goal % Defense Rank   675 non-null    object
 6   Fouls Per Game Rank         675 non-null    object
 7   Free Throw % Rank           675 non-null    object
 8   Rebound Margin Rank         675 non-null    object
 9   Scoring Defense Rank        675 non-null    object
 10  Scoring Margin Rank         675 non-null    object
 11  Scoring Offense Rank        675 non-null    object
 12  Steals Per Game Rank        675 non-null    object
 13  Three Pt FG Defense Rank    675 non-null    objec

In [72]:
# mm_teams_df_path = Path("../Module 20 NCAA stats/mm_teams.csv")
# mm_teams_df.to_csv(mm_teams_df_path, index = False)  

In [73]:
mm_teams_df = pd.read_csv("../Module 20 NCAA stats/mm_teams.csv")
mm_teams_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Scoring Offense Rank,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,Conference
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,45,26,56,190,266,44,296,75,0,Southland
1,Akron,278,177,162,70,119,192,272,64,22,...,187,251,228,78,131,162,63,51,0,MAC
2,Alabama,66,185,36,184,200,304,132,76,320,...,13,116,199,300,27,275,339,142,0,SEC
3,Arizona,1,17,8,4,10,319,95,9,137,...,3,147,139,95,139,232,110,2,2,Pac-12
4,Arkansas,116,110,71,218,58,323,32,89,145,...,50,72,125,320,279,40,287,36,3,SEC


In [74]:
mm_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674 entries, 0 to 673
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Team                        674 non-null    object
 1   Assists Per. Game Rank      674 non-null    int64 
 2   Assist Turnover Ratio Rank  674 non-null    int64 
 3   BlocksPerGame Rank          674 non-null    int64 
 4   Field Goal % Rank           674 non-null    int64 
 5   Field Goal % Defense Rank   674 non-null    int64 
 6   Fouls Per Game Rank         674 non-null    int64 
 7   Free Throw % Rank           674 non-null    int64 
 8   Rebound Margin Rank         674 non-null    int64 
 9   Scoring Defense Rank        674 non-null    int64 
 10  Scoring Margin Rank         674 non-null    int64 
 11  Scoring Offense Rank        674 non-null    int64 
 12  Steals Per Game Rank        674 non-null    int64 
 13  Three Pt FG Defense Rank    674 non-null    int64 

In [75]:
# mm_teams_df['Assists Per. Game Rank'] = mm_teams_df['Assists Per. Game Rank'].astype("int64")
# mm_teams_df.info()

In [76]:
# mm_teams_df_path = Path("../Module 20 NCAA stats/mm_teams.csv")
# mm_teams_df.to_csv(mm_teams_df_path, index = False)  

In [77]:
# mm_teams_df = pd.read_csv("../Module 20 NCAA stats/mm_teams.csv")
# mm_teams_df.info()

In [78]:
mm_teams_df.head()

Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Scoring Offense Rank,Steals Per Game Rank,Three Pt FG Defense Rank,Three Pt FG % Rank,Three Pt FG Per Game Rank,Turnover Margin Rank,Turnover Per Game Rank,Win-Loss Rank,Tournament Wins,Conference
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,45,26,56,190,266,44,296,75,0,Southland
1,Akron,278,177,162,70,119,192,272,64,22,...,187,251,228,78,131,162,63,51,0,MAC
2,Alabama,66,185,36,184,200,304,132,76,320,...,13,116,199,300,27,275,339,142,0,SEC
3,Arizona,1,17,8,4,10,319,95,9,137,...,3,147,139,95,139,232,110,2,2,Pac-12
4,Arkansas,116,110,71,218,58,323,32,89,145,...,50,72,125,320,279,40,287,36,3,SEC


### re-run randomforestclassifer model on new data

In [79]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(mm_teams_df["Conference"].values.reshape(-1,1)))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(["Conference"])
encode_df.head()

Unnamed: 0,Conference_AAC,Conference_ACC,Conference_ASUN,Conference_America East,Conference_Atlantic 10,Conference_Big 12,Conference_Big East,Conference_Big Sky,Conference_Big South,Conference_Big Ten,...,Conference_Pac-12,Conference_Patriot,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_WAC,Conference_WCC
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
# Merge the two DataFrames together and drop the Conference column
mm_teams_merged_df = mm_teams_df.merge(encode_df,left_index=True,right_index=True).drop("Conference",1)
mm_teams_merged_df.head()

  


Unnamed: 0,Team,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,...,Conference_Pac-12,Conference_Patriot,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_WAC,Conference_WCC
0,A&M-Corpus Christi,44,141,343,156,90,349,100,51,200,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Akron,278,177,162,70,119,192,272,64,22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alabama,66,185,36,184,200,304,132,76,320,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arizona,1,17,8,4,10,319,95,9,137,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arkansas,116,110,71,218,58,323,32,89,145,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
# Define the features set.
X = mm_teams_merged_df.drop(columns= ["Team", "Tournament Wins"])
X.head()

Unnamed: 0,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,Scoring Margin Rank,...,Conference_Pac-12,Conference_Patriot,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_WAC,Conference_WCC
0,44,141,343,156,90,349,100,51,200,77,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,278,177,162,70,119,192,272,64,22,46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,66,185,36,184,200,304,132,76,320,150,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,17,8,4,10,319,95,9,137,3,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,110,71,218,58,323,32,89,145,53,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674 entries, 0 to 673
Data columns (total 51 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Assists Per. Game Rank      674 non-null    int64  
 1   Assist Turnover Ratio Rank  674 non-null    int64  
 2   BlocksPerGame Rank          674 non-null    int64  
 3   Field Goal % Rank           674 non-null    int64  
 4   Field Goal % Defense Rank   674 non-null    int64  
 5   Fouls Per Game Rank         674 non-null    int64  
 6   Free Throw % Rank           674 non-null    int64  
 7   Rebound Margin Rank         674 non-null    int64  
 8   Scoring Defense Rank        674 non-null    int64  
 9   Scoring Margin Rank         674 non-null    int64  
 10  Scoring Offense Rank        674 non-null    int64  
 11  Steals Per Game Rank        674 non-null    int64  
 12  Three Pt FG Defense Rank    674 non-null    int64  
 13  Three Pt FG % Rank          674 non

In [83]:
# Define the target set.
y = mm_teams_merged_df["Tournament Wins"]
y.head()

0    0
1    0
2    0
3    2
4    3
Name: Tournament Wins, dtype: int64

In [84]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [85]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [86]:
X_train_scaled

array([[ 0.32275192, -1.05459901, -0.46796667, ..., -0.10965422,
        -0.11855895, -0.17496355],
       [ 0.70878338, -0.54908419, -1.30618511, ..., -0.10965422,
        -0.11855895, -0.17496355],
       [ 1.08275135,  1.0812011 , -0.88163292, ..., -0.10965422,
        -0.11855895, -0.17496355],
       ...,
       [ 0.56402158,  0.6515135 ,  0.88189159, ..., -0.10965422,
        -0.11855895, -0.17496355],
       [-0.12359695, -0.82711734,  2.35149535, ..., -0.10965422,
        -0.11855895, -0.17496355],
       [-0.67851718, -0.67546289,  1.16492639, ..., -0.10965422,
        -0.11855895, -0.17496355]])

In [87]:
# different step
# Create a random forest classifier. keep n_estimators between 64-128; 
# changing to 500 changes results very little
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [88]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [89]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [90]:
X_test['Prediction'] = predictions
X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Assists Per. Game Rank,Assist Turnover Ratio Rank,BlocksPerGame Rank,Field Goal % Rank,Field Goal % Defense Rank,Fouls Per Game Rank,Free Throw % Rank,Rebound Margin Rank,Scoring Defense Rank,Scoring Margin Rank,...,Conference_Patriot,Conference_SEC,Conference_SWAC,Conference_SoCon,Conference_Southland,Conference_Summit League,Conference_Sun Belt,Conference_WAC,Conference_WCC,Prediction
543,322,255,126,301,23,118,59,174,55,161,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
350,303,106,177,162,202,156,133,284,300,102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
273,100,67,22,141,172,229,162,231,104,23,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
662,293,322,60,304,18,284,278,239,53,179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
56,125,181,126,33,8,294,236,24,11,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [91]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)
# confusion_matrix(y_test, predictions).ravel()
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4", "Actual 5", "Actual 6"], columns=["Predicted 0", "Predicted 1", "Predicted 2","Predicted 3", "Predicted 4","Predicted 5", "Predicted 6"])

cm_df
# cm
# true_negative, false_positive, false_negative, true_positive

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6
Actual 0,63,9,1,0,0,0,0
Actual 1,43,12,2,0,0,0,0
Actual 2,7,11,3,0,0,0,0
Actual 3,6,1,2,0,0,0,0
Actual 4,2,0,1,0,0,0,0
Actual 5,1,1,1,0,0,0,0
Actual 6,0,2,0,1,0,0,0


In [92]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [93]:
# Displaying results
print("Confusion Matrix")
# display(cm_df)
display(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


array([[63,  9,  1,  0,  0,  0,  0],
       [43, 12,  2,  0,  0,  0,  0],
       [ 7, 11,  3,  0,  0,  0,  0],
       [ 6,  1,  2,  0,  0,  0,  0],
       [ 2,  0,  1,  0,  0,  0,  0],
       [ 1,  1,  1,  0,  0,  0,  0],
       [ 0,  2,  0,  1,  0,  0,  0]], dtype=int64)

Accuracy Score : 0.46153846153846156
Classification Report
              precision    recall  f1-score   support

           0       0.52      0.86      0.65        73
           1       0.33      0.21      0.26        57
           2       0.30      0.14      0.19        21
           3       0.00      0.00      0.00         9
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         3

    accuracy                           0.46       169
   macro avg       0.16      0.17      0.16       169
weighted avg       0.37      0.46      0.39       169



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
# new step
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.0436018 , 0.05561156, 0.05132991, 0.05026642, 0.04637168,
       0.05332925, 0.04462831, 0.05113935, 0.0448197 , 0.05474888,
       0.0473323 , 0.04879095, 0.05052728, 0.04616512, 0.05304745,
       0.05027975, 0.04923118, 0.06306396, 0.00574303, 0.01195205,
       0.00146061, 0.00075314, 0.00424637, 0.00878348, 0.00468799,
       0.00171006, 0.00120446, 0.01328531, 0.00099745, 0.00145812,
       0.00159548, 0.00182162, 0.00064058, 0.00104431, 0.00074837,
       0.00185547, 0.00271879, 0.00317637, 0.0008142 , 0.00013401,
       0.00078399, 0.00970109, 0.00113716, 0.00746748, 0.00012513,
       0.00093373, 0.00039563, 0.00076509, 0.00080937, 0.00115841,
       0.0016068 ])

In [95]:
# new step
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.06306395785195607, 'Win-Loss Rank'),
 (0.055611564558302656, 'Assist Turnover Ratio Rank'),
 (0.05474888028778957, 'Scoring Margin Rank'),
 (0.05332924851303204, 'Fouls Per Game Rank'),
 (0.05304745024427973, 'Three Pt FG Per Game Rank'),
 (0.051329914873656825, 'BlocksPerGame Rank'),
 (0.05113935096824249, 'Rebound Margin Rank'),
 (0.050527277783366964, 'Three Pt FG Defense Rank'),
 (0.05027975369296144, 'Turnover Margin Rank'),
 (0.05026641626976812, 'Field Goal % Rank'),
 (0.04923118046500145, 'Turnover Per Game Rank'),
 (0.04879094935638612, 'Steals Per Game Rank'),
 (0.04733229972101163, 'Scoring Offense Rank'),
 (0.04637167828296743, 'Field Goal % Defense Rank'),
 (0.046165121654881826, 'Three Pt FG % Rank'),
 (0.04481970312424682, 'Scoring Defense Rank'),
 (0.044628310526441554, 'Free Throw % Rank'),
 (0.04360179525092382, 'Assists Per. Game Rank'),
 (0.013285305956156039, 'Conference_Big Ten'),
 (0.011952051672598185, 'Conference_ACC'),
 (0.00970109164833636, 'Conference_Pa

### random oversampling

In [96]:
from collections import Counter

In [97]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)

Counter({1: 219, 4: 219, 0: 219, 2: 219, 6: 219, 3: 219, 5: 219})

In [98]:
# Logistic regression using random oversampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [99]:
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[36, 16, 10,  6,  5,  0,  0],
       [17, 13,  8,  7,  6,  4,  2],
       [ 2,  2,  5,  4,  5,  1,  2],
       [ 1,  1,  4,  1,  2,  0,  0],
       [ 0,  0,  1,  0,  2,  0,  0],
       [ 0,  0,  0,  1,  1,  1,  0],
       [ 0,  0,  1,  0,  0,  1,  1]], dtype=int64)

In [100]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.3433943632728266

In [101]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.64      0.49      0.79      0.56      0.62      0.38        73
          1       0.41      0.23      0.83      0.29      0.44      0.18        57
          2       0.17      0.24      0.84      0.20      0.45      0.19        21
          3       0.05      0.11      0.89      0.07      0.31      0.09         9
          4       0.10      0.67      0.89      0.17      0.77      0.58         3
          5       0.14      0.33      0.96      0.20      0.57      0.30         3
          6       0.20      0.33      0.98      0.25      0.57      0.30         3

avg / total       0.45      0.35      0.82      0.38      0.52      0.27       169



### SMOTE

In [102]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train_scaled, y_train
)
Counter(y_resampled)

Counter({1: 219, 4: 219, 0: 219, 2: 219, 6: 219, 3: 219, 5: 219})

In [103]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [104]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.24581675977288292

In [105]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[37, 19, 14,  2,  1,  0,  0],
       [18, 14, 11,  2,  6,  4,  2],
       [ 2,  2,  4,  5,  5,  1,  2],
       [ 1,  1,  4,  1,  2,  0,  0],
       [ 0,  1,  1,  0,  1,  0,  0],
       [ 0,  0,  0,  2,  1,  0,  0],
       [ 0,  0,  0,  0,  1,  1,  1]], dtype=int64)

In [106]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.64      0.51      0.78      0.56      0.63      0.39        73
          1       0.38      0.25      0.79      0.30      0.44      0.18        57
          2       0.12      0.19      0.80      0.15      0.39      0.14        21
          3       0.08      0.11      0.93      0.10      0.32      0.09         9
          4       0.06      0.33      0.90      0.10      0.55      0.28         3
          5       0.00      0.00      0.96      0.00      0.00      0.00         3
          6       0.20      0.33      0.98      0.25      0.57      0.30         3

avg / total       0.43      0.34      0.80      0.37      0.51      0.26       169



### Undersampling

In [107]:
# Undersample the data using `RandomUnderSampler`
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 8, 1: 8, 2: 8, 3: 8, 4: 8, 5: 8, 6: 8})

In [108]:
# Fit a Logistic regression model using random undersampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [109]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[10,  4,  7,  8, 19,  9, 16],
       [ 3,  7,  3,  7, 20,  3, 14],
       [ 1,  1,  3,  1,  5,  0, 10],
       [ 0,  1,  0,  0,  3,  1,  4],
       [ 0,  0,  0,  0,  1,  0,  2],
       [ 0,  0,  0,  0,  1,  0,  2],
       [ 0,  0,  1,  0,  1,  0,  1]], dtype=int64)

In [110]:
# Calculate the Balanced Accuracy Score
# from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.1527595897767903

In [111]:
# Print the imbalanced classification report
# from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.71      0.14      0.96      0.23      0.36      0.12        73
          1       0.54      0.12      0.95      0.20      0.34      0.11        57
          2       0.21      0.14      0.93      0.17      0.36      0.12        21
          3       0.00      0.00      0.90      0.00      0.00      0.00         9
          4       0.02      0.33      0.70      0.04      0.48      0.23         3
          5       0.00      0.00      0.92      0.00      0.00      0.00         3
          6       0.02      0.33      0.71      0.04      0.49      0.23         3

avg / total       0.52      0.13      0.94      0.19      0.33      0.11       169



### Cluster Centroid Undersampling

In [112]:
# Fit the data using `ClusterCentroids` and check the count of each class
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 8, 1: 8, 2: 8, 3: 8, 4: 8, 5: 8, 6: 8})

In [113]:
# Logistic regression using cluster centroid undersampled data
# from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=78)

In [114]:
# Display the confusion matrix
# from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[ 6,  1,  3, 23, 14,  7, 19],
       [ 3,  1,  1,  8, 20,  6, 18],
       [ 0,  0,  0,  2,  6,  2, 11],
       [ 0,  0,  0,  1,  4,  0,  4],
       [ 0,  0,  0,  0,  0,  0,  3],
       [ 0,  0,  0,  1,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  3]], dtype=int64)

In [115]:
# Calculate the balanced accuracy score
# from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.1729781073688788

In [116]:
# Print the imbalanced classification report
# from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.67      0.08      0.97      0.15      0.28      0.07        73
          1       0.50      0.02      0.99      0.03      0.13      0.02        57
          2       0.00      0.00      0.97      0.00      0.00      0.00        21
          3       0.03      0.11      0.79      0.05      0.30      0.08         9
          4       0.00      0.00      0.73      0.00      0.00      0.00         3
          5       0.00      0.00      0.91      0.00      0.00      0.00         3
          6       0.05      1.00      0.66      0.10      0.81      0.68         3

avg / total       0.46      0.07      0.96      0.08      0.20      0.05       169

