In [63]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from xgboost import XGBClassifier

In [64]:
# Load dataset
data = pd.read_excel('Resources/Dataset.xlsx', engine='openpyxl')
data.head()

Unnamed: 0,Team,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,GSW,GSW vs. PHX,10/24/2023,L,240,104,36,101,35.6,10,...,78.6,18,31,49,19,11,6,11,23,-4
1,PHX,PHX @ GSW,10/24/2023,W,240,108,42,95,44.2,11,...,76.5,17,43,60,23,5,7,19,22,4
2,LAL,LAL @ DEN,10/24/2023,L,240,107,41,90,45.6,10,...,75.0,13,31,44,23,5,4,12,18,-12
3,DEN,DEN vs. LAL,10/24/2023,W,240,119,48,91,52.7,14,...,75.0,9,33,42,29,9,6,12,15,12
4,MEM,MEM vs. NOP,10/25/2023,L,240,104,38,91,41.8,12,...,80.0,8,29,37,23,8,7,13,19,-7


Manipulate Data:

In [65]:
# Convert game date to datetime
data['Game Date'] = pd.to_datetime(data['Game Date'], format='%m/%d/%Y')

# Extract home and guest teams
data['Home Team'] = data['Match Up'].apply(lambda x: x.split(' vs. ')[0] if 'vs.' in x else x.split(' @ ')[1])
data['Guest Team'] = data['Match Up'].apply(lambda x: x.split(' vs. ')[1] if 'vs.' in x else x.split(' @ ')[0])

# Add the correct Label column: 1 if home team won, 0 otherwise
data['Label'] = data.apply(
    lambda row: 1 if ((row['Team'] == row['Home Team']) and (row['W/L'] == 'W')) 
                    or ((row['Team'] == row['Guest Team']) and (row['W/L'] == 'L'))
                else 0,
    axis=1
)
data.head()

Unnamed: 0,Team,Match Up,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,REB,AST,STL,BLK,TOV,PF,+/-,Home Team,Guest Team,Label
0,GSW,GSW vs. PHX,2023-10-24,L,240,104,36,101,35.6,10,...,49,19,11,6,11,23,-4,GSW,PHX,0
1,PHX,PHX @ GSW,2023-10-24,W,240,108,42,95,44.2,11,...,60,23,5,7,19,22,4,GSW,PHX,0
2,LAL,LAL @ DEN,2023-10-24,L,240,107,41,90,45.6,10,...,44,23,5,4,12,18,-12,DEN,LAL,1
3,DEN,DEN vs. LAL,2023-10-24,W,240,119,48,91,52.7,14,...,42,29,9,6,12,15,12,DEN,LAL,1
4,MEM,MEM vs. NOP,2023-10-25,L,240,104,38,91,41.8,12,...,37,23,8,7,13,19,-7,MEM,NOP,0


Calculating bias for later adding Home Advatange feature:

But don't use in the end because we've already have half of the data to make home advantage's information is in the model.

In [66]:
# Create a binary feature for home games
data['Is_Home'] = data['Match Up'].str.contains(' vs. ').astype(int)

# Prepare features and labels
X = data[['Is_Home']]  # Home/Away indicator
y = (data['W/L'] == 'W').astype(int)  # Convert W/L to binary outcome

# Fit a linear regression model
model = LinearRegression()
model.fit(X, y)

# The coefficient of 'Is_Home' represents the home advantage bias
bias = model.coef_[0]
print(f"Calculated Bias (Regression): {bias}")
new_bias = 1/21 * bias

# Remove the 'Is_Home' column
data = data.drop(columns=['Is_Home'])

Calculated Bias (Regression): 0.08617886178861786


This df below only contains stability features. Here is how this feature were computed:

stability score of the feature = mean performance of the feature / variance of the feature

stability score of the team = average of the stability score for each features

difference in average stability score (what we finally used) = stability score of home team - stability score of guest team

In [67]:
# dataset with new feature: stability score
# Initialize the new dataset
new_dataset = []

# Iterate over each match-up
for _, row in data.iterrows():
    game_date = row['Game Date']
    home_team = row['Home Team']
    guest_team = row['Guest Team']
    
    # Filter games before the current game date for both teams
    home_team_games = data[(data['Team'] == home_team) & (data['Game Date'] < game_date)]
    guest_team_games = data[(data['Team'] == guest_team) & (data['Game Date'] < game_date)]
    
    # Calculate W/L rates (win rates) for both teams
    home_team_wins = (home_team_games['W/L'] == 'W').sum()
    home_team_total = len(home_team_games)
    guest_team_wins = (guest_team_games['W/L'] == 'W').sum()
    guest_team_total = len(guest_team_games)
    
    home_win_rate = home_team_wins / home_team_total if home_team_total > 0 else 0
    guest_win_rate = guest_team_wins / guest_team_total if guest_team_total > 0 else 0
    
    # Calculate W/L difference
    wl_difference = home_win_rate - guest_win_rate
    
    # Calculate Stability using mean and variance
    if not home_team_games.empty:
        home_mean_performance = home_team_games.iloc[:, 5:].mean(numeric_only=True)
        home_variance_performance = home_team_games.iloc[:, 5:].var(numeric_only=True)
        home_stability = (home_mean_performance / (home_variance_performance + 1e-6)).mean()
        home_stability = min(max(home_stability, -100), 100)
    else:
        home_stability = 0

    if not guest_team_games.empty:
        guest_mean_performance = guest_team_games.iloc[:, 5:].mean(numeric_only=True)
        guest_variance_performance = guest_team_games.iloc[:, 5:].var(numeric_only=True)
        guest_stability = (guest_mean_performance / (guest_variance_performance + 1e-6)).mean()
        guest_stability = min(max(guest_stability, -100), 100)  # Limit stability within [-100, 100]
    else:
        guest_stability = 0

    # Stability difference
    stability = home_stability - guest_stability
    
    # Prepare the new row
    new_row = {
        'Game Date': game_date,
        'Home Team': home_team,
        'Guest Team': guest_team,
        'Label': int(row['Label']),  # Ensure Label is directly copied and kept as integer
        'W/L Difference': wl_difference,
        'Stability': stability
    }
    
    # Add statistics differences (like in the original approach)
    home_team_stats = home_team_games.iloc[:, 4:].mean(numeric_only=True)
    guest_team_stats = guest_team_games.iloc[:, 4:].mean(numeric_only=True)
    
    if not home_team_stats.empty and not guest_team_stats.empty:
        stats_diff = home_team_stats - guest_team_stats
        # Update new_row but ensure no conflict with 'Label'
        new_row.update({k: v for k, v in stats_diff.to_dict().items() if k != 'Label'})
    
    new_dataset.append(new_row)
# Convert to DataFrame
new_dataset_df_1 = pd.DataFrame(new_dataset)

# Save to file for inspection
output_file = 'Resources/Dataset_With_Stability.xlsx'
new_dataset_df_1.to_excel(output_file, index=False)

output_file

'Resources/Dataset_With_Stability.xlsx'

This fucntion and generated df below contain stability feature, Previous Competitions and Home-Away Win Rate Difference.

Home-Away Win Rate Difference: I took the home team's past home win percentage minus the away team's past away win percentage;

Previous Competitions: Just like the discreption in guideline.

In [68]:
def standardize_match(row):
    teams = sorted([row['Home Team'], row['Guest Team']])
    standardized_row = {
        'Standard Home Team': teams[0],
        'Standard Guest Team': teams[1],
        'Game Date': row['Game Date'],
        'W/L': 'W' if (row['Home Team'] == teams[0] and row['W/L'] == 'W') or
                       (row['Guest Team'] == teams[0] and row['W/L'] == 'L') else 'L'
    }
    return standardized_row

This code below doesn't work. I try to add some weights when I calculating the average statistics, but it failed... You may try to revise it?

Always run the below chunck to make sure get a clean dataset (without Na and duplicates).

In [69]:
# Remove rows with NaN values and drop duplicate rows
dataset_df_1 = new_dataset_df_1.dropna().drop_duplicates()

# Save to file for inspection
output_file = 'Resources/cleaned_final_dataset_with_features.xlsx'
dataset_df_1.to_excel(output_file, index=False)
output_file

'Resources/cleaned_final_dataset_with_features.xlsx'

In [70]:
X = dataset_df_1.drop(columns=['Label', 'Game Date', 'Home Team', 'Guest Team']) # Features: all columns except 'Label'
y = dataset_df_1['Label']  # Labels: the 'Label' column
X = X[100:]
y = y[100:] # remove first 100 rows to make sure the stability are a lot 0.
X.head()

Unnamed: 0,W/L Difference,Stability,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,...,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
265,0.133333,-0.322542,5.555556,-1.011111,-0.5,2.577778,-1.792222,0.088889,1.611111,-1.321111,...,0.933333,-0.988889,-4.322222,-5.311111,-4.844444,3.622222,-1.166667,-5.811111,-6.111111,-1.2
266,0.208333,0.206359,0.0,-7.375,-5.319444,-8.291667,-1.261111,-4.902778,-10.902778,-2.429167,...,9.222222,0.875,3.013889,3.888889,-5.291667,1.930556,1.444444,0.791667,-0.402778,6.805556
267,-0.263889,0.06608,3.125,-4.5,-4.055556,-2.777778,-3.176389,0.638889,0.652778,1.0875,...,3.277778,-1.319444,-0.666667,-1.986111,-4.0,-1.125,-3.069444,-0.319444,3.861111,-4.5
268,-0.333333,-0.753842,0.0,-15.333333,-3.777778,-0.777778,-3.622222,-6.444444,-10.333333,-6.777778,...,-1.888889,1.0,1.0,2.0,-3.0,0.777778,0.222222,1.111111,1.777778,-12.444444
269,-0.15,-0.535714,-3.125,2.025,-0.925,3.95,-3.44,2.425,8.2,-1.415,...,1.325,3.775,-4.7,-0.925,2.55,0.025,-2.025,-1.625,1.65,-6.625


Using data standardization and L1 regularization to complete Feature Selection.

In [71]:
# Step 1: Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)  # Standardize X to have zero mean and unit variance

# Step 2: Fit Logistic Regression with L1 regularization for binary model
lasso_log_reg = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)  # L1 penalty for feature selection
lasso_log_reg.fit(X_standardized, y)

# Step 3: Identify selected features
selected_features = X.columns[(lasso_log_reg.coef_ != 0).flatten()]  # Keep only features with non-zero coefficients
X_selected = X[selected_features]  # Subset original X with selected features

print(f"Selected features: {list(selected_features)}")
print(X_selected.head())

Selected features: ['W/L Difference', 'Stability', 'MIN', 'FGA', 'FG%', '3PA', 'FTM', 'FTA', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'PF', '+/-']
     W/L Difference  Stability       MIN       FGA       FG%        3PA  \
265        0.133333  -0.322542  5.555556  2.577778 -1.792222   1.611111   
266        0.208333   0.206359  0.000000 -8.291667 -1.261111 -10.902778   
267       -0.263889   0.066080  3.125000 -2.777778 -3.176389   0.652778   
268       -0.333333  -0.753842  0.000000 -0.777778 -3.622222 -10.333333   
269       -0.150000  -0.535714 -3.125000  3.950000 -3.440000   8.200000   

          FTM       FTA      DREB       REB       AST       STL       BLK  \
265 -0.100000  0.933333 -4.322222 -5.311111 -4.844444  3.622222 -1.166667   
266  8.166667  9.222222  3.013889  3.888889 -5.291667  1.930556  1.444444   
267  2.972222  3.277778 -0.666667 -1.986111 -4.000000 -1.125000 -3.069444   
268 -1.333333 -1.888889  1.000000  2.000000 -3.000000  0.777778  0.222222   
269  1.450000  1.32500

Training & Testing using 5-fold cv (including Random Forest, Logistic Regression, Decision Tree, AdaBoost, and QDA):

The accuracy of logistic regression is the highest which is about 0.66 for either 1 new feature or 3 new features. So we imporved 0.2 in total...

In [72]:
from sklearn.svm import SVC

models = {
    'Random Forest': RandomForestClassifier(random_state=0),
    'Logistic Regression': LogisticRegression(random_state=0, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'AdaBoost': AdaBoostClassifier(random_state=0),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=0),
    'SVM (Linear)': SVC(kernel='linear', C=1, random_state=0),
    'SVM (RBF)': SVC(kernel='rbf', C=1, gamma='scale', random_state=0)
}

In [75]:
# Ensure data is shuffled
X_shuffled, y_shuffled = shuffle(X_selected, y, random_state=0)

# Define K-Fold
kf = KFold(n_splits=12, shuffle=True, random_state=0)

# Initialize a dictionary to store accuracies
model_accuracies = {model_name: [] for model_name in models}

# Perform K-Fold Cross-Validation
for train_index, test_index in kf.split(X_shuffled):
    X_train, X_test = X_shuffled.iloc[train_index], X_shuffled.iloc[test_index]
    y_train, y_test = y_shuffled.iloc[train_index], y_shuffled.iloc[test_index]
    
    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        # Predict on the test set
        y_pred = model.predict(X_test)
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        model_accuracies[model_name].append(accuracy)

# Print Mean Accuracy for Each Model
for model_name, accuracies in model_accuracies.items():
    print(f"{model_name} Accuracy for each fold: {accuracies}")
    print(f"{model_name} Mean accuracy: {sum(accuracies) / len(accuracies):.2f}")


Random Forest Accuracy for each fold: [0.6195652173913043, 0.6521739130434783, 0.6304347826086957, 0.6304347826086957, 0.5760869565217391, 0.6483516483516484, 0.7032967032967034, 0.6483516483516484, 0.6043956043956044, 0.6703296703296703, 0.6813186813186813, 0.5714285714285714]
Random Forest Mean accuracy: 0.64
Logistic Regression Accuracy for each fold: [0.6630434782608695, 0.6413043478260869, 0.6413043478260869, 0.6521739130434783, 0.6630434782608695, 0.6483516483516484, 0.6593406593406593, 0.6153846153846154, 0.6483516483516484, 0.6593406593406593, 0.6813186813186813, 0.6593406593406593]
Logistic Regression Mean accuracy: 0.65
Decision Tree Accuracy for each fold: [0.5, 0.5978260869565217, 0.5869565217391305, 0.45652173913043476, 0.5652173913043478, 0.5934065934065934, 0.6923076923076923, 0.5934065934065934, 0.5604395604395604, 0.5494505494505495, 0.5714285714285714, 0.6593406593406593]
Decision Tree Mean accuracy: 0.58
AdaBoost Accuracy for each fold: [0.6521739130434783, 0.6630434

The code below is about a neural network without using k-fold.

The accuracy of data with only stability is about 0.66.

The accuracy of data with stability, previous competitionm and Home-Away Win Rate Difference is also about 0.66.