 # Logistic Regression
 ## 1. Import data to dataframes

In [81]:

import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# import the data
current_dir = os.getcwd()
basics_dir_path = os.path.join(current_dir, '..', 'data', 'section_1_basics')
team_box_scores_dir_path = os.path.join(current_dir, '..', 'data', 'section_2_team_box_scores')
geography_dir_path = os.path.join(current_dir, '..', 'data', 'section_3_geography')
public_rankings_dir_path = os.path.join(current_dir, '..', 'data', 'section_4_public_rankings')
supplements_dir_path = os.path.join(current_dir, '..','data', 'section_5_supplements')

In [82]:
dfs = {}
for path in [basics_dir_path, team_box_scores_dir_path, geography_dir_path, public_rankings_dir_path, supplements_dir_path]:
  for filename in os.listdir(path):
    if filename.endswith(".csv"):
      filepath = os.path.join(path, filename)
      df_name = filename[:-4]  # Remove the .csv extension
      dfs[df_name] = pd.read_csv(filepath)
dfs.keys()

dict_keys(['MNCAATourneyCompactResults', 'MNCAATourneySeeds', 'MRegularSeasonCompactResults', 'MSeasons', 'MTeams', 'WNCAATourneyCompactResults', 'WNCAATourneySeeds', 'WRegularSeasonCompactResults', 'WSeasons', 'WTeams', 'MNCAATourneyDetailedResults', 'MRegularSeasonDetailedResults', 'WNCAATourneyDetailedResults', 'WRegularSeasonDetailedResults', 'Cities', 'MGameCities', 'WGameCities', 'MMasseyOrdinals', 'Conferences', 'MConferenceTourneyGames', 'MNCAATourneySeedRoundSlots', 'MNCAATourneySlots', 'MSecondaryTourneyCompactResults', 'MSecondaryTourneyTeams', 'MTeamCoaches', 'MTeamConferences', 'MTeamSpellings', 'WConferenceTourneyGames', 'WNCAATourneySlots', 'WSecondaryTourneyCompactResults', 'WSecondaryTourneyTeams', 'WTeamConferences', 'WTeamSpellings'])

 ## 2. For this model we are going to use the data from `MRegularSeasonCompactResults.csv`

In [83]:
games = dfs['MRegularSeasonCompactResults']
games.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


 ## 3. Create a dataframe containing the regular game statistics for each match of each season.

In [84]:
winning_stats = games[['Season', 'WTeamID', 'WScore', 'LScore']].rename(
    columns={'WTeamID': 'TeamID', 'WScore': 'PointsFor', 'LScore': 'PointsAgainst'}
)
winning_stats['Win'] = 1

losing_stats = games[['Season', 'LTeamID', 'LScore', 'WScore']].rename(
    columns={'LTeamID': 'TeamID', 'LScore': 'PointsFor', 'WScore': 'PointsAgainst'}
)
losing_stats['Win'] = 0

all_stats = pd.concat([winning_stats, losing_stats])
all_stats.head()

Unnamed: 0,Season,TeamID,PointsFor,PointsAgainst,Win
0,1985,1228,81,64,1
1,1985,1106,77,70,1
2,1985,1112,63,56,1
3,1985,1165,70,54,1
4,1985,1192,86,74,1




 #### Aggregate the data see the average points scored, average points scored against, and win percentage

In [85]:
team_stats = all_stats.groupby(['Season', 'TeamID']).agg(
    avg_points_for=('PointsFor', 'mean'),
    avg_points_against=('PointsAgainst', 'mean'),
    win_pct=('Win', 'mean')
).reset_index()
team_stats.head()

Unnamed: 0,Season,TeamID,avg_points_for,avg_points_against,win_pct
0,1985,1102,63.083333,68.875,0.208333
1,1985,1103,61.043478,64.086957,0.391304
2,1985,1104,68.5,60.7,0.7
3,1985,1106,71.625,75.416667,0.416667
4,1985,1108,83.0,75.04,0.76


 ## 4. Create dataframe containing the matchups and merge in data from the statistics dataframe

In [86]:

matchups = games[['Season', 'WTeamID', 'LTeamID']].copy()
matchups['Team1ID'] = matchups['WTeamID']
matchups['Team2ID'] = matchups['LTeamID']
matchups['Team1Won'] = 1  # Because WTeamID is the winner in your dataset
matchups.head()

Unnamed: 0,Season,WTeamID,LTeamID,Team1ID,Team2ID,Team1Won
0,1985,1228,1328,1228,1328,1
1,1985,1106,1354,1106,1354,1
2,1985,1112,1223,1112,1223,1
3,1985,1165,1432,1165,1432,1
4,1985,1192,1447,1192,1447,1



 #### Merge in team stats for Team1

In [87]:

matchups = matchups.merge(
    team_stats, how='left',
    left_on=['Season', 'Team1ID'],
    right_on=['Season', 'TeamID']
)
matchups = matchups.rename(columns={
    'avg_points_for': 'Team1_avg_points_for',
    'avg_points_against': 'Team1_avg_points_against',
    'win_pct': 'Team1_win_pct'
})
matchups = matchups.drop(columns=['TeamID'])


 #### Merge in stats for Team2

In [88]:
# merge stats for Team2
matchups = matchups.merge(
    team_stats, how='left',
    left_on=['Season', 'Team2ID'],
    right_on=['Season', 'TeamID']
)
matchups = matchups.rename(columns={
    'avg_points_for': 'Team2_avg_points_for',
    'avg_points_against': 'Team2_avg_points_against',
    'win_pct': 'Team2_win_pct'
})
matchups = matchups.drop(columns=['TeamID'])


 #### Randomly swap Team1 and Team2 to create 2 classes for the 'Team1' column

In [89]:

np.random.seed(42)  # for reproducibility

# Create a random boolean array: True means "swap"
swap_mask = np.random.rand(len(matchups)) < 0.5

# Swap team IDs by matching the mask
matchups.loc[swap_mask, ['Team1ID', 'Team2ID']] = matchups.loc[swap_mask, ['Team2ID', 'Team1ID']].values
for feature in ['avg_points_for', 'avg_points_against', 'win_pct']:
    team1_feature = f'Team1_{feature}'
    team2_feature = f'Team2_{feature}'
    matchups.loc[swap_mask, [team1_feature, team2_feature]] = matchups.loc[swap_mask, [team2_feature, team1_feature]].values

# Set the target: 1 if original Team1 won, 0 if swapped
matchups['Team1Won'] = (~swap_mask).astype(int)


 ## 5. Build feature matrix and labels

In [90]:

feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct',
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct'
]

X = matchups[feature_cols]
y = matchups['Team1Won']


 #### Train/test split

In [91]:
# Step 5: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


 ## 6. Scale features

In [92]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


 ## 7. Train the model

In [93]:
# Step 7: Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)



 ## 8. Evaluate
 This model gives an accuracy score of 74.2%.

In [94]:
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {accuracy:.3f}")

Test Set Accuracy: 0.742


 ## 9. Add margin of victory to team stats to improve the model
 The margin of victory is the difference between the points scored by the winning team and the points scored by the losing team.
 This is a common feature used in sports analytics to predict the outcome of games.
 The margin of victory is a good predictor of the outcome of a game because it takes into account the strength of both teams.
 A team that wins by a large margin is likely to be stronger than a team that wins by a small margin.

 To do this, we will add the margin of victory to the team stats dataframe and then re-run the model.

In [95]:
# Combine winning and losing stats into a single dataframe
winning_stats = games[['Season', 'WTeamID', 'WScore', 'LScore']].rename(
    columns={'WTeamID': 'TeamID', 'WScore': 'PointsFor', 'LScore': 'PointsAgainst'}
)
winning_stats['Win'] = 1

losing_stats = games[['Season', 'LTeamID', 'LScore', 'WScore']].rename(
    columns={'LTeamID': 'TeamID', 'LScore': 'PointsFor', 'WScore': 'PointsAgainst'}
)
losing_stats['Win'] = 0

all_stats = pd.concat([winning_stats, losing_stats])

# Compute margin of victory per game
all_stats['MarginOfVictory'] = all_stats['PointsFor'] - all_stats['PointsAgainst']  

# Aggregate by season and team
team_stats = all_stats.groupby(['Season', 'TeamID']).agg(
    avg_points_for=('PointsFor', 'mean'),
    avg_points_against=('PointsAgainst', 'mean'),
    win_pct=('Win', 'mean'),
    avg_margin_of_victory=('MarginOfVictory', 'mean')  # Add margin of victory
).reset_index()


 #### Build matchups, merge in the new stats for both teams

In [96]:
# Merge in stats for Team1
matchups = matchups.merge(
    team_stats, how='left',
    left_on=['Season', 'Team1ID'],
    right_on=['Season', 'TeamID']
)
matchups = matchups.rename(columns={
    'avg_points_for': 'Team1_avg_points_for',
    'avg_points_against': 'Team1_avg_points_against',
    'win_pct': 'Team1_win_pct',
    'avg_margin_of_victory': 'Team1_avg_margin_of_victory'  # Add margin of victory
})
matchups = matchups.drop(columns=['TeamID'])

# Merge in stats for Team2
matchups = matchups.merge(
    team_stats, how='left',
    left_on=['Season', 'Team2ID'],
    right_on=['Season', 'TeamID']
)
matchups = matchups.rename(columns={
    'avg_points_for': 'Team2_avg_points_for',
    'avg_points_against': 'Team2_avg_points_against',
    'win_pct': 'Team2_win_pct',
    'avg_margin_of_victory': 'Team2_avg_margin_of_victory'  # Add margin of victory
})
matchups = matchups.drop(columns=['TeamID'])

In [97]:
# Update feature columns to include margin of victory
feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 'Team1_avg_margin_of_victory',
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 'Team2_avg_margin_of_victory'
]

In [98]:
# Build feature matrix and labels
X = matchups[feature_cols]
y = matchups['Team1Won']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
# Evaluate
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy with Margin of Victory: {accuracy:.3f}")

Test Set Accuracy with Margin of Victory: 0.742



 The model with the margin of victory feature has an accuracy of 74.2%.
 This is the same performance for the model without the margin of victory feature.

 The margin of victory feature does not improve the model's performance.


 ## 10. Add Ranking Features
 We are going to use data from this file `MMasseyOrdinals.csv` to add ranking features to the model.
 This file contains the rankings of each team for each season. We will use the rankings to create new features for the model.
 The features we will create are:
 - Team1Rank: The ranking of Team1 for the season
 - Team2Rank: The ranking of Team2 for the season
 - Team1RankDiff: The difference between the rankings of Team1 and Team2
 - Team1RankDiffAbs: The absolute difference between the rankings of Team1 and Team2
 - Team1RankDiffPct: The percentage difference between the rankings of Team1 and Team2
 - Team1RankDiffPctAbs: The absolute percentage difference between the rankings of Team1 and Team2

In [99]:
rankings = dfs['MMasseyOrdinals']

In [100]:
# keep the columns we need
rankings = rankings[['Season', 'TeamID', 'OrdinalRank']]

In [101]:
# keep the average ranking by season for each team
rankings = rankings.groupby(['Season', 'TeamID']).agg(
    OrdinalRank=('OrdinalRank', 'mean')
).reset_index()


 #### Merge in rankings for Team1

In [102]:
matchups = matchups.merge(
    rankings, how='left',
    left_on=['Season', 'Team1ID'],
    right_on=['Season', 'TeamID']
)

In [103]:
# drop TeamID from the matchups dataframe
matchups = matchups.drop(columns=['TeamID'])

In [104]:
# rename the column of the matchups dataframe for Team1Rank
matchups = matchups.rename(columns={'OrdinalRank': 'Team1Rank'})



 #### Merge in rankings for Team2

In [105]:
matchups = matchups.merge(
    rankings, how='left',
    left_on=['Season', 'Team2ID'],
    right_on=['Season', 'TeamID']
)

In [106]:
# drop TeamID from the matchups dataframe and rename column for Team2Rank
matchups = matchups.drop(columns=['TeamID'])
matchups = matchups.rename(columns={'OrdinalRank': 'Team2Rank'})



 #### Check for missing values in the rankings

In [107]:
matchups.isna().sum()

Season                             0
WTeamID                            0
LTeamID                            0
Team1ID                            0
Team2ID                            0
Team1Won                           0
Team1_avg_points_for               0
Team1_avg_points_against           0
Team1_win_pct                      0
Team2_avg_points_for               0
Team2_avg_points_against           0
Team2_win_pct                      0
Team1_avg_points_for               0
Team1_avg_points_against           0
Team1_win_pct                      0
Team1_avg_margin_of_victory        0
Team2_avg_points_for               0
Team2_avg_points_against           0
Team2_win_pct                      0
Team2_avg_margin_of_victory        0
Team1Rank                      74048
Team2Rank                      74048
dtype: int64

In [108]:
# drop records with missing values in the rankings
matchups = matchups.dropna(subset=['Team1Rank', 'Team2Rank'])

In [109]:
# Calculate ranking differences
matchups['Team1RankDiff'] = matchups['Team1Rank'] - matchups['Team2Rank']
matchups['Team1RankDiffAbs'] = matchups['Team1RankDiff'].abs()
matchups['Team1RankDiffPct'] = matchups['Team1RankDiff'] / matchups['Team2Rank']
matchups['Team1RankDiffPctAbs'] = matchups['Team1RankDiffPct'].abs()
matchups

Unnamed: 0,Season,WTeamID,LTeamID,Team1ID,Team2ID,Team1Won,Team1_avg_points_for,Team1_avg_points_against,Team1_win_pct,Team2_avg_points_for,...,Team2_avg_points_for.1,Team2_avg_points_against,Team2_win_pct,Team2_avg_margin_of_victory,Team1Rank,Team2Rank,Team1RankDiff,Team1RankDiffAbs,Team1RankDiffPct,Team1RankDiffPctAbs
74048,2003,1104,1328,1328,1104,0,71.166667,60.166667,0.800000,69.285714,...,69.285714,65.000000,0.607143,4.285714,15.730233,27.655502,-11.925270,11.925270,-0.431208,0.431208
74049,2003,1272,1393,1272,1393,1,74.517241,65.827586,0.793103,80.103448,...,80.103448,69.896552,0.827586,10.206897,42.000000,25.596154,16.403846,16.403846,0.640872,0.640872
74050,2003,1266,1437,1266,1437,1,78.392857,67.678571,0.821429,72.200000,...,72.200000,70.100000,0.500000,2.100000,18.967442,60.385000,-41.417558,41.417558,-0.685891,0.685891
74051,2003,1296,1457,1296,1457,1,69.612903,69.806452,0.548387,69.428571,...,69.428571,66.392857,0.642857,3.035714,147.512500,209.452500,-61.940000,61.940000,-0.295723,0.295723
74052,2003,1400,1208,1208,1400,0,79.185185,73.185185,0.703704,78.857143,...,78.857143,68.678571,0.785714,10.178571,19.261283,9.416279,9.845004,9.845004,1.045530,1.045530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192925,2025,1196,1397,1196,1397,1,85.411765,69.235294,0.882353,74.735294,...,74.735294,63.029412,0.794118,11.705882,6.797267,4.831435,1.965831,1.965831,0.406884,0.406884
192926,2025,1272,1412,1272,1412,1,80.147059,73.235294,0.852941,81.727273,...,81.727273,76.606061,0.636364,5.121212,30.122685,136.150485,-106.027800,106.027800,-0.778754,0.778754
192927,2025,1276,1458,1276,1458,1,78.264706,71.235294,0.735294,79.714286,...,79.714286,70.428571,0.742857,9.285714,24.207898,20.290993,3.916905,3.916905,0.193037,0.193037
192928,2025,1433,1206,1206,1433,0,70.030303,63.545455,0.757576,76.333333,...,76.333333,62.515152,0.818182,13.818182,81.258495,51.837576,29.420919,29.420919,0.567560,0.567560


In [110]:
# Update feature columns to include ranking features
feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 'Team1_avg_margin_of_victory',
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 'Team2_avg_margin_of_victory',
    'Team1Rank', 'Team2Rank', 'Team1RankDiff', 'Team1RankDiffAbs', 'Team1RankDiffPct', 'Team1RankDiffPctAbs'
]

In [111]:
# Build feature matrix and labels
X = matchups[feature_cols]
y = matchups['Team1Won']

In [112]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [113]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [114]:
# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [115]:
# Evaluate
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy with rankings: {accuracy:.3f}")

Test Set Accuracy with rankings: 0.758


 ## 11. Conclusion

 The model with the margin of victory and rankings features has an accuracy of 75.8%.
 This is an improvement over the model without these features, which had an accuracy of 74.2%.

 The model with the margin of victory and rankings features is a better predictor of the outcome of a game than the model without these features.