
# Logistic Regression
## 1. Import data to dataframes

In [114]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Locate the five data-section folders relative to the notebook's working directory
current_dir = os.getcwd()
data_root = os.path.join(current_dir, '..', 'data')

basics_dir_path = os.path.join(data_root, 'section_1_basics')
team_box_scores_dir_path = os.path.join(data_root, 'section_2_team_box_scores')
geography_dir_path = os.path.join(data_root, 'section_3_geography')
public_rankings_dir_path = os.path.join(data_root, 'section_4_public_rankings')
supplements_dir_path = os.path.join(data_root, 'section_5_supplements')

In [115]:
# Read every CSV in the five data sections into `dfs`, keyed by the file name
# without its extension (e.g. 'MTeams.csv' -> dfs['MTeams']).
section_dirs = [
    basics_dir_path,
    team_box_scores_dir_path,
    geography_dir_path,
    public_rankings_dir_path,
    supplements_dir_path,
]

dfs = {}
for section_dir in section_dirs:
    # os.listdir returns entries in arbitrary order; sorting keeps the key
    # order of `dfs` the same on every machine and every run.
    for filename in sorted(os.listdir(section_dir)):
        stem, extension = os.path.splitext(filename)
        if extension != ".csv":
            continue
        dfs[stem] = pd.read_csv(os.path.join(section_dir, filename))
dfs.keys()

dict_keys(['MNCAATourneyCompactResults', 'MNCAATourneySeeds', 'MRegularSeasonCompactResults', 'MSeasons', 'MTeams', 'WNCAATourneyCompactResults', 'WNCAATourneySeeds', 'WRegularSeasonCompactResults', 'WSeasons', 'WTeams', 'MNCAATourneyDetailedResults', 'MRegularSeasonDetailedResults', 'WNCAATourneyDetailedResults', 'WRegularSeasonDetailedResults', 'Cities', 'MGameCities', 'WGameCities', 'MMasseyOrdinals', 'Conferences', 'MConferenceTourneyGames', 'MNCAATourneySeedRoundSlots', 'MNCAATourneySlots', 'MSecondaryTourneyCompactResults', 'MSecondaryTourneyTeams', 'MTeamCoaches', 'MTeamConferences', 'MTeamSpellings', 'WConferenceTourneyGames', 'WNCAATourneySlots', 'WSecondaryTourneyCompactResults', 'WSecondaryTourneyTeams', 'WTeamConferences', 'WTeamSpellings'])

## 2. For this model we are going to use the data from `MRegularSeasonCompactResults.csv`

In [116]:
# Regular-season compact results: each row holds one game's winner (W*) and loser (L*) columns
games = dfs['MRegularSeasonCompactResults']
games

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0
...,...,...,...,...,...,...,...,...
192925,2025,132,1196,86,1397,77,N,0
192926,2025,132,1272,84,1412,72,N,0
192927,2025,132,1276,59,1458,53,N,0
192928,2025,132,1433,68,1206,63,N,0


## 3. Create a dataframe containing each team's regular-season game statistics for every game of every season.

In [117]:
# Winner's view of every game: its own score is PointsFor, and Win is 1
winning_stats = (
    games[['Season', 'WTeamID', 'WScore', 'LScore']]
    .rename(columns={'WTeamID': 'TeamID', 'WScore': 'PointsFor', 'LScore': 'PointsAgainst'})
    .assign(Win=1)
)

# Loser's view of the same games: the score columns trade roles, and Win is 0
losing_stats = (
    games[['Season', 'LTeamID', 'LScore', 'WScore']]
    .rename(columns={'LTeamID': 'TeamID', 'LScore': 'PointsFor', 'WScore': 'PointsAgainst'})
    .assign(Win=0)
)

# Stack both views so each game contributes one row per participating team;
# the original row labels are kept, so every label appears twice.
all_stats = pd.concat([winning_stats, losing_stats])
all_stats

Unnamed: 0,Season,TeamID,PointsFor,PointsAgainst,Win
0,1985,1228,81,64,1
1,1985,1106,77,70,1
2,1985,1112,63,56,1
3,1985,1165,70,54,1
4,1985,1192,86,74,1
...,...,...,...,...,...
192925,2025,1397,77,86,0
192926,2025,1412,72,84,0
192927,2025,1458,53,59,0
192928,2025,1206,63,68,0


#### Aggregate the data to see each team's average points scored, average points allowed, and win percentage per season

In [118]:
# Per-(Season, TeamID) summary: mean points scored, mean points conceded, and
# the share of games won (mean of the 0/1 Win flag).
# NOTE(review): these are full-season aggregates built from the same games that
# are later used as training rows, so each row's features include that game's
# own score and result. Confirm this is acceptable for the intended use.
season_aggregations = {
    'avg_points_for': pd.NamedAgg(column='PointsFor', aggfunc='mean'),
    'avg_points_against': pd.NamedAgg(column='PointsAgainst', aggfunc='mean'),
    'win_pct': pd.NamedAgg(column='Win', aggfunc='mean'),
}

team_stats = (
    all_stats
    .groupby(['Season', 'TeamID'])
    .agg(**season_aggregations)
    .reset_index()
)
team_stats

Unnamed: 0,Season,TeamID,avg_points_for,avg_points_against,win_pct
0,1985,1102,63.083333,68.875000,0.208333
1,1985,1103,61.043478,64.086957,0.391304
2,1985,1104,68.500000,60.700000,0.700000
3,1985,1106,71.625000,75.416667,0.416667
4,1985,1108,83.000000,75.040000,0.760000
...,...,...,...,...,...
13383,2025,1476,67.333333,70.900000,0.433333
13384,2025,1477,64.354839,74.870968,0.161290
13385,2025,1478,71.933333,81.400000,0.233333
13386,2025,1479,65.785714,71.750000,0.428571


## 4. Create dataframe containing the matchups and merge in data from the statistics dataframe

In [119]:
# Start from one row per game with the winner listed as Team1 and the loser as
# Team2, so Team1Won is 1 everywhere at this stage.
matchups = games[['Season', 'WTeamID', 'LTeamID']].assign(
    Team1ID=games['WTeamID'],
    Team2ID=games['LTeamID'],
    Team1Won=1,
)
matchups

Unnamed: 0,Season,WTeamID,LTeamID,Team1ID,Team2ID,Team1Won
0,1985,1228,1328,1228,1328,1
1,1985,1106,1354,1106,1354,1
2,1985,1112,1223,1112,1223,1
3,1985,1165,1432,1165,1432,1
4,1985,1192,1447,1192,1447,1
...,...,...,...,...,...,...
192925,2025,1196,1397,1196,1397,1
192926,2025,1272,1412,1272,1412,1
192927,2025,1276,1458,1276,1458,1
192928,2025,1433,1206,1433,1206,1


#### Merge in team stats for Team1

In [121]:
# Attach Team1's season averages and win rate to each matchup as Team1_* columns.
# Renaming before the merge lets the join key line up with Team1ID directly, so
# no helper TeamID column has to be dropped afterwards.
team1_stats = team_stats.rename(columns={
    'TeamID': 'Team1ID',
    'avg_points_for': 'Team1_avg_points_for',
    'avg_points_against': 'Team1_avg_points_against',
    'win_pct': 'Team1_win_pct'
})

# This cell reassigns `matchups`, so drop any Team1_* columns left by an earlier
# run; otherwise re-running the cell would append a second copy of each column.
team1_feature_cols = [col for col in team1_stats.columns if col.startswith('Team1_')]

matchups = (
    matchups
    .drop(columns=team1_feature_cols, errors='ignore')
    # many_to_one raises if team_stats ever holds duplicate (Season, TeamID)
    # keys, which would silently multiply matchup rows.
    .merge(team1_stats, how='left', on=['Season', 'Team1ID'], validate='many_to_one')
)

#### Merge in stats for Team2

In [123]:
# Attach Team2's season averages and win rate to each matchup as Team2_* columns.
# Renaming before the merge lets the join key line up with Team2ID directly, so
# no helper TeamID column has to be dropped afterwards.
team2_stats = team_stats.rename(columns={
    'TeamID': 'Team2ID',
    'avg_points_for': 'Team2_avg_points_for',
    'avg_points_against': 'Team2_avg_points_against',
    'win_pct': 'Team2_win_pct'
})

# This cell reassigns `matchups`, so drop any Team2_* columns left by an earlier
# run; otherwise re-running the cell would append a second copy of each column.
team2_feature_cols = [col for col in team2_stats.columns if col.startswith('Team2_')]

matchups = (
    matchups
    .drop(columns=team2_feature_cols, errors='ignore')
    # many_to_one raises if team_stats ever holds duplicate (Season, TeamID)
    # keys, which would silently multiply matchup rows.
    .merge(team2_stats, how='left', on=['Season', 'Team2ID'], validate='many_to_one')
)

#### Randomly swap Team1 and Team2 so that the `Team1Won` target contains both classes

In [124]:
np.random.seed(42)  # for reproducibility

# Target orientation per row: True means the loser should be listed as Team1
swap_mask = np.random.rand(len(matchups)) < 0.5

# Rows that are already loser-first. On a fresh run none are; after this cell
# has run once, these are exactly the rows it swapped.
already_swapped = (matchups['Team1ID'] != matchups['WTeamID']).to_numpy()

# Flip only rows whose current orientation differs from the target. Swapping
# unconditionally would undo the swap when the cell is re-run while the label
# below still marked those rows as swapped.
flip_mask = swap_mask ^ already_swapped

# Every Team1/Team2 column pair has to move together with the IDs
column_pairs = [('Team1ID', 'Team2ID')] + [
    (f'Team1_{feature}', f'Team2_{feature}')
    for feature in ['avg_points_for', 'avg_points_against', 'win_pct']
]
for team1_col, team2_col in column_pairs:
    matchups.loc[flip_mask, [team1_col, team2_col]] = (
        matchups.loc[flip_mask, [team2_col, team1_col]].values
    )

# Target: 1 when Team1 is the game's winner, 0 when it is the loser. Reading it
# from the IDs ties the label to the data actually stored in each row.
matchups['Team1Won'] = (matchups['Team1ID'] == matchups['WTeamID']).astype(int)

## 5. Build feature matrix and labels

In [125]:
# Model inputs: the three season statistics of each side, Team1 first
team1_feature_cols = ['Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct']
team2_feature_cols = ['Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct']
feature_cols = team1_feature_cols + team2_feature_cols

# X holds the six features per matchup; y is the 0/1 Team1Won target
X = matchups[feature_cols]
y = matchups['Team1Won']

#### Train/test split

In [126]:
# Hold out 20% of the matchups for testing; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 6. Scale features

In [127]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 7. Train the model

In [129]:
# Fit a logistic-regression classifier, constructed with no arguments, on the
# scaled training features; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train_scaled, y_train)
model

## 8. Evaluate

In [130]:

# Predict the held-out matchups and report the fraction classified correctly
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Test Set Accuracy: {accuracy:.3f}")

Test Set Accuracy: 0.742
