 # Logistic Regression
 ## 1. Import data to dataframes

In [237]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def load_data():
    # import the data
    current_dir = os.getcwd()
    basics_dir_path = os.path.join(current_dir, '..', 'data', 'section_1_basics')
    team_box_scores_dir_path = os.path.join(current_dir, '..', 'data', 'section_2_team_box_scores')
    geography_dir_path = os.path.join(current_dir, '..', 'data', 'section_3_geography')
    public_rankings_dir_path = os.path.join(current_dir, '..', 'data', 'section_4_public_rankings')
    supplements_dir_path = os.path.join(current_dir, '..','data', 'section_5_supplements')

    dfs = {}
    for path in [basics_dir_path, team_box_scores_dir_path, geography_dir_path, public_rankings_dir_path, supplements_dir_path]:
        for filename in os.listdir(path):
            if filename.endswith(".csv"):
                filepath = os.path.join(path, filename)
                df_name = filename[:-4]  # Remove the .csv extension
                dfs[df_name] = pd.read_csv(filepath)
    return dfs  
dfs = load_data()
dfs.keys()

dict_keys(['MNCAATourneyCompactResults', 'MNCAATourneySeeds', 'MRegularSeasonCompactResults', 'MSeasons', 'MTeams', 'WNCAATourneyCompactResults', 'WNCAATourneySeeds', 'WRegularSeasonCompactResults', 'WSeasons', 'WTeams', 'MNCAATourneyDetailedResults', 'MRegularSeasonDetailedResults', 'WNCAATourneyDetailedResults', 'WRegularSeasonDetailedResults', 'Cities', 'MGameCities', 'WGameCities', 'MMasseyOrdinals', 'Conferences', 'MConferenceTourneyGames', 'MNCAATourneySeedRoundSlots', 'MNCAATourneySlots', 'MSecondaryTourneyCompactResults', 'MSecondaryTourneyTeams', 'MTeamCoaches', 'MTeamConferences', 'MTeamSpellings', 'WConferenceTourneyGames', 'WNCAATourneySlots', 'WSecondaryTourneyCompactResults', 'WSecondaryTourneyTeams', 'WTeamConferences', 'WTeamSpellings'])

 ## 2. For this model we are going to use the data from `MRegularSeasonCompactResults.csv`

In [238]:
games = dfs['MRegularSeasonCompactResults']

 ## 3. Create a dataframe containing the regular game statistics for each match of each season. 
 #### Aggregate the data see the average points scored, average points scored against, and win percentage


In [239]:
def build_team_stats(df):
    winning_stats = df[['Season', 'WTeamID', 'WScore', 'LScore']].rename(
    columns={'WTeamID': 'TeamID', 'WScore': 'PointsFor', 'LScore': 'PointsAgainst'}
    )
    winning_stats['Win'] = 1

    losing_stats = df[['Season', 'LTeamID', 'LScore', 'WScore']].rename(
        columns={'LTeamID': 'TeamID', 'LScore': 'PointsFor', 'WScore': 'PointsAgainst'}
    )
    losing_stats['Win'] = 0

    all_stats = pd.concat([winning_stats, losing_stats])

    # all_stats['MarginOfVictory'] = all_stats['PointsFor'] - all_stats['PointsAgainst']

    # aggregate data to see averaage points for and against, and win percentage
    team_stats = all_stats.groupby(['Season', 'TeamID']).agg(
    avg_points_for=('PointsFor', 'mean'),
    avg_points_against=('PointsAgainst', 'mean'),
    win_pct=('Win', 'mean')
    # avg_margin_of_victory=('MarginOfVictory', 'mean')  # Add margin of victory
    ).reset_index()

    return team_stats

team_stats = build_team_stats(games)


 ## 4. Create dataframe containing the matchups and merge in data from the statistics dataframe

In [240]:
def generate_matchups(games, team_stats, dfs):
    # Create labeled matchup data from real games
    matchups = games[['Season', 'WTeamID', 'LTeamID']].copy()
    matchups['Team1ID'] = matchups['WTeamID']
    matchups['Team2ID'] = matchups['LTeamID']
    matchups['Team1Won'] = 1

    # Merge in features for both teams
    for i in [1, 2]:
        matchups = matchups.merge(
            team_stats,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        )
        matchups = matchups.rename(columns={
            'avg_points_for': f'Team{i}_avg_points_for',
            'avg_points_against': f'Team{i}_avg_points_against',
            'win_pct': f'Team{i}_win_pct'
            # 'avg_margin_of_victory': f'Team{i}_avg_margin_of_victory'  # Add margin of victory
        })
        matchups.drop(columns=['TeamID'], inplace=True)

    # randomly swap Team1 and Team2 
    np.random.seed(42)

    # Create a random boolean array: True means "swap"
    swap_mask = np.random.rand(len(matchups)) < 0.5

    # Swap team IDs by matching the mask
    matchups.loc[swap_mask, ['Team1ID', 'Team2ID']] = matchups.loc[swap_mask, ['Team2ID', 'Team1ID']].values
    for feature in ['avg_points_for', 'avg_points_against', 'win_pct']:
        team1_feature = f'Team1_{feature}'
        team2_feature = f'Team2_{feature}'
        matchups.loc[swap_mask, [team1_feature, team2_feature]] = matchups.loc[swap_mask, [team2_feature, team1_feature]].values

    # Set the target: 1 if original Team1 won, 0 if swapped
    matchups['Team1Won'] = (~swap_mask).astype(int)

    # add ordinal rankings to matchups
    # matchups = merge_rankings(matchups, dfs)

    return matchups

matchups = generate_matchups(games, team_stats, dfs)


 ## 5. Build and test model

In [241]:

feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct',
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct'
]

X = matchups[feature_cols]
y = matchups['Team1Won']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

Model accuracy: 0.7421


#### This model gives an accuracy of 74.21%

 ## 6. Add margin of victory to team stats to improve the model
 The margin of victory is the difference between the points scored by the winning team and the points scored by the losing team.
 This is a common feature used in sports analytics to predict the outcome of games.
 The margin of victory is a good predictor of the outcome of a game because it takes into account the strength of both teams.
 A team that wins by a large margin is likely to be stronger than a team that wins by a small margin.

 To do this, we will add the margin of victory to the team stats dataframe and then re-run the model.

In [242]:
def build_team_stats(df):
    winning_stats = df[['Season', 'WTeamID', 'WScore', 'LScore']].rename(
    columns={'WTeamID': 'TeamID', 'WScore': 'PointsFor', 'LScore': 'PointsAgainst'}
    )
    winning_stats['Win'] = 1

    losing_stats = df[['Season', 'LTeamID', 'LScore', 'WScore']].rename(
        columns={'LTeamID': 'TeamID', 'LScore': 'PointsFor', 'WScore': 'PointsAgainst'}
    )
    losing_stats['Win'] = 0

    all_stats = pd.concat([winning_stats, losing_stats])

    all_stats['MarginOfVictory'] = all_stats['PointsFor'] - all_stats['PointsAgainst']

    # aggregate data to see averaage points for and against, and win percentage
    team_stats = all_stats.groupby(['Season', 'TeamID']).agg(
    avg_points_for=('PointsFor', 'mean'),
    avg_points_against=('PointsAgainst', 'mean'),
    win_pct=('Win', 'mean'),
    avg_margin_of_victory=('MarginOfVictory', 'mean')  # Add margin of victory
    ).reset_index()

    return team_stats

team_stats = build_team_stats(games)

In [243]:
def generate_matchups(games, team_stats, dfs):
    # Create labeled matchup data from real games
    matchups = games[['Season', 'WTeamID', 'LTeamID']].copy()
    matchups['Team1ID'] = matchups['WTeamID']
    matchups['Team2ID'] = matchups['LTeamID']
    matchups['Team1Won'] = 1

    # Merge in features for both teams
    for i in [1, 2]:
        matchups = matchups.merge(
            team_stats,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        )
        matchups = matchups.rename(columns={
            'avg_points_for': f'Team{i}_avg_points_for',
            'avg_points_against': f'Team{i}_avg_points_against',
            'win_pct': f'Team{i}_win_pct',
            'avg_margin_of_victory': f'Team{i}_avg_margin_of_victory'  # Add margin of victory
        })
        matchups.drop(columns=['TeamID'], inplace=True)

    # randomly swap Team1 and Team2 
    np.random.seed(42)

    # Create a random boolean array: True means "swap"
    swap_mask = np.random.rand(len(matchups)) < 0.5

    # Swap team IDs by matching the mask
    matchups.loc[swap_mask, ['Team1ID', 'Team2ID']] = matchups.loc[swap_mask, ['Team2ID', 'Team1ID']].values
    for feature in ['avg_points_for', 'avg_points_against', 'win_pct']:
        team1_feature = f'Team1_{feature}'
        team2_feature = f'Team2_{feature}'
        matchups.loc[swap_mask, [team1_feature, team2_feature]] = matchups.loc[swap_mask, [team2_feature, team1_feature]].values

    # Set the target: 1 if original Team1 won, 0 if swapped
    matchups['Team1Won'] = (~swap_mask).astype(int)

    # add ordinal rankings to matchups
    # matchups = merge_rankings(matchups, dfs)

    return matchups

matchups = generate_matchups(games, team_stats, dfs)

In [244]:
# Update feature columns to include margin of victory
feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 'Team1_avg_margin_of_victory',
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 'Team2_avg_margin_of_victory'
]
# Build feature matrix and labels
X = matchups[feature_cols]
y = matchups['Team1Won']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
# Evaluate
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy with Margin of Victory: {accuracy:.4f}")

Test Set Accuracy with Margin of Victory: 0.7419



 The model with the margin of victory feature has an accuracy of 74.19%.
 This is a decrease in performance for the model without the margin of victory feature.


 ## 6. Add Ranking Features
 We are going to use data from this file `MMasseyOrdinals.csv` to add ranking features to the model.
 This file contains the rankings of each team for each season. We will use the rankings to create new features for the model.
 The features we will create are:
 - Team1Rank: The ranking of Team1 for the season
 - Team2Rank: The ranking of Team2 for the season
 - Team1RankDiff: The difference between the rankings of Team1 and Team2
 - Team1RankDiffAbs: The absolute difference between the rankings of Team1 and Team2
 - Team1RankDiffPct: The percentage difference between the rankings of Team1 and Team2
 - Team1RankDiffPctAbs: The absolute percentage difference between the rankings of Team1 and Team2

In [245]:
def merge_rankings(matchups, dfs):
    # Load the rankings data
    rankings = dfs['MMasseyOrdinals']

    # keep only the relevant columns
    rankings = rankings[['Season', 'TeamID', 'OrdinalRank']]
    
    # keep the average ranking by season for each team
    rankings = rankings.groupby(['Season', 'TeamID']).agg(
        OrdinalRank=('OrdinalRank', 'mean')
    ).reset_index()

    # Add ordinal rankings for both Team1 and Team2
    for i in [1, 2]:
        matchups = matchups.merge(
            rankings, how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        )
        matchups = matchups.drop(columns=['TeamID'])
        matchups = matchups.rename(columns={'OrdinalRank': f'Team{i}Rank'})

    # drop records with missing values in the rankings
    matchups = matchups.dropna(subset=['Team1Rank', 'Team2Rank'])

    # calculate rank difference
    matchups['Team1RankDiff'] = matchups['Team1Rank'] - matchups['Team2Rank']
    matchups['Team1RankDiffAbs'] = matchups['Team1RankDiff'].abs()
    matchups['Team1RankDiffPct'] = matchups['Team1RankDiff'] / matchups['Team2Rank']
    matchups['Team1RankDiffPctAbs'] = matchups['Team1RankDiffPct'].abs()

    return matchups

In [246]:
def generate_matchups(games, team_stats, dfs):
    # Create labeled matchup data from real games
    matchups = games[['Season', 'WTeamID', 'LTeamID']].copy()
    matchups['Team1ID'] = matchups['WTeamID']
    matchups['Team2ID'] = matchups['LTeamID']
    matchups['Team1Won'] = 1

    # Merge in features for both teams
    for i in [1, 2]:
        matchups = matchups.merge(
            team_stats,
            how='left',
            left_on=['Season', f'Team{i}ID'],
            right_on=['Season', 'TeamID']
        )
        matchups = matchups.rename(columns={
            'avg_points_for': f'Team{i}_avg_points_for',
            'avg_points_against': f'Team{i}_avg_points_against',
            'win_pct': f'Team{i}_win_pct',
            'avg_margin_of_victory': f'Team{i}_avg_margin_of_victory'  # Add margin of victory
        })
        matchups.drop(columns=['TeamID'], inplace=True)

    # randomly swap Team1 and Team2 
    np.random.seed(42)

    # Create a random boolean array: True means "swap"
    swap_mask = np.random.rand(len(matchups)) < 0.5

    # Swap team IDs by matching the mask
    matchups.loc[swap_mask, ['Team1ID', 'Team2ID']] = matchups.loc[swap_mask, ['Team2ID', 'Team1ID']].values
    for feature in ['avg_points_for', 'avg_points_against', 'win_pct']:
        team1_feature = f'Team1_{feature}'
        team2_feature = f'Team2_{feature}'
        matchups.loc[swap_mask, [team1_feature, team2_feature]] = matchups.loc[swap_mask, [team2_feature, team1_feature]].values

    # Set the target: 1 if original Team1 won, 0 if swapped
    matchups['Team1Won'] = (~swap_mask).astype(int)

    # add ordinal rankings to matchups
    matchups = merge_rankings(matchups, dfs)

    return matchups
matchups = generate_matchups(games, team_stats, dfs)

In [247]:
# Update feature columns to include ranking features
feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 
    'Team1Rank', 'Team2Rank', 'Team1RankDiff', 'Team1RankDiffAbs', 'Team1RankDiffPct', 'Team1RankDiffPctAbs'
]
# Build feature matrix and labels
X = matchups[feature_cols]
y = matchups['Team1Won']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy with rankings: {accuracy:.10f}")

Test Set Accuracy with rankings: 0.7592947747


#### Logistic regression model with the ordinal ranking data had an accuracy of 75.93%

# 7. Add seed data

In [251]:
def matchups_with_seed_data(matchups, dfs):
    # add seed data to matchups
    # generate matchups with seed data
    seeds = dfs['MNCAATourneySeeds']
    seeds['SeedNum'] = seeds['Seed'].str.extract('(\\d+)').astype(int)

    # Team1 seed
    matchups = matchups.merge(seeds[['Season', 'TeamID', 'SeedNum']], left_on=['Season', 'Team1ID'], right_on=['Season', 'TeamID'], how='left')
    matchups = matchups.rename(columns={'SeedNum': 'Team1Seed'}).drop(columns=['TeamID'])

    # Team2 seed
    matchups = matchups.merge(seeds[['Season', 'TeamID', 'SeedNum']], left_on=['Season', 'Team2ID'], right_on=['Season', 'TeamID'], how='left')
    matchups = matchups.rename(columns={'SeedNum': 'Team2Seed'}).drop(columns=['TeamID'])

    # Fill missing seeds with 17 (meaning worse than 16-seed)
    matchups.fillna({'Team1Seed': 17, 'Team2Seed': 17}, inplace=True)


    return matchups
matchups = generate_matchups(games, team_stats, dfs)
seed_matchups = matchups_with_seed_data(matchups, dfs)

In [252]:
feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 
    'Team1Rank', 'Team2Rank', 'Team1RankDiff', 'Team1RankDiffAbs', 'Team1RankDiffPct', 'Team1RankDiffPctAbs',
    'Team1Seed', 'Team2Seed'
]
X = seed_matchups[feature_cols]
y = seed_matchups['Team1Won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy with seed data: {accuracy:.10f}")

Model accuracy with seed data: 0.7591265435


#### Logistic regression model with seed data had an accuracy of 75.91%.

 The model with the ordinal rankings feature is a better predictor of the outcome of a game, scoring an accuracy of 75.93%, than the model without these features. 

# 8. Random Forest
We will see if using the Random Forest Algorithm will produce better accuracy

In [254]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# matchups = generate_matchups(games, team_stats, dfs)
feature_cols = [
    'Team1_avg_points_for', 'Team1_avg_points_against', 'Team1_win_pct', 
    'Team2_avg_points_for', 'Team2_avg_points_against', 'Team2_win_pct', 
    'Team1Rank', 'Team2Rank', 'Team1RankDiff', 'Team1RankDiffAbs', 'Team1RankDiffPct', 'Team1RankDiffPctAbs',
]
# Split data
X = matchups[feature_cols]
y = matchups['Team1Won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred = rf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7197291500189259


### Using this random forest model, we get an accuracy of 71.97%

In [255]:
# change n_estimators to 300
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))


Accuracy: 0.719140345712243


### Increasing the n_estimators to 300 decreased the accuracy from 71.97% to 71.91% which is not much difference.