In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import datetime as dt
import random
from sportsipy.ncaab.teams import Teams
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from matplotlib import pyplot
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
#https://towardsdatascience.com/sports-reference-api-intro-dbce09e89e52
#from sportsreference.ncaab.roster import Roster
# OSU2020 = Roster('OHIO-STATE', year = '2020')
# for player in OSU2020.players:
#     print(player.name, ':', player.player_id)
    


# DATA Prep

## Web Scraping advanced Data

In [3]:
teams_df = pd.DataFrame()
def load_teams (start_year,end_year):
    basic_url = 'https://www.sports-reference.com/cbb/seasons/men/{}-school-stats.html'
    from bs4 import BeautifulSoup
    import requests
    import time
    basic_table_final = pd.DataFrame()
    adv_table_final = pd.DataFrame()
    opp_basic_table_final = pd.DataFrame()
    opp_adv_table_final = pd.DataFrame()
    years = range(start_year,end_year)
    dnp = ['BROWN','COLUMBIA','CORNELL','DARTMOUTH','HARVARD','PENNSYLVANIA','PRINCETON','YALE','MARYLAND-EASTERN SHORE',
       'CHICAGO STATE','HOWARD','MAINE','CHARLESTON SOUTHERN','JACKSONVILLE','HOLY CROSS']
    for year in years:
        if year == 2020:
            print(year, "skipped for Covid-19")
            year += 1
        elif year == 2021:
            print('Starting Basic Table Year {}'.format(year))
            url = basic_url.format(year)
            data = requests.get(url)
            with open("web_scrape/{}.html".format(year),"w+") as f:
                 f.write(data.text)
            with open("web_scrape/{}.html".format(year)) as f:
                page = f.read()
            soup = BeautifulSoup(page,"html.parser")
            soup.find('tr',class_="over_header").decompose()
            soup.find('tr',class_="thead").decompose()
            soup.find('tr',class_="over_header thead").decompose()
            table = soup.find(id="basic_school_stats")
            basic_table = pd.read_html(str(table))[0]
            basic_table = basic_table.drop(['Unnamed: 8','Unnamed: 11','Unnamed: 14','Unnamed: 17','Unnamed: 20'], axis=1)
            basic_table = basic_table.rename(columns={'W.1': 'conf_wins', 'L.1': 'conf_losses','W.2': 'home_wins','L.2':'home_losses','W.3': 'away_wins','L.3': 'away_losses',
                                                     'FT%': 'free_throw_percentage','3P%': 'three_point_field_goal_percentage','SRS': 'simple_rating_system','TOV' : 'turnovers','STL' : 'steals','TRB' : 'rebounds'})
            basic_table['Season']= year
            basic_table['School']=  basic_table['School'].str.upper()
            basic_table = basic_table[~basic_table.School.isin(dnp)]
            basic_table_final = basic_table_final.append(basic_table, ignore_index=True)
            basic_table_final['School'] = basic_table_final['School'].str.replace('NCAA', '')
            basic_table_final['School'] = basic_table_final['School'].astype(str)
            basic_table_final['School'] =  basic_table_final['School'].apply(lambda x: x.strip())
            basic_table_final = basic_table_final[basic_table_final.G != 'Overall']
            basic_table_final = basic_table_final[basic_table_final.Rk != 'Rk']
            basic_table_final['Tm.']= pd.to_numeric(basic_table_final['Tm.'])
            basic_table_final['Opp.']= pd.to_numeric(basic_table_final['Opp.'])
            basic_table_final['G']= pd.to_numeric(basic_table_final['G'])
            basic_table_final['avg_mov'] = (basic_table_final['Tm.']-basic_table_final['Opp.']) / basic_table_final['G']
            basic_table_final = basic_table_final[['Season','School','G','Tm.','Opp.','free_throw_percentage', 'three_point_field_goal_percentage','simple_rating_system','avg_mov','steals','turnovers','rebounds']]
            time.sleep(3)
            
        else:
            print('Starting Basic Table Year {}'.format(year))
            url = basic_url.format(year)
            data = requests.get(url)
            #time.sleep(10)
            with open("web_scrape/{}.html".format(year),"w+") as f:
                 f.write(data.text)
            with open("web_scrape/{}.html".format(year)) as f:
                page = f.read()
            soup = BeautifulSoup(page,"html.parser")
            soup.find('tr',class_="over_header").decompose()
            soup.find('tr',class_="thead").decompose()
            soup.find('tr',class_="over_header thead").decompose()
            table = soup.find(id="basic_school_stats")
            basic_table = pd.read_html(str(table))[0]
            basic_table = basic_table.drop(['Unnamed: 8','Unnamed: 11','Unnamed: 14','Unnamed: 17','Unnamed: 20'], axis=1)
            basic_table = basic_table.rename(columns={'W.1': 'conf_wins', 'L.1': 'conf_losses','W.2': 'home_wins','L.2':'home_losses','W.3': 'away_wins','L.3': 'away_losses',
                                                     'FT%': 'free_throw_percentage','3P%': 'three_point_field_goal_percentage','SRS': 'simple_rating_system','TOV' : 'turnovers',
                                                      'STL' : 'steals','TRB' : 'rebounds','AST' : 'assists'})
            basic_table['Season']= year
            basic_table['School']=  basic_table['School'].str.upper()
            basic_table_final = basic_table_final.append(basic_table, ignore_index=True)
            basic_table_final['School'] = basic_table_final['School'].str.replace('NCAA', '')
            basic_table_final['School'] = basic_table_final['School'].astype(str)
            basic_table_final['School'] =  basic_table_final['School'].apply(lambda x: x.strip())
            basic_table_final = basic_table_final[basic_table_final.G != 'Overall']
            basic_table_final = basic_table_final[basic_table_final.Rk != 'Rk']
            basic_table_final['Tm.']= pd.to_numeric(basic_table_final['Tm.'])
            basic_table_final['Opp.']= pd.to_numeric(basic_table_final['Opp.'])
            basic_table_final['G']= pd.to_numeric(basic_table_final['G'])
            basic_table_final['avg_mov'] = (basic_table_final['Tm.']-basic_table_final['Opp.']) / basic_table_final['G']
            basic_table_final = basic_table_final[['Season','School','G','Tm.','Opp.','free_throw_percentage', 'three_point_field_goal_percentage',
                                                   'simple_rating_system','avg_mov','steals','turnovers','rebounds','assists']]
            time.sleep(3)
            teams_df = basic_table_final
        
    #ADV TABLE
    time.sleep(20)
    adv_url = 'https://www.sports-reference.com/cbb/seasons/men/{}-advanced-school-stats.html'
    for year in years:
        print('Starting Advanced Table Year {}'.format(year))
        url = adv_url.format(year)
        data = requests.get(url)
        with open("web_scrape/{}.html".format(year),"w+") as f:
             f.write(data.text)
        with open("web_scrape/{}.html".format(year)) as f:
            page = f.read()
        soup = BeautifulSoup(page,"html.parser")
        soup.find('tr',class_="over_header").decompose()
        soup.find('tr',class_="thead").decompose()
        soup.find('tr',class_="over_header thead").decompose()
        table = soup.find(id="adv_school_stats")
        adv_table = pd.read_html(str(table))[0]
        adv_table = adv_table.drop([ 'W', 'L', 'W-L%', 'SRS', 'SOS', 'Unnamed: 8',
           'W.1', 'L.1', 'Unnamed: 11', 'W.2', 'L.2', 'Unnamed: 14', 'W.3', 'L.3',
           'Unnamed: 17', 'Tm.', 'Opp.', 'Unnamed: 20', 'STL%','ORB%'], axis=1)
        adv_table = adv_table.rename(columns={'Pace': 'pace', 'ORtg': 'offensive_rating', 'FTr': 'free_throw_attempt_rate', '3PAr': 'three_point_attempt_rate', 'TS%': 'true_shooting_percentage',
                                            'TRB%': 'total_rebound_percentage', 'BLK%': 'block_percentage','eFG%': 'effective_field_goal_percentage', 'TOV%': 'turnover_percentage',
                                              'FT/FGA': 'free_throws_per_field_goal_attempt','AST%': 'assist_percentage'})
        adv_table['Season']= year
        adv_table['School']=  adv_table['School'].str.upper()
        adv_table_final = adv_table_final.append(adv_table, ignore_index=True)
        adv_table_final['School'] = adv_table_final['School'].str.replace('NCAA', '')
        adv_table_final['School'] = adv_table_final['School'].astype(str)
        adv_table_final['School'] =  adv_table_final['School'].apply(lambda x: x.strip())
        adv_table_final = adv_table_final[adv_table_final.G != 'Overall']
        adv_table_final = adv_table_final[adv_table_final.Rk != 'Rk']
        adv_table_final = adv_table_final[['Season','School','pace', 'offensive_rating',
           'free_throw_attempt_rate', 'three_point_attempt_rate',
           'true_shooting_percentage', 'total_rebound_percentage',
           'block_percentage', 'effective_field_goal_percentage',
           'turnover_percentage', 'free_throws_per_field_goal_attempt','assist_percentage']]
        time.sleep(3)
    #OPP BASIC TABLE
    time.sleep(20)
    opp_basic_url = 'https://www.sports-reference.com/cbb/seasons/men/{}-opponent-stats.html'
    for year in years:
        print('Starting Opponent Basic Table Year {}'.format(year))
        url = opp_basic_url.format(year)
        data = requests.get(url)
        with open("web_scrape/{}.html".format(year),"w+") as f:
             f.write(data.text)
        with open("web_scrape/{}.html".format(year)) as f:
            page = f.read()
        soup = BeautifulSoup(page,"html.parser")
        soup.find('tr',class_="over_header").decompose()
        soup.find('tr',class_="thead").decompose()
        soup.find('tr',class_="over_header thead").decompose()
        table = soup.find(id="basic_opp_stats")
        opp_basic_table = pd.read_html(str(table))[0]
        opp_basic_table = opp_basic_table.rename(columns={'3P%': 'opp three point fg perc','TOV' : 'opp_turnovers','STL' : 'opp_steals','TRB' : 'opp_rebounds','AST' : 'opp_assists'})
        opp_basic_table['Season']= year
        opp_basic_table['School']=  opp_basic_table['School'].str.upper()
        opp_basic_table_final = opp_basic_table_final.append(opp_basic_table, ignore_index=True)
        opp_basic_table_final['School'] = opp_basic_table_final['School'].str.replace('NCAA', '')
        opp_basic_table_final['School'] = opp_basic_table_final['School'].astype(str)
        opp_basic_table_final['School'] =  opp_basic_table_final['School'].apply(lambda x: x.strip())
        opp_basic_table_final = opp_basic_table_final[opp_basic_table_final.G != 'Overall']
        opp_basic_table_final = opp_basic_table_final[opp_basic_table_final.Rk != 'Rk']
        opp_basic_table_final = opp_basic_table_final[['Season','School','opp three point fg perc','opp_steals','opp_turnovers','opp_rebounds','opp_assists']]
        time.sleep(3)
    time.sleep(20)
    #OPP ADV TABLE
    opp_adv_url = 'https://www.sports-reference.com/cbb/seasons/men/{}-advanced-opponent-stats.html'
    for year in years:
        print('Starting Opponent Advanced Table Year {}'.format(year))
        url = opp_adv_url.format(year)
        data = requests.get(url)
        with open("web_scrape/{}.html".format(year),"w+") as f:
             f.write(data.text)
        with open("web_scrape/{}.html".format(year)) as f:
            page = f.read()
        soup = BeautifulSoup(page,"html.parser")
        soup.find('tr',class_="over_header").decompose()
        soup.find('tr',class_="thead").decompose()
        soup.find('tr',class_="over_header thead").decompose()
        table = soup.find(id="adv_opp_stats")
        opp_adv_table = pd.read_html(str(table))[0]
        opp_adv_table = opp_adv_table.rename(columns={'Pace': 'opp_pace', 'ORtg': 'opp_offensive_rating', 'FTr': 'opp_free_throw_attempt_rate', '3PAr': 'opp_three_point_attempt_rate',
                                                      'TS%': 'opp_true_shooting_percentage','TRB%': 'opp_total_rebound_percentage', 'BLK%': 'opp_block_percentage','eFG%': 'opp_effective_field_goal_percentage',
                                                      'TOV%': 'opp_turnover_percentage', 'FT/FGA': 'opp_free_throws_per_field_goal_attempt','AST%': 'opp_assist_percentage'})
        opp_adv_table['Season']= year
        opp_adv_table['School']=  opp_adv_table['School'].str.upper()
        opp_adv_table_final = opp_adv_table_final.append(opp_adv_table, ignore_index=True)
        opp_adv_table_final['School'] = opp_adv_table_final['School'].str.replace('NCAA', '')
        opp_adv_table_final['School'] = opp_adv_table_final['School'].astype(str)
        opp_adv_table_final['School'] =  opp_adv_table_final['School'].apply(lambda x: x.strip())
        opp_adv_table_final = opp_adv_table_final[opp_adv_table_final.G != 'Overall']
        opp_adv_table_final = opp_adv_table_final[opp_adv_table_final.Rk != 'Rk']
        opp_adv_table_final = opp_adv_table_final[['Season','School','opp_pace', 'opp_offensive_rating',
           'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
           'opp_true_shooting_percentage', 'opp_total_rebound_percentage',
           'opp_block_percentage', 'opp_effective_field_goal_percentage',
           'opp_turnover_percentage', 'opp_free_throws_per_field_goal_attempt','opp_assist_percentage']]
        time.sleep(3)
    #Merge Together
    df1 = pd.merge(basic_table_final, adv_table_final, on=['Season', 'School'])
    df2 = pd.merge(df1, opp_basic_table_final, on=['Season', 'School'])
    teams_df = pd.merge(df2, opp_adv_table_final, on=['Season', 'School'])
    teams_df[['G', 'Tm.', 'Opp.', 'free_throw_percentage',
       'three_point_field_goal_percentage', 'simple_rating_system', 'avg_mov',
       'steals', 'turnovers', 'rebounds', 'assists', 'pace',
       'offensive_rating', 'free_throw_attempt_rate',
       'three_point_attempt_rate', 'true_shooting_percentage',
       'total_rebound_percentage', 'block_percentage',
       'effective_field_goal_percentage', 'turnover_percentage',
       'free_throws_per_field_goal_attempt', 'assist_percentage',
       'opp three point fg perc', 'opp_steals', 'opp_turnovers',
       'opp_rebounds', 'opp_assists', 'opp_pace', 'opp_offensive_rating',
       'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
       'opp_true_shooting_percentage', 'opp_total_rebound_percentage',
       'opp_block_percentage', 'opp_effective_field_goal_percentage',
       'opp_turnover_percentage', 'opp_free_throws_per_field_goal_attempt',
       'opp_assist_percentage']] = teams_df[['G', 'Tm.', 'Opp.', 'free_throw_percentage',
       'three_point_field_goal_percentage', 'simple_rating_system', 'avg_mov',
       'steals', 'turnovers', 'rebounds', 'assists', 'pace',
       'offensive_rating', 'free_throw_attempt_rate',
       'three_point_attempt_rate', 'true_shooting_percentage',
       'total_rebound_percentage', 'block_percentage',
       'effective_field_goal_percentage', 'turnover_percentage',
       'free_throws_per_field_goal_attempt', 'assist_percentage',
       'opp three point fg perc', 'opp_steals', 'opp_turnovers',
       'opp_rebounds', 'opp_assists', 'opp_pace', 'opp_offensive_rating',
       'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
       'opp_true_shooting_percentage', 'opp_total_rebound_percentage',
       'opp_block_percentage', 'opp_effective_field_goal_percentage',
       'opp_turnover_percentage', 'opp_free_throws_per_field_goal_attempt',
       'opp_assist_percentage']].apply(pd.to_numeric, errors='coerce')
    teams_df = teams_df.rename(columns={'School': 'abbreviation'})
    teams_df = teams_df.drop(['Tm.','Opp.'], axis=1)
    teams_df['turnover_margin'] = teams_df['turnovers']- teams_df['opp_turnovers']
    teams_df['assist_percentage'] =teams_df['assist_percentage']/100
    teams_df['turnover_percentage'] =teams_df['turnover_percentage']/100
    teams_df['total_rebound_percentage'] =teams_df['total_rebound_percentage']/100
    teams_df['opp_assist_percentage'] =teams_df['opp_assist_percentage']/100
    teams_df['opp_turnover_percentage'] =teams_df['opp_turnover_percentage']/100
    teams_df['opp_total_rebound_percentage'] =teams_df['opp_total_rebound_percentage']/100
    teams_df['assist_to_turnover_ratio'] =teams_df['assists']/ teams_df['turnovers']
    teams_df['steal_margin'] = teams_df['steals']- teams_df['opp_steals']
    teams_df['rebound_margin'] = teams_df['rebounds']- teams_df['opp_rebounds']
    teams_df['free_throw_per_fg_margin'] = teams_df['free_throws_per_field_goal_attempt']- teams_df['opp_free_throws_per_field_goal_attempt']
    teams_df['four_factor_score'] = (0.4*teams_df['effective_field_goal_percentage'])+(0.25*(1-teams_df['turnover_percentage']))
    +(0.2*teams_df['total_rebound_percentage'])+(0.15*teams_df['free_throws_per_field_goal_attempt'])

    teams_df['opp_four_factor_score'] = (0.4*teams_df['opp_effective_field_goal_percentage'])+(0.25*(1-teams_df['opp_turnover_percentage']))
    +(0.2*teams_df['opp_total_rebound_percentage'])+(0.15*teams_df['opp_free_throws_per_field_goal_attempt'])


    return teams_df

In [4]:
teams_df = load_teams(2015,2024)

Starting Basic Table Year 2015
Starting Basic Table Year 2016
Starting Basic Table Year 2017
Starting Basic Table Year 2018
Starting Basic Table Year 2019
2020 skipped for Covid-19
Starting Basic Table Year 2021
Starting Basic Table Year 2022
Starting Basic Table Year 2023
Starting Advanced Table Year 2015
Starting Advanced Table Year 2016
Starting Advanced Table Year 2017
Starting Advanced Table Year 2018
Starting Advanced Table Year 2019
Starting Advanced Table Year 2020
Starting Advanced Table Year 2021
Starting Advanced Table Year 2022
Starting Advanced Table Year 2023
Starting Opponent Basic Table Year 2015
Starting Opponent Basic Table Year 2016
Starting Opponent Basic Table Year 2017
Starting Opponent Basic Table Year 2018
Starting Opponent Basic Table Year 2019
Starting Opponent Basic Table Year 2020
Starting Opponent Basic Table Year 2021
Starting Opponent Basic Table Year 2022
Starting Opponent Basic Table Year 2023
Starting Opponent Advanced Table Year 2015
Starting Opponent

In [5]:
teams_df.columns

Index(['Season', 'abbreviation', 'G', 'free_throw_percentage',
       'three_point_field_goal_percentage', 'simple_rating_system', 'avg_mov',
       'steals', 'turnovers', 'rebounds', 'assists', 'pace',
       'offensive_rating', 'free_throw_attempt_rate',
       'three_point_attempt_rate', 'true_shooting_percentage',
       'total_rebound_percentage', 'block_percentage',
       'effective_field_goal_percentage', 'turnover_percentage',
       'free_throws_per_field_goal_attempt', 'assist_percentage',
       'opp three point fg perc', 'opp_steals', 'opp_turnovers',
       'opp_rebounds', 'opp_assists', 'opp_pace', 'opp_offensive_rating',
       'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
       'opp_true_shooting_percentage', 'opp_total_rebound_percentage',
       'opp_block_percentage', 'opp_effective_field_goal_percentage',
       'opp_turnover_percentage', 'opp_free_throws_per_field_goal_attempt',
       'opp_assist_percentage', 'turnover_margin', 'assist_to_turnov

## Four Factor Score

https://www.nbastuffer.com/analytics101/four-factors/

In [6]:
teams_df

Unnamed: 0,Season,abbreviation,G,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,...,opp_turnover_percentage,opp_free_throws_per_field_goal_attempt,opp_assist_percentage,turnover_margin,assist_to_turnover_ratio,steal_margin,rebound_margin,free_throw_per_fg_margin,four_factor_score,opp_four_factor_score
0,2015,ABILENE CHRISTIAN,31,0.727,0.380,-17.20,-6.774194,203,399,897,...,0.192,0.325,0.501,-53,,23,-216,-0.115,0.39980,0.41760
1,2015,AIR FORCE,31,0.659,0.371,-1.85,0.645161,195,366,930,...,0.173,0.240,0.643,-17,,12,-24,-0.035,0.42695,0.41555
2,2015,AKRON,35,0.658,0.350,3.65,5.114286,230,422,1249,...,0.167,0.278,0.488,-21,,-12,26,-0.073,0.41020,0.39025
3,2015,ALABAMA,34,0.717,0.317,10.52,2.941176,219,407,1122,...,0.164,0.277,0.543,-9,,44,-15,0.036,0.41100,0.39740
4,2015,ALABAMA A&M,29,0.645,0.288,-17.15,-6.586207,165,413,998,...,0.154,0.259,0.476,69,,-38,-20,0.012,0.38465,0.41110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2814,2023,WRIGHT STATE,33,0.737,0.357,-2.34,6.212121,207,419,1212,...,0.157,0.182,0.416,-15,1.181384,-55,74,-0.004,0.43015,0.41035
2815,2023,WYOMING,31,0.711,0.349,2.02,-2.903226,151,371,994,...,0.149,0.227,0.489,25,0.929919,-39,-30,0.009,0.41800,0.42515
2816,2023,XAVIER,37,0.710,0.390,16.03,6.864865,236,459,1396,...,0.144,0.194,0.536,25,1.535948,-22,201,0.029,0.43330,0.41520
2817,2023,YALE,30,0.694,0.361,7.81,12.533333,191,335,1133,...,0.165,0.227,0.504,-43,1.310448,17,176,-0.019,0.43025,0.39315


In [7]:
teams_df.columns

Index(['Season', 'abbreviation', 'G', 'free_throw_percentage',
       'three_point_field_goal_percentage', 'simple_rating_system', 'avg_mov',
       'steals', 'turnovers', 'rebounds', 'assists', 'pace',
       'offensive_rating', 'free_throw_attempt_rate',
       'three_point_attempt_rate', 'true_shooting_percentage',
       'total_rebound_percentage', 'block_percentage',
       'effective_field_goal_percentage', 'turnover_percentage',
       'free_throws_per_field_goal_attempt', 'assist_percentage',
       'opp three point fg perc', 'opp_steals', 'opp_turnovers',
       'opp_rebounds', 'opp_assists', 'opp_pace', 'opp_offensive_rating',
       'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
       'opp_true_shooting_percentage', 'opp_total_rebound_percentage',
       'opp_block_percentage', 'opp_effective_field_goal_percentage',
       'opp_turnover_percentage', 'opp_free_throws_per_field_goal_attempt',
       'opp_assist_percentage', 'turnover_margin', 'assist_to_turnov

## METRICS

- effective fg perc
- block perc
- free throw attempt rate
- free throw percentage
- pace
- steal perc
- three point attempt rate
- three point fg perc
- total rebound perc
- true shooting perc
- turnover perc
- 2 pt fg perc
- AdjustO	
- AdjustD	
- AdjustT	
- SOS pyth
- NCSOS Pyth
- Pyth
- massey ordinals
- elo





In [8]:
elo = pd.read_csv(r"C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\season_elos.csv")

In [9]:
elo.head()

Unnamed: 0,team_id,season,season_elo
0,1101,2014,1330.426954
1,1101,2015,1215.475143
2,1101,2016,1222.173273
3,1101,2017,1238.537655
4,1101,2018,1295.418922


In [10]:
elo['season'].max()

2024

In [11]:
kenpom = pd.read_csv(r"C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\kenpom.csv")

In [12]:
kenpom

Unnamed: 0,Year,Rank,Team,Conference,Wins,Losses,Seed,AdjEM,AdjustO,AdjustO Rank,...,Luck,Luck Rank,SOS AdjEM,SOS AdjEM Rank,SOS OppO,SOS OppO Rank,SOS OppD,SOS OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank
0,2011,1,Ohio St.,B10,29,2,,0.9824,125.4,2,...,0.043,72,0.7340,26,107.4,17,98.3,46,0.4219,245
1,2011,2,Duke,ACC,27,4,,0.9720,118.8,6,...,0.006,171,0.7254,31,106.0,34,97.4,19,0.6092,71
2,2011,3,Kansas,B12,30,2,,0.9711,119.8,4,...,0.071,30,0.6956,42,106.1,33,98.7,59,0.4944,178
3,2011,4,Texas,B12,26,6,,0.9657,114.0,23,...,-0.055,297,0.7080,38,105.6,43,97.8,31,0.5844,85
4,2011,5,Purdue,B10,25,6,,0.9641,116.1,12,...,-0.004,191,0.7713,13,108.1,13,97.3,14,0.5076,166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4580,2024,358,Stonehill,NEC,4,27,,-22.8900,91.9,354,...,-0.078,334,-4.9200,303,102.2,320,107.2,227,4.2800,82
4581,2024,359,Pacific,WCC,6,26,,-22.9100,95.2,340,...,-0.014,224,-0.0400,165,106.3,173,106.3,178,-4.9000,313
4582,2024,360,IUPUI,Horz,6,26,,-25.6300,92.1,353,...,-0.019,238,-2.9700,254,106.2,177,109.2,341,-4.1800,303
4583,2024,361,Coppin St.,MEAC,2,27,,-25.8600,85.6,361,...,-0.057,310,-5.1300,309,102.8,309,107.9,276,0.7200,172


In [13]:
kenpom.columns

Index(['Year', 'Rank', 'Team', 'Conference', 'Wins', 'Losses', 'Seed', 'AdjEM',
       'AdjustO', 'AdjustO Rank', 'AdjustD', 'AdjustD Rank', 'AdjustT',
       'AdjustT Rank', 'Luck', 'Luck Rank', 'SOS AdjEM', 'SOS AdjEM Rank',
       'SOS OppO', 'SOS OppO Rank', 'SOS OppD', 'SOS OppD Rank', 'NCSOS AdjEM',
       'NCSOS AdjEM Rank'],
      dtype='object')

In [14]:
kenpom['Year'].max()

2024

In [15]:
kenpom = kenpom[['Year','Team','Rank','AdjustO Rank','AdjustD Rank', 'SOS AdjEM Rank']]

In [16]:
kenpom

Unnamed: 0,Year,Team,Rank,AdjustO Rank,AdjustD Rank,SOS AdjEM Rank
0,2011,Ohio St.,1,2,10,26
1,2011,Duke,2,6,6,31
2,2011,Kansas,3,4,9,42
3,2011,Texas,4,23,1,38
4,2011,Purdue,5,12,5,13
...,...,...,...,...,...,...
4580,2024,Stonehill,358,354,344,303
4581,2024,Pacific,359,340,359,165
4582,2024,IUPUI,360,353,358,254
4583,2024,Coppin St.,361,361,289,309


In [17]:
seeds_df = pd.read_csv(r"C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\MDataFiles_Stage1\MNCAATourneySeeds.csv")

In [18]:
seeds_df['Seed'] = pd.to_numeric(seeds_df['Seed'].str.replace("[WXYZab]",""))

  """Entry point for launching an IPython kernel.


In [19]:
seeds_df['Season'].max()

2024

In [20]:
## MASSEY Ordinalas

In [21]:
# massey_df = pd.read_csv(r"C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\MDataFiles_Stage1\MMasseyOrdinals_thruSeason2024_day128.csv")
# #massey_dfTY = massey_df.loc[massey_df['Season'] == 2023]

# #Ranking at the end of season versus end of tourney (as shown in KP data)
# SAG_df_else = massey_df[massey_df['SystemName'].str.contains("NET")]
# SAG_end_df_else = SAG_df_else.loc[SAG_df_else['RankingDayNum'] == 128]
# SAG_end_df = SAG_end_df_else
# SAG_end_df = SAG_end_df[['Season','TeamID','OrdinalRank']]
# SAG_end_df.rename(columns={'OrdinalRank': 'RankSAG'}, inplace=True)

In [22]:
regular_results = pd.read_csv(r"C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\MDataFiles_Stage1\MRegularSeasonDetailedResults.csv")

In [23]:
def prepare_data(df):
    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

    dfswap.loc[df['WLoc'] == 'H', 'WLoc'] = 'A'
    dfswap.loc[df['WLoc'] == 'A', 'WLoc'] = 'H'
    df.columns.values[6] = 'location'
    dfswap.columns.values[6] = 'location'    
      
    df.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(df.columns)]
    dfswap.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(dfswap.columns)]

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location=='N','location'] = '0'
    output.loc[output.location=='H','location'] = '1'
    output.loc[output.location=='A','location'] = '-1'
    output.location = output.location.astype(int)
    
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    
    return output

regular_data = prepare_data(regular_results)


In [24]:
regular_data

Unnamed: 0,Season,DayNum,T1_TeamID,T1_Score,T2_TeamID,T2_Score,location,NumOT,T1_FGM,T1_FGA,...,T2_FTM,T2_FTA,T2_OR,T2_DR,T2_Ast,T2_TO,T2_Stl,T2_Blk,T2_PF,PointDiff
0,2003,10,1104,68,1328,62,0,0,27,58,...,16,22,10,22,8,18,9,2,20,6
1,2003,10,1272,70,1393,63,0,0,26,62,...,9,20,20,25,7,12,8,6,16,7
2,2003,11,1266,73,1437,61,0,0,24,58,...,14,23,31,22,9,12,2,5,23,12
3,2003,11,1296,56,1457,50,0,0,18,38,...,8,15,17,20,9,19,4,3,23,6
4,2003,11,1400,77,1208,71,0,0,30,61,...,17,27,21,15,12,10,7,1,14,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226477,2024,132,1196,67,1120,86,0,0,24,66,...,18,22,6,33,18,12,9,10,20,-19
226478,2024,132,1433,51,1182,57,0,0,17,47,...,15,21,19,23,13,13,12,2,14,-6
226479,2024,132,1458,87,1228,93,0,0,30,64,...,26,30,13,24,14,9,2,5,19,-6
226480,2024,132,1396,69,1412,85,0,0,25,61,...,15,22,9,27,19,8,13,5,13,-16


In [25]:
# Filter data for the last 14 days
last_14_days_data = regular_data[regular_data['DayNum'] > 118]

# Create columns indicating whether each team won the game
last_14_days_data['T1_Win'] = np.where(last_14_days_data['T1_Score'] > last_14_days_data['T2_Score'], 1, 0)
last_14_days_data['T2_Win'] = np.where(last_14_days_data['T2_Score'] > last_14_days_data['T1_Score'], 1, 0)

# Calculate win percentage for each team regardless of whether they played as Team 1 or Team 2
win_percentages_T1 = last_14_days_data.groupby(['Season', 'T1_TeamID'])['T1_Win'].mean().reset_index(name='Win_Percentage')
win_percentages_T2 = last_14_days_data.groupby(['Season', 'T2_TeamID'])['T2_Win'].mean().reset_index(name='Win_Percentage')

# Rename columns for clarity
win_percentages_T1.rename(columns={'T1_TeamID': 'TeamID'}, inplace=True)
win_percentages_T2.rename(columns={'T2_TeamID': 'TeamID'}, inplace=True)

# Concatenate win percentages for both teams
team_win_percentages = pd.concat([win_percentages_T1, win_percentages_T2], ignore_index=True)

# Group by Season and TeamID to get total win percentage for each team in the last 14 days
L14_win_perc = team_win_percentages.groupby(['Season', 'TeamID'])['Win_Percentage'].mean().reset_index(name='L14_win_perc')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
# LEF_df_else = massey_df[massey_df['SystemName'].str.contains("LEF")]
# LEF_end_df_else = LEF_df_else.loc[LEF_df_else['RankingDayNum'] == 133]
# LEF_end_df = LEF_end_df_else
# LEF_end_df = LEF_end_df[['Season','TeamID','OrdinalRank']]
# LEF_end_df.rename(columns={'OrdinalRank': 'RankLEF'}, inplace=True)

In [27]:
# Join all data into master dataset to get all the Xs

In [28]:
teams_df = teams_df.fillna(teams_df.mean())

  """Entry point for launching an IPython kernel.


In [29]:
teams_df.head()

Unnamed: 0,Season,abbreviation,G,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,...,opp_turnover_percentage,opp_free_throws_per_field_goal_attempt,opp_assist_percentage,turnover_margin,assist_to_turnover_ratio,steal_margin,rebound_margin,free_throw_per_fg_margin,four_factor_score,opp_four_factor_score
0,2015,ABILENE CHRISTIAN,31,0.727,0.38,-17.2,-6.774194,203,399,897,...,0.192,0.325,0.501,-53,1.061567,23,-216,-0.115,0.3998,0.4176
1,2015,AIR FORCE,31,0.659,0.371,-1.85,0.645161,195,366,930,...,0.173,0.24,0.643,-17,1.061567,12,-24,-0.035,0.42695,0.41555
2,2015,AKRON,35,0.658,0.35,3.65,5.114286,230,422,1249,...,0.167,0.278,0.488,-21,1.061567,-12,26,-0.073,0.4102,0.39025
3,2015,ALABAMA,34,0.717,0.317,10.52,2.941176,219,407,1122,...,0.164,0.277,0.543,-9,1.061567,44,-15,0.036,0.411,0.3974
4,2015,ALABAMA A&M,29,0.645,0.288,-17.15,-6.586207,165,413,998,...,0.154,0.259,0.476,69,1.061567,-38,-20,0.012,0.38465,0.4111


In [30]:
# Bring in key

In [31]:
team_abb = pd.read_csv(r"C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\team key.csv")
team_abb = team_abb[['abbreviation','TeamID','kenpom','scrape']]

In [85]:
team_data = teams_df[['Season', 'abbreviation', 'free_throw_percentage',
       'three_point_field_goal_percentage', 'simple_rating_system', 'avg_mov',
       'steals', 'turnovers', 'rebounds', 'assists', 'pace',
       'offensive_rating', 'free_throw_attempt_rate',
       'three_point_attempt_rate', 'true_shooting_percentage',
       'effective_field_goal_percentage', 'turnover_percentage',
       'four_factor_score',  'opp three point fg perc', 'opp_steals', 'opp_turnovers',
       'opp_rebounds', 'opp_assists', 'opp_pace', 'opp_offensive_rating',
       'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
       'opp_true_shooting_percentage',
        'opp_four_factor_score','assist_to_turnover_ratio',
        'free_throw_per_fg_margin','rebound_margin','steal_margin','turnover_margin']]

In [86]:
team_data

Unnamed: 0,Season,abbreviation,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,assists,...,opp_offensive_rating,opp_free_throw_attempt_rate,opp_three_point_attempt_rate,opp_true_shooting_percentage,opp_four_factor_score,assist_to_turnover_ratio,free_throw_per_fg_margin,rebound_margin,steal_margin,turnover_margin
0,2015,ABILENE CHRISTIAN,0.727,0.380,-17.20,-6.774194,203,399,897,426.816921,...,107.0,0.473,0.327,0.573,0.41760,1.061567,-0.115,-216,23,-53
1,2015,AIR FORCE,0.659,0.371,-1.85,0.645161,195,366,930,426.816921,...,106.1,0.349,0.444,0.551,0.41555,1.061567,-0.035,-24,12,-17
2,2015,AKRON,0.658,0.350,3.65,5.114286,230,422,1249,426.816921,...,96.9,0.410,0.310,0.498,0.39025,1.061567,-0.073,26,-12,-21
3,2015,ALABAMA,0.717,0.317,10.52,2.941176,219,407,1122,426.816921,...,100.8,0.392,0.370,0.513,0.39740,1.061567,0.036,-15,44,-9
4,2015,ALABAMA A&M,0.645,0.288,-17.15,-6.586207,165,413,998,426.816921,...,105.8,0.412,0.293,0.526,0.41110,1.061567,0.012,-20,-38,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2814,2023,WRIGHT STATE,0.737,0.357,-2.34,6.212121,207,419,1212,495.000000,...,100.1,0.269,0.387,0.523,0.41035,1.181384,-0.004,74,-55,-15
2815,2023,WYOMING,0.711,0.349,2.02,-2.903226,151,371,994,345.000000,...,108.1,0.289,0.395,0.567,0.42515,0.929919,0.009,-30,-39,25
2816,2023,XAVIER,0.710,0.390,16.03,6.864865,236,459,1396,705.000000,...,102.6,0.270,0.335,0.532,0.41520,1.535948,0.029,201,-22,25
2817,2023,YALE,0.694,0.361,7.81,12.533333,191,335,1133,439.000000,...,92.9,0.327,0.408,0.498,0.39315,1.310448,-0.019,176,17,-43


In [87]:
team_data.shape

(2819, 34)

In [88]:
team_data2 = team_data.merge(team_abb,  left_on = ['abbreviation'],right_on = ['scrape'], how='left')
team_data2 = team_data2.fillna(0)
team_data2['TeamID'] = team_data2['TeamID'].astype(int)

In [89]:
team_data2 = team_data2.drop(columns=['abbreviation_x','abbreviation_y'])

In [90]:
# Merge elo
team_data2 = team_data2.merge(elo, left_on = ['TeamID','Season'], right_on = ['team_id','season'], how = 'left')
team_data2 = team_data2.drop(columns=['team_id','season'])
# Merge seed
team_data2 = team_data2.merge(seeds_df, left_on = ['TeamID','Season'], right_on = ['TeamID','Season'], how = 'left')

#Merge Kenpom Data
team_data2 = team_data2.merge(kenpom, left_on = ['kenpom','Season'], right_on = ['Team','Year'], how = 'left')
team_data2 = team_data2.drop(columns=['Team','Year','kenpom','scrape'])

#Last 14 days win percentage
team_data2 = team_data2.merge(L14_win_perc, left_on = ['TeamID','Season'], right_on = ['TeamID','Season'], how = 'left')


#SAG Rating
# team_data2 = team_data2.merge(SAG_end_df, left_on = ['TeamID','Season'], right_on = ['TeamID','Season'], how = 'left')

In [91]:
team_data2.head()

Unnamed: 0,Season,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,assists,pace,...,steal_margin,turnover_margin,TeamID,season_elo,Seed,Rank,AdjustO Rank,AdjustD Rank,SOS AdjEM Rank,L14_win_perc
0,2015,0.727,0.38,-17.2,-6.774194,203,399,897,426.816921,65.9,...,23,-53,1101,1215.475143,,332.0,319.0,324.0,281.0,0.333333
1,2015,0.659,0.371,-1.85,0.645161,195,366,930,426.816921,61.1,...,12,-17,1102,1497.946052,,203.0,88.0,308.0,186.0,0.333333
2,2015,0.658,0.35,3.65,5.114286,230,422,1249,426.816921,64.7,...,-12,-21,1103,1623.98151,,92.0,157.0,68.0,165.0,0.5
3,2015,0.717,0.317,10.52,2.941176,219,407,1122,426.816921,63.1,...,44,-9,1104,1735.039784,,61.0,60.0,93.0,31.0,0.333333
4,2015,0.645,0.288,-17.15,-6.586207,165,413,998,426.816921,64.5,...,-38,69,1105,1133.438732,,330.0,306.0,335.0,347.0,0.25


In [92]:
tournament_data = pd.read_csv(r'C:\Users\socst\Documents\Python Scripts\NCAAB-master\2024 MM\Data\MDataFiles_Stage1\MNCAATourneyDetailedResults.csv')

In [93]:
late_round_td = tournament_data[tournament_data['DayNum']>140]

In [94]:
# tourney_data = []

# for team_id in team_abb['TeamID']:
#     team_data = []
#     winning_team_tourney_data = tournament_data[tournament_data['WTeamID'] == team_id][['Season', 'WTeamID', 'WScore']]
#     winning_team_tourney_data.columns = ['Season', 'TeamID', 'Score']
#     winning_team_tourney_data = winning_team_tourney_data.merge(team_data2, left_on=['Season', 'TeamID'], right_on=['Season', 'TeamID'])
#     tourney_data.append(winning_team_tourney_data)
    
    
    
#     losing_team_tourney_data = tournament_data[tournament_data['LTeamID'] == team_id][['Season', 'LTeamID', 'LScore']]
#     losing_team_tourney_data.columns = ['Season', 'TeamID', 'Score']

#     losing_team_tourney_data = losing_team_tourney_data.merge(team_data2, left_on=['Season', 'TeamID'], right_on=['Season', 'TeamID'])
#     tourney_data.append(losing_team_tourney_data)

In [95]:
# tourney_model_data = pd.concat(tourney_data)
# tourney_model_data.sort_values('Season', inplace=True)

In [96]:
# tourney_model_data.head()

In [97]:
team_data2

Unnamed: 0,Season,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,assists,pace,...,steal_margin,turnover_margin,TeamID,season_elo,Seed,Rank,AdjustO Rank,AdjustD Rank,SOS AdjEM Rank,L14_win_perc
0,2015,0.727,0.380,-17.20,-6.774194,203,399,897,426.816921,65.9,...,23,-53,1101,1215.475143,,332.0,319.0,324.0,281.0,0.333333
1,2015,0.659,0.371,-1.85,0.645161,195,366,930,426.816921,61.1,...,12,-17,1102,1497.946052,,203.0,88.0,308.0,186.0,0.333333
2,2015,0.658,0.350,3.65,5.114286,230,422,1249,426.816921,64.7,...,-12,-21,1103,1623.981510,,92.0,157.0,68.0,165.0,0.500000
3,2015,0.717,0.317,10.52,2.941176,219,407,1122,426.816921,63.1,...,44,-9,1104,1735.039784,,61.0,60.0,93.0,31.0,0.333333
4,2015,0.645,0.288,-17.15,-6.586207,165,413,998,426.816921,64.5,...,-38,69,1105,1133.438732,,330.0,306.0,335.0,347.0,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2814,2023,0.737,0.357,-2.34,6.212121,207,419,1212,495.000000,72.0,...,-55,-15,1460,1517.661803,,193.0,186.0,189.0,307.0,0.500000
2815,2023,0.711,0.349,2.02,-2.903226,151,371,994,345.000000,66.4,...,-39,25,1461,1530.989339,,153.0,121.0,207.0,62.0,0.333333
2816,2023,0.710,0.390,16.03,6.864865,236,459,1396,705.000000,72.0,...,-22,25,1462,1897.768312,3.0,16.0,9.0,70.0,23.0,0.800000
2817,2023,0.694,0.361,7.81,12.533333,191,335,1133,439.000000,67.5,...,17,-43,1463,1686.305258,,67.0,81.0,56.0,162.0,0.666667


In [98]:
log_tourney_data = tournament_data[['Season', 'WTeamID', 'LTeamID','WScore','LScore']]
log_tourney_data_late = late_round_td[['Season', 'WTeamID', 'LTeamID','WScore','LScore']]

In [99]:
team1_tourney_data = log_tourney_data.merge(team_data2, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID']).drop(columns=['TeamID'])
team1_tourney_data = team1_tourney_data.merge(team_data2, left_on=['Season', 'LTeamID'], right_on = ['Season', 'TeamID'], suffixes=['', '_Opponent']).drop(columns=['TeamID'])
team1_tourney_data['Seed_Diff'] = team1_tourney_data['Seed_Opponent'] - team1_tourney_data['Seed']


team1_tourney_data['Win'] = 1
team1_tourney_data['Spread'] = team1_tourney_data['WScore']-team1_tourney_data['LScore']


team2_tourney_data = log_tourney_data.merge(team_data2, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
team2_tourney_data = team2_tourney_data.merge(team_data2, left_on=['Season', 'WTeamID'], right_on = ['Season', 'TeamID'], suffixes=['', '_Opponent'])
team2_tourney_data['Seed_Diff'] = team2_tourney_data['Seed_Opponent'] - team2_tourney_data['Seed']


team2_tourney_data['Win'] = 0
team2_tourney_data['Spread'] = team2_tourney_data['LScore']-team2_tourney_data['WScore']

combined_tourney_data = pd.concat([team1_tourney_data, team2_tourney_data]).drop(columns=['TeamID','TeamID_Opponent'])



#Late Round
team1_tourney_data_late = log_tourney_data_late.merge(team_data2, left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID']).drop(columns=['TeamID'])
team1_tourney_data_late = team1_tourney_data_late.merge(team_data2, left_on=['Season', 'LTeamID'], right_on = ['Season', 'TeamID'], suffixes=['', '_Opponent']).drop(columns=['TeamID'])
team1_tourney_data_late['Seed_Diff'] = team1_tourney_data_late['Seed_Opponent'] - team1_tourney_data_late['Seed']


team1_tourney_data_late['Win'] = 1
team1_tourney_data_late['Spread'] = team1_tourney_data_late['WScore']-team1_tourney_data['LScore']


team2_tourney_data_late = log_tourney_data_late.merge(team_data2, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
team2_tourney_data_late = team2_tourney_data_late.merge(team_data2, left_on=['Season', 'WTeamID'], right_on = ['Season', 'TeamID'], suffixes=['', '_Opponent'])
team2_tourney_data_late['Seed_Diff'] = team2_tourney_data_late['Seed_Opponent'] - team2_tourney_data_late['Seed']


team2_tourney_data_late['Win'] = 0
team2_tourney_data_late['Spread'] = team2_tourney_data_late['LScore']-team2_tourney_data_late['WScore']

combined_tourney_data_late = pd.concat([team1_tourney_data_late, team2_tourney_data_late]).drop(columns=['TeamID','TeamID_Opponent'])

In [100]:
combined_tourney_data.head()

Unnamed: 0,Season,WTeamID,LTeamID,WScore,LScore,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,...,season_elo_Opponent,Seed_Opponent,Rank_Opponent,AdjustO Rank_Opponent,AdjustD Rank_Opponent,SOS AdjEM Rank_Opponent,L14_win_perc_Opponent,Seed_Diff,Win,Spread
0,2015,1214,1264,74,64,0.652,0.304,-8.25,-1.171429,235,...,1590.325926,16.0,145.0,196.0,109.0,257.0,1.0,0.0,1,10
1,2015,1279,1140,94,90,0.773,0.332,11.39,4.382353,225,...,1818.631645,11.0,31.0,9.0,139.0,77.0,0.666667,0.0,1,4
2,2015,1173,1129,56,55,0.686,0.357,10.98,6.916667,250,...,1787.452505,11.0,39.0,48.0,64.0,131.0,0.75,0.0,1,1
3,2015,1173,1344,66,53,0.686,0.357,10.98,6.916667,250,...,1823.856197,6.0,28.0,38.0,48.0,8.0,0.5,-5.0,1,13
4,2015,1352,1316,81,77,0.709,0.375,-3.02,0.485714,300,...,1490.238442,16.0,127.0,98.0,178.0,314.0,1.0,0.0,1,4


In [101]:
for column in combined_tourney_data.columns:
    print(column)

Season
WTeamID
LTeamID
WScore
LScore
free_throw_percentage
three_point_field_goal_percentage
simple_rating_system
avg_mov
steals
turnovers
rebounds
assists
pace
offensive_rating
free_throw_attempt_rate
three_point_attempt_rate
true_shooting_percentage
effective_field_goal_percentage
turnover_percentage
four_factor_score
opp three point fg perc
opp_steals
opp_turnovers
opp_rebounds
opp_assists
opp_pace
opp_offensive_rating
opp_free_throw_attempt_rate
opp_three_point_attempt_rate
opp_true_shooting_percentage
opp_four_factor_score
assist_to_turnover_ratio
free_throw_per_fg_margin
rebound_margin
steal_margin
turnover_margin
season_elo
Seed
Rank
AdjustO Rank
AdjustD Rank
SOS AdjEM Rank
L14_win_perc
free_throw_percentage_Opponent
three_point_field_goal_percentage_Opponent
simple_rating_system_Opponent
avg_mov_Opponent
steals_Opponent
turnovers_Opponent
rebounds_Opponent
assists_Opponent
pace_Opponent
offensive_rating_Opponent
free_throw_attempt_rate_Opponent
three_point_attempt_rate_Opponent

## Prepare for Modeling

In [102]:
X = combined_tourney_data.drop(['Win', 'Seed', 'Seed_Opponent','WTeamID','LTeamID','WScore','LScore','Season','Spread'], axis=1)
X_late = combined_tourney_data_late.drop(['Win', 'Seed', 'Seed_Opponent','WTeamID','LTeamID','WScore','LScore','Season','Spread'], axis=1)


In [103]:
for column in X.columns:
    print(column)

free_throw_percentage
three_point_field_goal_percentage
simple_rating_system
avg_mov
steals
turnovers
rebounds
assists
pace
offensive_rating
free_throw_attempt_rate
three_point_attempt_rate
true_shooting_percentage
effective_field_goal_percentage
turnover_percentage
four_factor_score
opp three point fg perc
opp_steals
opp_turnovers
opp_rebounds
opp_assists
opp_pace
opp_offensive_rating
opp_free_throw_attempt_rate
opp_three_point_attempt_rate
opp_true_shooting_percentage
opp_four_factor_score
assist_to_turnover_ratio
free_throw_per_fg_margin
rebound_margin
steal_margin
turnover_margin
season_elo
Rank
AdjustO Rank
AdjustD Rank
SOS AdjEM Rank
L14_win_perc
free_throw_percentage_Opponent
three_point_field_goal_percentage_Opponent
simple_rating_system_Opponent
avg_mov_Opponent
steals_Opponent
turnovers_Opponent
rebounds_Opponent
assists_Opponent
pace_Opponent
offensive_rating_Opponent
free_throw_attempt_rate_Opponent
three_point_attempt_rate_Opponent
true_shooting_percentage_Opponent
effecti

In [104]:
X

Unnamed: 0,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,assists,pace,offensive_rating,...,rebound_margin_Opponent,steal_margin_Opponent,turnover_margin_Opponent,season_elo_Opponent,Rank_Opponent,AdjustO Rank_Opponent,AdjustD Rank_Opponent,SOS AdjEM Rank_Opponent,L14_win_perc_Opponent,Seed_Diff
0,0.652,0.304,-8.25,-1.171429,235,494,1292,426.816921,69.5,96.1,...,-63,37,-64,1590.325926,145.0,196.0,109.0,257.0,1.000000,0.0
1,0.773,0.332,11.39,4.382353,225,378,1284,426.816921,67.0,107.9,...,164,26,-39,1818.631645,31.0,9.0,139.0,77.0,0.666667,0.0
2,0.686,0.357,10.98,6.916667,250,415,1121,426.816921,64.2,105.1,...,89,44,-82,1787.452505,39.0,48.0,64.0,131.0,0.750000,0.0
3,0.686,0.357,10.98,6.916667,250,415,1121,426.816921,64.2,105.1,...,135,46,-35,1823.856197,28.0,38.0,48.0,8.0,0.500000,-5.0
4,0.709,0.375,-3.02,0.485714,300,467,1162,426.816921,66.7,102.6,...,32,-15,11,1490.238442,127.0,98.0,178.0,314.0,1.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,0.718,0.353,17.07,9.055556,330,386,1146,624.000000,69.3,112.9,...,103,-33,37,1880.827515,32.0,41.0,40.0,7.0,0.666667,5.0
531,0.696,0.348,9.89,7.942857,311,468,1161,456.000000,68.3,103.8,...,236,73,-63,1884.119806,11.0,40.0,9.0,73.0,0.500000,-7.0
532,0.689,0.322,11.29,3.111111,255,422,1315,515.000000,69.0,101.9,...,33,59,-123,1848.066397,27.0,53.0,20.0,11.0,0.500000,-5.0
533,0.661,0.365,0.60,6.028571,276,434,1234,502.000000,69.8,106.4,...,201,-22,25,1897.768312,16.0,9.0,70.0,23.0,0.800000,-11.0


In [105]:
y = combined_tourney_data['Win']
y_late = combined_tourney_data_late['Win']

In [106]:
y

0      1
1      1
2      1
3      1
4      1
      ..
530    0
531    0
532    0
533    0
534    0
Name: Win, Length: 1070, dtype: int64

In [107]:
#Correlation

In [108]:
X = X.fillna(X.median())
X_late = X_late.fillna(X_late.median())

In [109]:
# Concatenate X and y into a single DataFrame
data = pd.concat([X, y], axis=1)

# Calculate the correlation matrix
correlation_matrix = data.corr()

# Extract the correlation of each feature with the target variable
correlation_with_y = correlation_matrix['Win']

sorted_correlation = correlation_with_y.abs().sort_values(ascending=False)

# Display the correlation of each feature with the target variable
for feature, correlation in sorted_correlation.items():
    print(f"{feature}: {correlation}")

Win: 1.0
Seed_Diff: 0.4481443693020351
simple_rating_system_Opponent: 0.3699742059307836
simple_rating_system: 0.3699742059307805
season_elo_Opponent: 0.34680460796675516
season_elo: 0.3468046079667536
Rank: 0.2801462477887258
Rank_Opponent: 0.2801462477887255
rebounds: 0.27022477930051825
rebounds_Opponent: 0.270224779300515
SOS AdjEM Rank: 0.26915522626896604
SOS AdjEM Rank_Opponent: 0.2691552262689657
AdjustO Rank: 0.25960662643517424
AdjustO Rank_Opponent: 0.2596066264351737
avg_mov: 0.24381536818889044
avg_mov_Opponent: 0.24381536818888888
AdjustD Rank_Opponent: 0.2323233703772988
AdjustD Rank: 0.23232337037729756
offensive_rating_Opponent: 0.2167796490543608
offensive_rating: 0.2167796490543515
opp_rebounds: 0.19744153935873773
opp_rebounds_Opponent: 0.19744153935873282
steals: 0.16738114127229287
steals_Opponent: 0.16738114127229106
four_factor_score: 0.1641048713717558
four_factor_score_Opponent: 0.16410487137174476
opp_turnovers_Opponent: 0.16012588839130676
opp_turnovers: 0.1

In [110]:
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, make_scorer, brier_score_loss
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Normalize the features (X)
X_norm = scaler.fit_transform(X)

# Store the feature names
feature_names = X.columns.tolist()

# Define feature weights
feature_weights = {
    'season_elo': 1.2,
    'Seed_Diff': 1.3,
    'season_elo_Opponent': 1.3,
    'Rank': 1.2,
    'Rank_Opponent': 1.5,
    'AdjustD Rank': 1.3,
    'AdjustO Rank': 1.3,
    'AdjustD Rank_Opponent': 1.2,
    'AdjustO Rank_Opponent': 1.2,
    
    
    
}

# Apply feature weights to the normalized features
X_weighted = X_norm.copy()
for feature, weight in feature_weights.items():
    feature_index = feature_names.index(feature)
    X_weighted[:, feature_index] *= weight

# Split the normalized data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_weighted, y, test_size=0.2, random_state=42)

# Parameters for RandomizedSearchCV
parameters = {'n_neighbors': range(1, 20)}

# Create a list of classifiers
classifiers = [
    ('GradientBoosting', RandomizedSearchCV(GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
                                            {'n_estimators': range(100, 1000, 100), 'max_depth': range(1, 10)},
                                            n_iter=10, random_state=42)),
    ('XGBoost', XGBClassifier()),
    ('LogisticRegression', LogisticRegression(random_state=42)),
    ('SVC', SVC(kernel='linear', probability=True, random_state=42)),  # Set probability to True for soft voting
    ('KNeighbors', RandomizedSearchCV(KNeighborsClassifier(), parameters, n_iter=10, random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
    ('NeuralNetwork', MLPClassifier(random_state=42))  # No need for probability parameter
]

# Create a soft voting classifier
voting_classifier = VotingClassifier(estimators=classifiers, voting='soft')

# Perform cross-validation on each classifier and print their accuracies
print("Individual Classifier Accuracies:")
for name, classifier in classifiers:
    cv_scores = cross_val_score(classifier, X_weighted, y, cv=5)
    mean_accuracy = cv_scores.mean()
    print(f'{name} - Mean Accuracy: {mean_accuracy}')

# Perform cross-validation on the voting classifier using Brier score
brier_scorer = make_scorer(brier_score_loss, greater_is_better=False)
y_pred_prob_cv = cross_val_predict(voting_classifier, X_weighted, y, cv=5, method='predict_proba')

# Print the accuracy of the voting classifier
voting_cv_scores = cross_val_score(voting_classifier, X_weighted, y, cv=5)
voting_mean_accuracy = voting_cv_scores.mean()
print(f'Voting Classifier - Mean Accuracy: {voting_mean_accuracy}')



Individual Classifier Accuracies:
GradientBoosting - Mean Accuracy: 0.7523364485981308
XGBoost - Mean Accuracy: 0.7607476635514019
LogisticRegression - Mean Accuracy: 0.8355140186915888


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

SVC - Mean Accuracy: 0.8355140186915888
KNeighbors - Mean Accuracy: 0.7130841121495326
[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10493
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10585
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 428, nu



NeuralNetwork - Mean Accuracy: 0.8214953271028037


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10493
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10585
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10637
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10547
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10323
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10493
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10585
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10637
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10547
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 428, number of negative: 428
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10323
[LightGBM] [Info] Number of data points in the train set: 856, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Voting Classifier - Mean Accuracy: 0.8233644859813085




In [111]:
# from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV, cross_val_score
# from sklearn.metrics import accuracy_score, make_scorer, brier_score_loss
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.neural_network import MLPClassifier
# from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier
# from sklearn.preprocessing import StandardScaler

# # Create a StandardScaler object
# scaler = StandardScaler()

# # Normalize the features (X)
# X_norm = scaler.fit_transform(X_late)

# # Store the feature names
# feature_names = X_late.columns.tolist()

# # Define feature weights
# feature_weights = {
#     'season_elo': 1.3,
#     'Seed_Diff': 1.3,
#     'season_elo_Opponent': 1.3,
#     'Rank': 1.2,
#     'Rank_Opponent': 1.3,
#     'AdjustD Rank': 1.3,
#     'AdjustO Rank': 1.3,
#     'AdjustD Rank_Opponent': 1.3,
#     'AdjustO Rank_Opponent': 1.3,
    
    
    
# }

# # Apply feature weights to the normalized features
# X_weighted = X_norm.copy()
# for feature, weight in feature_weights.items():
#     feature_index = feature_names.index(feature)
#     X_weighted[:, feature_index] *= weight

# # Split the normalized data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_weighted, y_late, test_size=0.2, random_state=42)

# # Parameters for RandomizedSearchCV
# parameters = {'n_neighbors': range(1, 20)}

# # Create a list of classifiers
# classifiers_late = [
#     ('GradientBoosting', RandomizedSearchCV(GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
#                                             {'n_estimators': range(100, 1000, 100), 'max_depth': range(1, 10)},
#                                             n_iter=10, random_state=42)),
#     ('XGBoost', XGBClassifier()),
#     ('LogisticRegression', LogisticRegression(random_state=42)),
#     ('SVC', SVC(kernel='linear', probability=True, random_state=42)),  # Set probability to True for soft voting
#     ('KNeighbors', RandomizedSearchCV(KNeighborsClassifier(), parameters, n_iter=10, random_state=42)),
#     ('LightGBM', LGBMClassifier(random_state=42)),
#     ('NeuralNetwork', MLPClassifier(random_state=42))  # No need for probability parameter
# ]

# # Create a soft voting classifier
# voting_classifier_late = VotingClassifier(estimators=classifiers, voting='soft')

# # Perform cross-validation on each classifier and print their accuracies
# print("Individual Classifier Accuracies:")
# for name, classifier in classifiers_late:
#     cv_scores = cross_val_score(classifier, X_weighted, y_late, cv=5)
#     mean_accuracy = cv_scores.mean()
#     print(f'{name} - Mean Accuracy: {mean_accuracy}')

# # Perform cross-validation on the voting classifier using Brier score
# brier_scorer = make_scorer(brier_score_loss, greater_is_better=False)
# y_pred_prob_cv = cross_val_predict(voting_classifier_late, X_weighted, y_late, cv=5, method='predict_proba')

# # Print the accuracy of the voting classifier
# voting_cv_scores = cross_val_score(voting_classifier_late, X_weighted, y_late, cv=5)
# voting_mean_accuracy = voting_cv_scores.mean()
# print(f'Voting Classifier - Mean Accuracy: {voting_mean_accuracy}')


In [112]:
# Train the ensemble model on the entire dataset
voting_classifier.fit(X_weighted, y)

# Initialize a dictionary to store feature importances
feature_importances = {}

# Iterate over each base classifier in the ensemble
for name, clf in voting_classifier.named_estimators_.items():
    # Check if the classifier has attribute feature_importances_
    if hasattr(clf, 'feature_importances_'):
        # Aggregate feature importances across all base classifiers
        for feature, importance in zip(feature_names, clf.feature_importances_):
            if feature in feature_importances:
                feature_importances[feature] += importance
            else:
                feature_importances[feature] = importance

# Sort the features by their importance values in descending order
sorted_feature_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Print the top features and their importances
print("Top Features and Their Importances:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LightGBM] [Info] Number of positive: 535, number of negative: 535
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11407
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Top Features and Their Importances:
simple_rating_system_Opponent: 140.0274676680565
simple_rating_system: 135.02495764754713
opp_rebounds: 123.01822195574641
pace: 115.01179054379463
opp_rebounds_Opponent: 112.01929598674178
pace_Opponent: 110.01206128578633
rebounds_Opponent: 94.0211423933506
rebounds: 88.02388654090464
turnovers_Opponent: 71.01078098546714
turnovers: 60.012208794243634
free_throw_attempt_rate_Opponent: 58.009733295068145
four_factor_score_Opponent: 55.011724724434316
opp_turnovers_Opponent: 52.01359588652849
opp three point fg perc_Opponen



## Current Data

In [113]:
teams_df_TY = load_teams(2024,2025)

Starting Basic Table Year 2024
Starting Advanced Table Year 2024
Starting Opponent Basic Table Year 2024
Starting Opponent Advanced Table Year 2024


In [114]:
teams_df_TY

Unnamed: 0,Season,abbreviation,G,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,...,opp_turnover_percentage,opp_free_throws_per_field_goal_attempt,opp_assist_percentage,turnover_margin,assist_to_turnover_ratio,steal_margin,rebound_margin,free_throw_per_fg_margin,four_factor_score,opp_four_factor_score
0,2024,ABILENE CHRISTIAN,32,0.731,0.341,-3.82,-0.968750,253,404,1070,...,0.183,0.277,0.466,-74,1.002475,47,-101,0.007,0.40190,0.41105
1,2024,AIR FORCE,31,0.682,0.361,-4.34,-6.193548,202,372,872,...,0.153,0.292,0.499,24,1.212366,24,-144,-0.090,0.42295,0.42855
2,2024,AKRON,34,0.728,0.328,3.01,8.176471,192,389,1252,...,0.151,0.191,0.459,-1,1.149100,-13,156,0.046,0.42365,0.40505
3,2024,ALABAMA,32,0.784,0.365,21.10,9.687500,232,383,1267,...,0.136,0.294,0.451,9,1.344648,-5,142,-0.018,0.44055,0.41560
4,2024,ALABAMA A&M,33,0.719,0.285,-14.66,-7.090909,251,534,1170,...,0.171,0.347,0.543,60,0.644195,-60,-21,-0.003,0.38350,0.40445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2024,WRIGHT STATE,32,0.757,0.383,0.00,5.250000,200,379,1111,...,0.146,0.208,0.450,-11,1.395778,-35,70,0.040,0.44840,0.43030
358,2024,WYOMING,32,0.741,0.372,0.96,-1.625000,165,408,1103,...,0.143,0.201,0.485,48,0.916667,-96,34,0.088,0.41660,0.42305
359,2024,XAVIER,33,0.744,0.340,12.24,1.303030,231,381,1257,...,0.141,0.215,0.601,0,1.406824,28,72,0.023,0.41125,0.41315
360,2024,YALE,31,0.709,0.347,4.80,8.548387,186,293,1150,...,0.145,0.202,0.579,-46,1.610922,18,148,-0.011,0.43150,0.40735


In [115]:

team_data2_TY = teams_df_TY.merge(team_abb,  left_on = ['abbreviation'],right_on = ['scrape'], how='left')
team_data2_TY = team_data2_TY.fillna(0)
team_data2_TY['TeamID'] = team_data2_TY['TeamID'].astype(int)
team_data2_TY = team_data2_TY.merge(elo, left_on = ['TeamID','Season'], right_on = ['team_id','season'], how = 'left')
team_data2_TY = team_data2_TY.merge(seeds_df, left_on = ['TeamID','Season'], right_on = ['TeamID','Season'], how = 'left')

#Last 14 days win percentage
team_data2_TY = team_data2_TY.merge(L14_win_perc, left_on = ['TeamID','Season'], right_on = ['TeamID','Season'], how = 'left')
#SAG Rating
#team_data3_TY = team_data2_TY.merge(SAG_end_df, left_on = ['TeamID','Season'], right_on = ['TeamID','Season'], how = 'left')
team_data_abb_TY = team_data2_TY.merge(kenpom, left_on = ['kenpom','Season'], right_on = ['Team','Year'], how = 'left')


In [116]:
team_data_abb_TY = team_data_abb_TY[['Season','TeamID','free_throw_percentage',
       'three_point_field_goal_percentage', 'simple_rating_system', 'avg_mov',
       'steals', 'turnovers', 'rebounds', 'assists', 'pace',
       'offensive_rating', 'free_throw_attempt_rate',
       'three_point_attempt_rate', 'true_shooting_percentage',
       'effective_field_goal_percentage', 'turnover_percentage',
       'four_factor_score',  'opp three point fg perc', 'opp_steals', 'opp_turnovers',
       'opp_rebounds', 'opp_assists', 'opp_pace', 'opp_offensive_rating',
       'opp_free_throw_attempt_rate', 'opp_three_point_attempt_rate',
       'opp_true_shooting_percentage',
        'opp_four_factor_score','assist_to_turnover_ratio',
        'free_throw_per_fg_margin','rebound_margin','steal_margin','turnover_margin', 'season_elo',
       'Rank','AdjustO Rank','AdjustD Rank','SOS AdjEM Rank','L14_win_perc']]

In [117]:
team_data_abb_TY

Unnamed: 0,Season,TeamID,free_throw_percentage,three_point_field_goal_percentage,simple_rating_system,avg_mov,steals,turnovers,rebounds,assists,...,free_throw_per_fg_margin,rebound_margin,steal_margin,turnover_margin,season_elo,Rank,AdjustO Rank,AdjustD Rank,SOS AdjEM Rank,L14_win_perc
0,2024,1101,0.731,0.341,-3.82,-0.968750,253,404,1070,405,...,0.007,-101,47,-74,1488.993254,220.0,260.0,164.0,167.0,0.333333
1,2024,1102,0.682,0.361,-4.34,-6.193548,202,372,872,451,...,-0.090,-144,24,24,1403.425595,260.0,182.0,322.0,107.0,0.000000
2,2024,1103,0.728,0.328,3.01,8.176471,192,389,1252,447,...,0.046,156,-13,-1,1679.000385,116.0,162.0,97.0,219.0,0.600000
3,2024,1104,0.784,0.365,21.10,9.687500,232,383,1267,515,...,-0.018,142,-5,9,1927.232226,13.0,2.0,112.0,6.0,0.333333
4,2024,1105,0.719,0.285,-14.66,-7.090909,251,534,1170,344,...,-0.003,-21,-60,60,1175.494000,327.0,347.0,237.0,333.0,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,2024,1460,0.757,0.383,0.00,5.250000,200,379,1111,529,...,0.040,70,-35,-11,1524.499083,161.0,33.0,348.0,239.0,0.000000
358,2024,1461,0.741,0.372,0.96,-1.625000,165,408,1103,374,...,0.088,34,-96,48,1547.899529,156.0,133.0,209.0,86.0,0.666667
359,2024,1462,0.744,0.340,12.24,1.303030,231,381,1257,536,...,0.023,72,28,0,1814.969715,56.0,73.0,44.0,9.0,0.250000
360,2024,1463,0.709,0.347,4.80,8.548387,186,293,1150,472,...,-0.011,148,18,-46,1703.887346,84.0,89.0,90.0,127.0,0.666667


In [118]:
X.shape

(1070, 77)

In [119]:
# import math
# team1_name = team1
# team2_name = team2
# team1_id = list(team_abb[team_abb['abbreviation'] == team1_name]['TeamID'])[0]
# team2_id = list(team_abb[team_abb['abbreviation'] == team2_name]['TeamID'])[0]
# print(f'{team1_name} vs {team2_name}')


# season_data = team_data_abb_TY.query(f'Season == {season}').reset_index()
# team1_data = season_data.query(f'TeamID == {team1_id}')
# team2_data = season_data.query(f'TeamID == {team2_id}')
# match_data = team1_data.merge(team2_data, on='Season', suffixes=['', '_Opponent'])    
# match_data['Seed_Diff'] = team2_seed - team1_seed
# # Prepare the data for the match
# X_match = match_data.drop(columns=['Season', 'TeamID', 'TeamID_Opponent', 'index', 'index_Opponent'])

# # Normalize the features for the match
# X_match_norm = scaler.transform(X_match)  # Assuming scaler is already defined

# # Predict the outcome using the trained voting classifier
# win_prob = model.predict_proba(X_match_norm)[0][1]
    
# team1_win_prob = win_prob*100
# team2_win_prob = 100 - team1_win_prob
# if win_prob > 0.5:
#     team1_odds = math.ceil((100*team1_win_prob)/(team1_win_prob - 100))
#     team2_odds = -math.ceil(((100*(team2_win_prob-100))/(team2_win_prob)))
# elif win_prob < 0.5:
#     team1_odds = -math.ceil(((100*(team1_win_prob-100))/(team1_win_prob)))
#     team2_odds = math.ceil((100*team2_win_prob)/(team2_win_prob - 100))

# else:
#     team1_odds = 100
#     team2_odds = 100

    
# print(f'{team1_name} has a {round(win_prob*100,2)}% Chance of Winning ({team1_odds})')
# print(f'{team2_name} has a {round((1- win_prob)*100,2)}% Chance of Winning ({team2_odds})')
    
# predictions = {'ID': f'2023_{team1}_{team2}', 'Pred': win_prob}

In [120]:
def get_matchup(team1, team2, season, team1_seed, team2_seed, model):
    team1_name = team1
    team2_name = team2
    team1_id = list(team_abb[team_abb['abbreviation'] == team1_name]['TeamID'])[0]
    team2_id = list(team_abb[team_abb['abbreviation'] == team2_name]['TeamID'])[0]
    print(f'{team1_name} vs {team2_name}')

    season_data = team_data_abb_TY.query(f'Season == {season}').reset_index()
    team1_data = season_data.query(f'TeamID == {team1_id}')
    team2_data = season_data.query(f'TeamID == {team2_id}')
    match_data = team1_data.merge(team2_data, on='Season', suffixes=['', '_Opponent'])    
    match_data['Seed_Diff'] = team2_seed - team1_seed
    return match_data

In [121]:
import math

def predict_matchup_names(team1, team2, season, team1_seed, team2_seed, model):
    team1_name = team1
    team2_name = team2
    team1_id = list(team_abb[team_abb['abbreviation'] == team1_name]['TeamID'])[0]
    team2_id = list(team_abb[team_abb['abbreviation'] == team2_name]['TeamID'])[0]
    print(f'{team1_name} vs {team2_name}')

    season_data = team_data_abb_TY.query(f'Season == {season}').reset_index()
    team1_data = season_data.query(f'TeamID == {team1_id}')
    team2_data = season_data.query(f'TeamID == {team2_id}')
    match_data = team1_data.merge(team2_data, on='Season', suffixes=['', '_Opponent'])    
    match_data['Seed_Diff'] = team2_seed - team1_seed
    # Prepare the data for the match
    X_match = match_data.drop(columns=['Season', 'TeamID', 'TeamID_Opponent', 'index', 'index_Opponent'])

    # Normalize the features for the match
    X_match_norm = scaler.transform(X_match)  # Assuming scaler is already defined

    # Predict the outcome using the trained voting classifier
    win_prob = model.predict_proba(X_match_norm)[0][1]

    team1_win_prob = win_prob * 100
    team2_win_prob = 100 - team1_win_prob
    if win_prob > 0.5:
        team1_odds = math.ceil((100 * team1_win_prob) / (team1_win_prob - 100))
        team2_odds = -math.ceil(((100 * (team2_win_prob - 100)) / (team2_win_prob)))
    elif win_prob < 0.5:
        team1_odds = -math.ceil(((100 * (team1_win_prob - 100)) / (team1_win_prob)))
        team2_odds = math.ceil((100 * team2_win_prob) / (team2_win_prob - 100))
    else:
        team1_odds = 100
        team2_odds = 100

    print(f'{team1_name} has a {round(win_prob * 100, 2)}% Chance of Winning ({team1_odds})')
    print(f'{team2_name} has a {round((1 - win_prob) * 100, 2)}% Chance of Winning ({team2_odds})')

    # Return the predictions as a dictionary
    predictions = {'ID': f'{season}_{team1}_{team2}', 'Pred': win_prob}
    return predictions



In [122]:
def predict_matchup(team1_id, team2_id, season, team1_seed, team2_seed, model):
    team1_name = list(team_abb[team_abb['TeamID'] == team1_id]['abbreviation'])[0]
    team2_name = list(team_abb[team_abb['TeamID'] == team2_id]['abbreviation'])[0]
    print(f'{team1_name} vs {team2_name}')

    season_data = team_data_abb_TY.query(f'Season == {season}').reset_index()
    team1_data = season_data.query(f'TeamID == {team1_id}')
    team2_data = season_data.query(f'TeamID == {team2_id}')
    match_data = team1_data.merge(team2_data, on='Season', suffixes=['', '_Opponent'])    
    match_data['Seed_Diff'] = team2_seed - team1_seed
    # Prepare the data for the match
    X_match = match_data.drop(columns=['Season', 'TeamID', 'TeamID_Opponent', 'index', 'index_Opponent'])

    # Normalize the features for the match
    X_match_norm = scaler.transform(X_match)  # Assuming scaler is already defined

    # Predict the outcome using the trained voting classifier
    win_prob = model.predict_proba(X_match_norm)[0][1]

    team1_win_prob = win_prob * 100
    team2_win_prob = 100 - team1_win_prob
    if win_prob > 0.5:
        team1_odds = math.ceil((100 * team1_win_prob) / (team1_win_prob - 100))
        team2_odds = -math.ceil(((100 * (team2_win_prob - 100)) / (team2_win_prob)))
    elif win_prob < 0.5:
        team1_odds = -math.ceil(((100 * (team1_win_prob - 100)) / (team1_win_prob)))
        team2_odds = math.ceil((100 * team2_win_prob) / (team2_win_prob - 100))
    else:
        team1_odds = 100
        team2_odds = 100

    print(f'{team1_name} has a {round(win_prob * 100, 2)}% Chance of Winning ({team1_odds})')
    print(f'{team2_name} has a {round((1 - win_prob) * 100, 2)}% Chance of Winning ({team2_odds})')

    # Return the predictions as a dictionary
    predictions = {'ID': f'{season}_{team1_name}_{team2_name}', 'Pred': win_prob}
    return predictions


In [123]:
# prediction = predict_matchup(1116, 1181, 2023, 4, 2, voting_classifier)
# print("Prediction:", prediction)

In [134]:
first_four = [['WAGNER','HOWARD',16,16],['COLORADO STATE','VIRGINIA',10,10],
              ['GRAMBLING','MONTANA STATE',16,16],['COLORADO','BOISE STATE',10,10]]

#EAST
first_round = [['CONNECTICUT','STETSON',1,16],
               ['FLORIDA ATLANTIC','NORTHWESTERN',8,9],['SAN DIEGO STATE','UAB',5,12],['AUBURN','YALE',4,13],
               ['BRIGHAM YOUNG','DUQUESNE',6,11] ,['ILLINOIS','MOREHEAD STATE',3,14],['WASHINGTON STATE','DRAKE',7,10],
               ['IOWA STATE','SOUTH DAKOTA STATE',2,15],
#SOUTH              
               ['HOUSTON','LONGWOOD',1,16],
               ['NEBRASKA','TEXAS A&M',8,9],['WISCONSIN','JAMES MADISON',5,12],['DUKE','VERMONT',4,13],
               ['TEXAS TECH','NC STATE',6,11],['KENTUCKY','OAKLAND',3,14],['FLORIDA','COLORADO',7,10],['MARQUETTE','WESTERN KENTUCKY',2,15],
#WEST            
               ['NORTH CAROLINA','HOWARD',1,16],['MISSISSIPPI STATE','MICHIGAN STATE',8,9],['''SAINT MARY'S (CA)''','GRAND CANYON',5,12], 
               ['ALABAMA','COLLEGE OF CHARLESTON',4,13],['CLEMSON','NEW MEXICO',6,11], ['BAYLOR','COLGATE',3,14],
               ['DAYTON','NEVADA',7,10],['ARIZONA','LONG BEACH STATE',2,15],
#MIDWEST            
               ['PURDUE','GRAMBLING',1,16],['UTAH STATE','TCU',8,9],
               ['GONZAGA','MCNEESE STATE',5,12],['KANSAS','SAMFORD',4,13],['SOUTH CAROLINA','OREGON',6,11],
               ['CREIGHTON','AKRON',3,14],['TEXAS','COLORADO STATE',7,10],['TENNESSEE','''SAINT PETER'S''',2,15]]

second_round = [['CONNECTICUT','FLORIDA ATLANTIC',1,8],['SAN DIEGO STATE','AUBURN',5,14],['BRIGHAM YOUNG','ILLINOIS',6,3],['WASHINGTON STATE','IOWA STATE',7,2],
                
                ['HOUSTON','TEXAS A&M',1,9],['WISCONSIN','DUKE',5,4],['TEXAS TECH','KENTUCKY',6,3],['FLORIDA','MARQUETTE',7,2],
                
                ['NORTH CAROLINA','MICHIGAN STATE',1,9],['GRAND CANYON','ALABAMA',12,4],['NEW MEXICO','BAYLOR',11,3],['NEVADA','ARIZONA',10,2],
                
                ['PURDUE','TCU',1,9],['MCNEESE STATE','SAMFORD',12,13],['OREGON','CREIGHTON',11,3],['TEXAS','TENNESSEE',7,2]]

sweet_sixteen = [['CONNECTICUT','AUBURN',1,4],['ILLINOIS','IOWA STATE',3,2],['HOUSTON','DUKE',1,4],
                  ['TEXAS TECH','FLORIDA',6,7],['NORTH CAROLINA','ALABAMA',1,4],['BAYLOR','ARIZONA',3,2],['PURDUE','SAMFORD',1,13]
                 ,['CREIGHTON','TEXAS',3,7]]

elite_eight = [['CONNECTICUT','IOWA STATE',1,2],['HOUSTON','TEXAS TECH',1,6],['ALABAMA','ARIZONA',4,2],
                  ['PURDUE','CREIGHTON',1,11]]

final_four = [['CONNECTICUT','ARIZONA',1,2],['HOUSTON','PURDUE',1,1]]

finals = [['CONNECTICUT','HOUSTON',1,1]]

#one_game= [['HOUSTON','UCLA',4,12]]

In [135]:
# Define the matchups for each round
rounds = [second_round, sweet_sixteen, elite_eight, final_four, finals]#first_round, second_round, sweet_sixteen, elite_eight, final_four, finals

for i, round_matchups in enumerate(rounds):
    print(f"Round {i+1} Predictions:")
    print("------------------------")
    for matchup in round_matchups:
        if len(matchup) == 4:  # Adjust for the first_four round
            team1, team2, team1_seed, team2_seed = matchup
            season = 2024  # Assuming all matchups are for the 2023 season
        else:
            team1, team2, season, team1_seed, team2_seed = matchup

        prediction = predict_matchup_names(team1, team2, season, team1_seed, team2_seed, voting_classifier)
#         match_data = get_matchup(team1, team2, season, team1_seed, team2_seed, voting_classifier)
#         print(match_data.head())
        print(prediction)
    print()

Round 1 Predictions:
------------------------
CONNECTICUT vs FLORIDA ATLANTIC
CONNECTICUT has a 62.12% Chance of Winning (-163)
FLORIDA ATLANTIC has a 37.88% Chance of Winning (163)
{'ID': '2024_CONNECTICUT_FLORIDA ATLANTIC', 'Pred': 0.6211569625384739}
SAN DIEGO STATE vs AUBURN
SAN DIEGO STATE has a 65.79% Chance of Winning (-192)
AUBURN has a 34.21% Chance of Winning (192)
{'ID': '2024_SAN DIEGO STATE_AUBURN', 'Pred': 0.6579480824365677}
BRIGHAM YOUNG vs ILLINOIS
BRIGHAM YOUNG has a 25.75% Chance of Winning (288)
ILLINOIS has a 74.25% Chance of Winning (-288)
{'ID': '2024_BRIGHAM YOUNG_ILLINOIS', 'Pred': 0.2575119447413111}
WASHINGTON STATE vs IOWA STATE
WASHINGTON STATE has a 31.34% Chance of Winning (219)
IOWA STATE has a 68.66% Chance of Winning (-219)
{'ID': '2024_WASHINGTON STATE_IOWA STATE', 'Pred': 0.3134160089681492}
HOUSTON vs TEXAS A&M
HOUSTON has a 90.23% Chance of Winning (-923)
TEXAS A&M has a 9.77% Chance of Winning (923)
{'ID': '2024_HOUSTON_TEXAS A&M', 'Pred': 0.90227

In [136]:
match_data = get_matchup('CREIGHTON','AKRON',2024,3,14, voting_classifier)

CREIGHTON vs AKRON
