In [122]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import regex as re
from collections import defaultdict

things to look at: how accurate is the prematch probability when it ends (if a team has over 50% chance to win, how often do they actually win the game?)

need more data from other sportsbooks to calculate fair probability for spread markets 

In [123]:
date = '2025-11-22'

In [124]:
odds_df = pd.read_csv(f"../data_collection/updated_scripts/oddsapi_outputs/{date}/cbb_odds.csv")
odds_df.drop(columns=['league'], inplace=True)
odds_df.rename(columns={'price': 'odds'}, inplace=True)
odds_df['vig_prob'] = 1 / odds_df['odds']

def remove_vig_probs(df):
    df = df.copy()
    df['fair_prb'] = pd.NA

    grouped = df.groupby(['game_id', 'bookmaker', 'market'])

    for _, group in grouped:
        if len(group) < 2:
            continue
        probs = group['vig_prob']
        total = probs.sum()
        if total == 0:
            continue
        fair_probs = (probs / total).round(4)
        df.loc[group.index, 'fair_prb'] = fair_probs

    return df

odds_df = remove_vig_probs(odds_df)


odds_winners_df = odds_df[odds_df['market'] == 'h2h'].copy()
odds_spreads_df = odds_df[odds_df['market'] == 'spreads'].copy()
odds_spreads_df = odds_spreads_df.loc[(odds_spreads_df['point'].notna()) & (odds_spreads_df['point'] > 0)]
odds_totals_df  = odds_df[odds_df['market'] == 'totals'].copy()

# Average per-team fair probabilities across DraftKings/FanDuel/Pinnacle for winners_df
mask = odds_winners_df['fair_prb'].notna()
avg_by_team = (
    odds_winners_df.loc[mask]
    .groupby(['game_id', 'team'])['fair_prb']
    .transform('mean')
    .round(4)
)
odds_winners_df.loc[mask, 'avg_fair_prb'] = avg_by_team
odds_winners_df.loc[~mask, 'avg_fair_prb'] = pd.NA

#Average fair probabilities for spreads for same game, point spread, and team
mask = odds_spreads_df['fair_prb'].notna()
avg_by_point = (
    odds_spreads_df.loc[mask]
    .groupby(['game_id', 'point', 'team'])['fair_prb']
    .transform('mean')
    .round(4)
)
odds_spreads_df['avg_fair_prb'] = avg_by_point

In [125]:
kalshi_winners_df = pd.read_csv(f"../data_collection/updated_scripts/kalshi_data_logs/{date}/ncaab_winners.csv")
kalshi_totals_df = pd.read_csv(f"../data_collection/updated_scripts/kalshi_data_logs/{date}/ncaab_totals.csv")
kalshi_spreads_df = pd.read_csv(f"../data_collection/updated_scripts/kalshi_data_logs/{date}/ncaab_spreads.csv")

kalshi_spreads_df['points'] = kalshi_spreads_df['title'].str.extract(r'over ([\d.]+) Points\?').astype(float)

columns_to_drop = ['timestamp', 'market_type', 'yes_bid2', 'yes_ask2', 'no_bid2', 'no_ask2', 'yes_depth_bids', 'yes_depth_asks', 'no_depth_bids', 'no_depth_asks']
kalshi_winners_df.drop(columns=columns_to_drop, inplace=True)
kalshi_spreads_df.drop(columns=columns_to_drop, inplace=True)
kalshi_totals_df.drop(columns=columns_to_drop, inplace=True)

In [126]:
kalshi_spreads_df.head()

Unnamed: 0,ticker,title,status,event_start_time,yes_bid,yes_ask,no_bid,no_ask,yes_spread,no_spread,liquidity_dollars,volume_24h,points
0,KXNCAAMBSPREAD-25NOV22IONAORST-ORST9,Oregon St. wins by over 9.5 Points?,active,2025-12-06T17:30:00-05:00,0.21,0.38,0.62,0.79,0.17,0.17,5847.99,0.0,9.5
1,KXNCAAMBSPREAD-25NOV22IONAORST-ORST6,Oregon St. wins by over 6.5 Points?,active,2025-12-06T17:30:00-05:00,0.36,0.41,0.59,0.64,0.05,0.05,6301.07,13.0,6.5
2,KXNCAAMBSPREAD-25NOV22IONAORST-ORST3,Oregon St. wins by over 3.5 Points?,active,2025-12-06T17:30:00-05:00,0.51,0.53,0.47,0.49,0.02,0.02,18096.51,695.0,3.5
3,KXNCAAMBSPREAD-25NOV22IONAORST-ORST18,Oregon St. wins by over 18.5 Points?,active,2025-12-06T17:30:00-05:00,0.03,0.97,0.03,0.97,0.94,0.94,1506.76,0.0,18.5
4,KXNCAAMBSPREAD-25NOV22IONAORST-ORST15,Oregon St. wins by over 15.5 Points?,active,2025-12-06T17:30:00-05:00,0.09,0.28,0.72,0.91,0.19,0.19,5411.32,0.0,15.5


In [127]:
#get names from kalshi_winners_df
def extract_teams_from_winners(title):
    title = title.replace(" Winner?", "")
    if " at " in title:
        right, left = title.split(" at ", 1)
    else:
        return pd.Series([None, None])  
    left = re.sub(r'\bSt\.$', 'St', left.strip())
    right = re.sub(r'\bSt\.$', 'St', right.strip())
    return pd.Series([left, right])

kalshi_winners_df[['home_team', 'away_team']] = kalshi_winners_df['title'].apply(extract_teams_from_winners)
unique_rows = kalshi_winners_df.drop_duplicates(subset=['home_team', 'away_team'])
flat_teams = pd.unique(unique_rows[['home_team', 'away_team']].values.ravel())
kalshi_winners_teams = flat_teams.tolist()

#get names from kalshi_totals_df
def extract_team_from_totals(title):
    title = title.replace(": Total Points", "")
    if " at " in title:
        left = title.split(" at ", 1)[0].strip()
        left = re.sub(r'\bSt\.$', 'St', left)
        return left
    return None

kalshi_totals_df['away_team'] = kalshi_totals_df['title'].apply(extract_team_from_totals)
kalshi_totals_teams = kalshi_totals_df['away_team'].dropna().drop_duplicates().tolist()

#get names from kalshi_spreads_df
def extract_team_from_spreads(title):
    if " wins by " in title:
        team = title.split(" wins by ", 1)[0].strip()
        team = re.sub(r'\bSt\.$', 'St', team)
        return team
    return None

kalshi_spreads_df['team'] = kalshi_spreads_df['title'].apply(extract_team_from_spreads)
unique_teams_spread = kalshi_spreads_df['team'].drop_duplicates()
kalshi_spreads_teams = unique_teams_spread.tolist()

In [128]:
odds_teams_by_market = odds_df.groupby('market')['team'].unique().to_dict()

def fuzzy_match_kalshi_to_odds(kalshi_teams, odds_team_names):
    matched_kalshi = []
    matched_odds = []

    kalshi_sorted = sorted(kalshi_teams, key=lambda x: x[0] if x else '')
    remaining_odds = sorted(odds_team_names.tolist().copy(), reverse=True)

    for kalshi_name in kalshi_sorted:
        candidates = []
        for odds_name in remaining_odds:
            if kalshi_name in odds_name:
                candidates.append(odds_name)
        if len(candidates) == 1:
            matched_kalshi.append(kalshi_name)
            matched_odds.append(candidates[0])

    return matched_kalshi, matched_odds

matched_data = {}

# Winners / h2h
matched_kalshi_h2h, matched_odds_h2h = fuzzy_match_kalshi_to_odds(
    kalshi_winners_teams,
    odds_teams_by_market.get('h2h', [])
)

# Spreads
matched_kalshi_spreads, matched_odds_spreads = fuzzy_match_kalshi_to_odds(
    kalshi_spreads_teams,
    odds_teams_by_market.get('spreads', [])
)

# Totals (match only Over/Under)
totals_odds_df = odds_df[odds_df['market'] == 'totals']
odds_totals_teams = pd.unique(totals_odds_df[['home_team', 'away_team']].values.ravel())
matched_kalshi_totals, matched_odds_totals = fuzzy_match_kalshi_to_odds(
    kalshi_totals_teams,
    odds_totals_teams
)

matched_names = {
    'h2h': {
        'kalshi': matched_kalshi_h2h,
        'odds': matched_odds_h2h
    },
    'spreads': {
        'kalshi': matched_kalshi_spreads,
        'odds': matched_odds_spreads
    },
    'totals': {
        'kalshi': matched_kalshi_totals,
        'odds': matched_odds_totals
    }
}


In [129]:
assert(len(matched_names['h2h']['kalshi']) == len(matched_names['h2h']['odds']))
assert(len(matched_names['spreads']['kalshi']) == len(matched_names['spreads']['odds']))
assert(len(matched_names['totals']['kalshi']) == len(matched_names['totals']['odds']))

In [130]:
print("Missed teams in SPREAD market:\n")
for n in kalshi_spreads_teams:
    if n not in matched_names['spreads']['kalshi']:
        print(f'{n}\n')

Missed teams in SPREAD market:

Northern Colorado



In [131]:
odds_winners_df = odds_winners_df[
    odds_winners_df['home_team'].isin(matched_names['h2h']['odds']) |
    odds_winners_df['away_team'].isin(matched_names['h2h']['odds'])
].drop_duplicates(subset='team').reset_index(drop=True)

kalshi_winners_df = kalshi_winners_df[
    kalshi_winners_df['home_team'].isin(matched_names['h2h']['kalshi']) |
    kalshi_winners_df['away_team'].isin(matched_names['h2h']['kalshi'])
].reset_index(drop=True)

odds_spreads_df = odds_spreads_df[odds_spreads_df['team'].isin(matched_names['spreads']['odds'])].reset_index(drop=True)
kalshi_spreads_df = kalshi_spreads_df[kalshi_spreads_df['team'].isin(matched_names['spreads']['kalshi'])].reset_index(drop=True)

odds_totals_df = odds_totals_df[
    odds_totals_df['home_team'].isin(matched_names['totals']['odds']) |
    odds_totals_df['away_team'].isin(matched_names['totals']['odds'])
].reset_index(drop=True)
kalshi_totals_df = kalshi_totals_df[kalshi_totals_df['away_team'].isin(matched_names['totals']['kalshi'])].reset_index(drop=True)


In [87]:
# Specify the columns to extract
kalshi_cols = ['ticker', 'yes_bid', 'yes_ask', 'home_team', 'away_team']
odds_cols = ['market', 'start_time', 'team', 'home_team', 'away_team', 'avg_fair_prb']

# Rename overlapping columns in odds to prevent clashes
odds_subset = odds_winners_df[odds_cols].rename(columns={
    'home_team': 'odds_home_team',
    'away_team': 'odds_away_team'
})

kalshi_subset = kalshi_winners_df[kalshi_cols].rename(columns={
    'home_team': 'kalshi_home_team',
    'away_team': 'kalshi_away_team'
})

seen_tickers = set()
seen_rows = {}
combined_rows = []

# Loop through Kalshi rows
for _, kalshi_row in kalshi_subset.iterrows():
    kalshi_home = kalshi_row['kalshi_home_team']
    for _, odds_row in odds_subset.iterrows():
        odds_home = odds_row['odds_home_team']
        if kalshi_home in odds_home:
            ticker = kalshi_row['ticker']
            if ticker not in seen_tickers:
                seen_tickers.add(ticker)
                seen_rows[ticker] = odds_row
                continue
            else:
                curr_prb = odds_row['avg_fair_prb']
                prev_prb = seen_rows[ticker]['avg_fair_prb']
                midpoint = (kalshi_row['yes_bid'] + kalshi_row['yes_ask']) / 2
                if ((curr_prb - midpoint) ** 2) < ((prev_prb - midpoint) ** 2):
                    combined_row = pd.concat([kalshi_row, odds_row])
                else:
                    combined_row = pd.concat([kalshi_row, seen_rows[ticker]])
                combined_rows.append(combined_row)
                break

combined_winners_df = pd.DataFrame(combined_rows)
combined_winners_df = combined_winners_df.reset_index(drop=True)

In [88]:
combined_winners_df

Unnamed: 0,ticker,yes_bid,yes_ask,kalshi_home_team,kalshi_away_team,market,start_time,team,odds_home_team,odds_away_team,avg_fair_prb
0,KXNCAAMBGAME-25NOV22IONAORST-ORST,0.61,0.64,Oregon St,Iona,h2h,2025-11-22 16:30:00 CST,Oregon St Beavers,Oregon St Beavers,Iona Gaels,0.6177
1,KXNCAAMBGAME-25NOV22IONAORST-IONA,0.36,0.38,Oregon St,Iona,h2h,2025-11-22 16:30:00 CST,Iona Gaels,Oregon St Beavers,Iona Gaels,0.3823
2,KXNCAAMBGAME-25NOV22UNCOPORT-UNCO,0.59,0.61,Portland,Northern Colorado,h2h,2025-11-22 19:00:00 CST,N Colorado Bears,Portland Pilots,N Colorado Bears,0.604967
3,KXNCAAMBGAME-25NOV22UNCOPORT-PORT,0.39,0.41,Portland,Northern Colorado,h2h,2025-11-22 19:00:00 CST,Portland Pilots,Portland Pilots,N Colorado Bears,0.395033
4,KXNCAAMBGAME-25NOV22MILWWICH-WICH,0.84,0.85,Wichita St,Milwaukee,h2h,2025-11-22 18:00:00 CST,Wichita St Shockers,Wichita St Shockers,Milwaukee Panthers,0.833433
5,KXNCAAMBGAME-25NOV22MILWWICH-MILW,0.16,0.18,Wichita St,Milwaukee,h2h,2025-11-22 18:00:00 CST,Milwaukee Panthers,Wichita St Shockers,Milwaukee Panthers,0.166567
6,KXNCAAMBGAME-25NOV22WEBBRICH-WEBB,0.04,0.06,Richmond,Gardner-Webb,h2h,2025-11-22 18:00:00 CST,Gardner-Webb Bulldogs,Richmond Spiders,Gardner-Webb Bulldogs,0.0573
7,KXNCAAMBGAME-25NOV22WEBBRICH-RICH,0.94,0.96,Richmond,Gardner-Webb,h2h,2025-11-22 18:00:00 CST,Richmond Spiders,Richmond Spiders,Gardner-Webb Bulldogs,0.9427
8,KXNCAAMBGAME-25NOV22SFMINN-SF,0.51,0.53,Minnesota,San Francisco,h2h,2025-11-22 16:30:00 CST,San Francisco Dons,Minnesota Golden Gophers,San Francisco Dons,0.507833
9,KXNCAAMBGAME-25NOV22SFMINN-MINN,0.47,0.49,Minnesota,San Francisco,h2h,2025-11-22 16:30:00 CST,Minnesota Golden Gophers,Minnesota Golden Gophers,San Francisco Dons,0.492167


In [90]:
combined_winners_df.loc[(combined_winners_df['avg_fair_prb'] > combined_winners_df['yes_ask'] + 0.01) |
                        (combined_winners_df['avg_fair_prb'] < combined_winners_df['yes_bid'] - 0.01)]

Unnamed: 0,ticker,yes_bid,yes_ask,kalshi_home_team,kalshi_away_team,market,start_time,team,odds_home_team,odds_away_team,avg_fair_prb
12,KXNCAAMBGAME-25NOV22PROVPSU-PSU,0.4,0.41,Penn St,Providence,h2h,2025-11-22 15:00:00 CST,Penn State Nittany Lions,Penn State Nittany Lions,Providence Friars,0.380333
13,KXNCAAMBGAME-25NOV22PROVPSU-PROV,0.59,0.6,Penn St,Providence,h2h,2025-11-22 15:00:00 CST,Providence Friars,Penn State Nittany Lions,Providence Friars,0.619667


In [None]:
kalshi_cols = ['ticker', 'yes_bid', 'yes_ask', 'team', 'points']
odds_cols = ['market', 'start_time', 'team', 'home_team', 'away_team', 'avg_fair_prb', 'point']

odds_subset = odds_spreads_df[odds_cols].rename(columns={
    'home_team': 'odds_home_team',
    'away_team': 'odds_away_team',
    'team': 'odds_team'
})

kalshi_subset = kalshi_spreads_df[kalshi_cols]

combined_rows = []

for _, kalshi_row in kalshi_subset.iterrows():
    kalshi_home = kalshi_row['team']
    for _, odds_row in odds_subset.iterrows():
        odds_home = odds_row['odds_team']
        if (kalshi_home in odds_home) and (kalshi_row['points'] == odds_row['point']): 
            combined_row = pd.concat([kalshi_row, odds_row])
            combined_rows.append(combined_row)

combined_spreads_df = pd.DataFrame(combined_rows).drop_duplicates(subset='ticker') #only works because oddsapi only pulls odds 
combined_spreads_df = combined_spreads_df.reset_index(drop=True)                    #for only one point line for each bookmaker 



In [137]:
combined_spreads_df

Unnamed: 0,ticker,yes_bid,yes_ask,team,points,market,start_time,odds_team,odds_home_team,odds_away_team,avg_fair_prb,point
0,KXNCAAMBSPREAD-25NOV22IONAORST-IONA3,0.22,0.41,Iona,3.5,spreads,2025-11-22 16:30:00 CST,Iona Gaels,Oregon St Beavers,Iona Gaels,0.497833,3.5
1,KXNCAAMBSPREAD-25NOV22UNCOPORT-PORT3,0.22,0.32,Portland,3.5,spreads,2025-11-22 19:00:00 CST,Portland Pilots,Portland Pilots,N Colorado Bears,0.502167,3.5
2,KXNCAAMBSPREAD-25NOV22SFMINN-MINN1,0.43,0.47,Minnesota,1.5,spreads,2025-11-22 16:30:00 CST,Minnesota Golden Gophers,Minnesota Golden Gophers,San Francisco Dons,0.52095,1.5
