In [519]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import regex as re
from collections import defaultdict

In [520]:
odds_df = pd.read_csv("../data_collection/updated_scripts/oddsapi_outputs/2025-11-14/cbb_odds_2025-11-14.csv")
odds_df.drop(columns=['league'], inplace=True)
odds_df.rename(columns={'price': 'odds'}, inplace=True)
odds_df['vig_prob'] = 1 / odds_df['odds']

def remove_vig_probs(df):
    df = df.copy()
    df['fair_prb'] = pd.NA

    grouped = df.groupby(['game_id', 'bookmaker', 'market'])

    for _, group in grouped:
        if len(group) < 2:
            continue
        probs = group['vig_prob']
        total = probs.sum()
        if total == 0:
            continue
        fair_probs = (probs / total).round(4)
        df.loc[group.index, 'fair_prb'] = fair_probs

    return df

odds_df = remove_vig_probs(odds_df)


odds_winners_df = odds_df[odds_df['market'] == 'h2h'].copy()
odds_spreads_df = odds_df[odds_df['market'] == 'spreads'].copy()
odds_spreads_df = odds_spreads_df.loc[(odds_spreads_df['point'].notna()) & (odds_spreads_df['point'] > 0)]
odds_totals_df  = odds_df[odds_df['market'] == 'totals'].copy()

# Average per-team fair probabilities across DraftKings/FanDuel/Pinnacle for winners_df
mask = odds_winners_df['fair_prb'].notna()
avg_by_team = (
    odds_winners_df.loc[mask]
    .groupby(['game_id', 'team'])['fair_prb']
    .transform('mean')
    .round(4)
)
odds_winners_df.loc[mask, 'avg_fair_prb'] = avg_by_team
odds_winners_df.loc[~mask, 'avg_fair_prb'] = pd.NA

mask = odds_spreads_df['fair_prb'].notna()
avg_by_point = (
    odds_spreads_df.loc[mask]
    .groupby(['game_id', 'point', 'team'])['fair_prb']
    .transform('mean')
    .round(4)
)
odds_spreads_df['avg_fair_prb'] = avg_by_point

In [521]:
kalshi_winners_df = pd.read_csv("../data_collection/updated_scripts/kalshi_data_logs/2025-11-14/ncaab_winners.csv")
kalshi_totals_df = pd.read_csv("../data_collection/updated_scripts/kalshi_data_logs/2025-11-14/ncaab_totals.csv")
kalshi_spreads_df = pd.read_csv("../data_collection/updated_scripts/kalshi_data_logs/2025-11-14/ncaab_spreads.csv")

kalshi_spreads_df['points'] = kalshi_spreads_df['title'].str.extract(r'over ([\d.]+) Points\?').astype(float)

columns_to_drop = ['timestamp', 'market_type', 'yes_bid2', 'yes_ask2', 'no_bid2', 'no_ask2', 'yes_depth_bids', 'yes_depth_asks', 'no_depth_bids', 'no_depth_asks']
kalshi_winners_df.drop(columns=columns_to_drop, inplace=True)
kalshi_spreads_df.drop(columns=columns_to_drop, inplace=True)
kalshi_totals_df.drop(columns=columns_to_drop, inplace=True)

In [522]:
kalshi_spreads_df.head()

Unnamed: 0,ticker,title,status,event_start_time,yes_bid,yes_ask,no_bid,no_ask,yes_spread,no_spread,liquidity_dollars,volume_24h,points
0,KXNCAAMBSPREAD-25NOV14ALBYURI-URI7,Rhode Island wins by over 7.5 Points?,active,2025-11-28T19:00:00-05:00,0.04,0.92,0.08,0.96,0.88,0.88,691.23,3.0,7.5
1,KXNCAAMBSPREAD-25NOV14ALBYURI-URI4,Rhode Island wins by over 4.5 Points?,active,2025-11-28T19:00:00-05:00,0.12,0.97,0.03,0.88,0.85,0.85,695.8,5.0,4.5
2,KXNCAAMBSPREAD-25NOV14ALBYURI-URI28,Rhode Island wins by over 28.5 Points?,active,2025-11-28T19:00:00-05:00,0.03,0.51,0.49,0.97,0.48,0.48,650.79,0.0,28.5
3,KXNCAAMBSPREAD-25NOV14ALBYURI-URI25,Rhode Island wins by over 25.5 Points?,active,2025-11-28T19:00:00-05:00,0.04,0.96,0.04,0.96,0.92,0.92,643.26,125.0,25.5
4,KXNCAAMBSPREAD-25NOV14ALBYURI-URI22,Rhode Island wins by over 22.5 Points?,active,2025-11-28T19:00:00-05:00,0.27,0.35,0.65,0.73,0.08,0.08,905.58,125.0,22.5


In [523]:
#get names from kalshi_winners_df
def extract_teams_from_winners(title):
    title = title.replace(" Winner?", "")
    if " at " in title:
        right, left = title.split(" at ", 1)
    else:
        return pd.Series([None, None])  
    left = re.sub(r'\bSt\.$', 'St', left.strip())
    right = re.sub(r'\bSt\.$', 'St', right.strip())
    return pd.Series([left, right])

kalshi_winners_df[['home_team', 'away_team']] = kalshi_winners_df['title'].apply(extract_teams_from_winners)
unique_rows = kalshi_winners_df.drop_duplicates(subset=['home_team', 'away_team'])
flat_teams = pd.unique(unique_rows[['home_team', 'away_team']].values.ravel())
kalshi_winners_teams = flat_teams.tolist()

#get names from kalshi_totals_df
def extract_team_from_totals(title):
    title = title.replace(": Total Points", "")
    if " at " in title:
        left = title.split(" at ", 1)[0].strip()
        left = re.sub(r'\bSt\.$', 'St', left)
        return left
    return None

kalshi_totals_df['away_team'] = kalshi_totals_df['title'].apply(extract_team_from_totals)
kalshi_totals_teams = kalshi_totals_df['away_team'].dropna().drop_duplicates().tolist()

#get names from kalshi_spreads_df
def extract_team_from_spreads(title):
    if " wins by " in title:
        team = title.split(" wins by ", 1)[0].strip()
        team = re.sub(r'\bSt\.$', 'St', team)
        return team
    return None

kalshi_spreads_df['team'] = kalshi_spreads_df['title'].apply(extract_team_from_spreads)
unique_teams_spread = kalshi_spreads_df['team'].drop_duplicates()
kalshi_spreads_teams = unique_teams_spread.tolist()

In [524]:
kalshi_spreads_df.head()

Unnamed: 0,ticker,title,status,event_start_time,yes_bid,yes_ask,no_bid,no_ask,yes_spread,no_spread,liquidity_dollars,volume_24h,points,team
0,KXNCAAMBSPREAD-25NOV14ALBYURI-URI7,Rhode Island wins by over 7.5 Points?,active,2025-11-28T19:00:00-05:00,0.04,0.92,0.08,0.96,0.88,0.88,691.23,3.0,7.5,Rhode Island
1,KXNCAAMBSPREAD-25NOV14ALBYURI-URI4,Rhode Island wins by over 4.5 Points?,active,2025-11-28T19:00:00-05:00,0.12,0.97,0.03,0.88,0.85,0.85,695.8,5.0,4.5,Rhode Island
2,KXNCAAMBSPREAD-25NOV14ALBYURI-URI28,Rhode Island wins by over 28.5 Points?,active,2025-11-28T19:00:00-05:00,0.03,0.51,0.49,0.97,0.48,0.48,650.79,0.0,28.5,Rhode Island
3,KXNCAAMBSPREAD-25NOV14ALBYURI-URI25,Rhode Island wins by over 25.5 Points?,active,2025-11-28T19:00:00-05:00,0.04,0.96,0.04,0.96,0.92,0.92,643.26,125.0,25.5,Rhode Island
4,KXNCAAMBSPREAD-25NOV14ALBYURI-URI22,Rhode Island wins by over 22.5 Points?,active,2025-11-28T19:00:00-05:00,0.27,0.35,0.65,0.73,0.08,0.08,905.58,125.0,22.5,Rhode Island


In [525]:
odds_df.head()

Unnamed: 0,sport,game_id,start_time,bookmaker,market,team,odds,point,home_team,away_team,vig_prob,fair_prb
0,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,h2h,Charleston Southern Buccaneers,1.43,,Lindenwood Lions,Charleston Southern Buccaneers,0.699301,0.6579
1,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,h2h,Lindenwood Lions,2.75,,Lindenwood Lions,Charleston Southern Buccaneers,0.363636,0.3421
2,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,spreads,Charleston Southern Buccaneers,1.77,-2.5,Lindenwood Lions,Charleston Southern Buccaneers,0.564972,0.5305
3,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,spreads,Lindenwood Lions,2.0,2.5,Lindenwood Lions,Charleston Southern Buccaneers,0.5,0.4695
4,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,totals,Over,1.83,150.5,Lindenwood Lions,Charleston Southern Buccaneers,0.546448,0.5107


In [526]:
odds_teams_by_market = odds_df.groupby('market')['team'].unique().to_dict()

def fuzzy_match_kalshi_to_odds(kalshi_teams, odds_team_names):
    matched_kalshi = []
    matched_odds = []

    kalshi_sorted = sorted(kalshi_teams, key=lambda x: x[0] if x else '')
    remaining_odds = sorted(odds_team_names.tolist().copy(), reverse=True)

    for kalshi_name in kalshi_sorted:
        candidates = []
        for odds_name in remaining_odds:
            if kalshi_name in odds_name:
                candidates.append(odds_name)
        if len(candidates) == 1:
            matched_kalshi.append(kalshi_name)
            matched_odds.append(candidates[0])

    return matched_kalshi, matched_odds

matched_data = {}

# Winners / h2h
matched_kalshi_h2h, matched_odds_h2h = fuzzy_match_kalshi_to_odds(
    kalshi_winners_teams,
    odds_teams_by_market.get('h2h', [])
)

# Spreads
matched_kalshi_spreads, matched_odds_spreads = fuzzy_match_kalshi_to_odds(
    kalshi_spreads_teams,
    odds_teams_by_market.get('spreads', [])
)

# Totals (match only Over/Under)
totals_odds_df = odds_df[odds_df['market'] == 'totals']
odds_totals_teams = pd.unique(totals_odds_df[['home_team', 'away_team']].values.ravel())
matched_kalshi_totals, matched_odds_totals = fuzzy_match_kalshi_to_odds(
    kalshi_totals_teams,
    odds_totals_teams
)

matched_names = {
    'h2h': {
        'kalshi': matched_kalshi_h2h,
        'odds': matched_odds_h2h
    },
    'spreads': {
        'kalshi': matched_kalshi_spreads,
        'odds': matched_odds_spreads
    },
    'totals': {
        'kalshi': matched_kalshi_totals,
        'odds': matched_odds_totals
    }
}


In [527]:
assert(len(matched_names['h2h']['kalshi']) == len(matched_names['h2h']['odds']))
assert(len(matched_names['spreads']['kalshi']) == len(matched_names['spreads']['odds']))
assert(len(matched_names['totals']['kalshi']) == len(matched_names['totals']['odds']))

In [528]:
odds_df[odds_df['market'] == 'spreads']

Unnamed: 0,sport,game_id,start_time,bookmaker,market,team,odds,point,home_team,away_team,vig_prob,fair_prb
2,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,spreads,Charleston Southern Buccaneers,1.77,-2.5,Lindenwood Lions,Charleston Southern Buccaneers,0.564972,0.5305
3,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,DraftKings,spreads,Lindenwood Lions,2.00,2.5,Lindenwood Lions,Charleston Southern Buccaneers,0.500000,0.4695
8,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,FanDuel,spreads,Charleston Southern Buccaneers,1.83,-2.5,Lindenwood Lions,Charleston Southern Buccaneers,0.546448,0.5133
9,CBB,ef3972b9d28e04dd11342475593c9d1f,2025-11-14 17:29:18 CST,FanDuel,spreads,Lindenwood Lions,1.93,2.5,Lindenwood Lions,Charleston Southern Buccaneers,0.518135,0.4867
14,CBB,1a30506073314e98c9d3ec37b18818ae,2025-11-14 17:35:00 CST,FanDuel,spreads,Eastern Michigan Eagles,1.78,1.5,Eastern Michigan Eagles,IUPUI Jaguars,0.561798,0.5241
...,...,...,...,...,...,...,...,...,...,...,...,...
627,CBB,65ab0c40570bc45640aeff69e13a7e15,2025-11-14 23:00:00 CST,FanDuel,spreads,Manhattan Jaspers,1.91,15.5,Hawai'i Rainbow Warriors,Manhattan Jaspers,0.523560,0.5
632,CBB,65ab0c40570bc45640aeff69e13a7e15,2025-11-14 23:00:00 CST,DraftKings,spreads,Hawai'i Rainbow Warriors,1.87,-15.5,Hawai'i Rainbow Warriors,Manhattan Jaspers,0.534759,0.5105
633,CBB,65ab0c40570bc45640aeff69e13a7e15,2025-11-14 23:00:00 CST,DraftKings,spreads,Manhattan Jaspers,1.95,15.5,Hawai'i Rainbow Warriors,Manhattan Jaspers,0.512821,0.4895
636,CBB,65ab0c40570bc45640aeff69e13a7e15,2025-11-14 23:00:00 CST,Pinnacle,spreads,Hawai'i Rainbow Warriors,1.96,-16.0,Hawai'i Rainbow Warriors,Manhattan Jaspers,0.510204,0.4909


In [529]:
odds_winners_df = odds_winners_df[
    odds_winners_df['home_team'].isin(matched_names['h2h']['odds']) |
    odds_winners_df['away_team'].isin(matched_names['h2h']['odds'])
].drop_duplicates(subset='team').reset_index(drop=True)

kalshi_winners_df = kalshi_winners_df[
    kalshi_winners_df['home_team'].isin(matched_names['h2h']['kalshi']) |
    kalshi_winners_df['away_team'].isin(matched_names['h2h']['kalshi'])
].reset_index(drop=True)

odds_spreads_df = odds_spreads_df[odds_spreads_df['team'].isin(matched_names['spreads']['odds'])].reset_index(drop=True)
kalshi_spreads_df = kalshi_spreads_df[kalshi_spreads_df['team'].isin(matched_names['spreads']['kalshi'])].reset_index(drop=True)

odds_totals_df = odds_totals_df[
    odds_totals_df['home_team'].isin(matched_names['totals']['odds']) |
    odds_totals_df['away_team'].isin(matched_names['totals']['odds'])
].reset_index(drop=True)
kalshi_totals_df = kalshi_totals_df[kalshi_totals_df['away_team'].isin(matched_names['totals']['kalshi'])].reset_index(drop=True)


In [530]:
odds_spreads_df

Unnamed: 0,sport,game_id,start_time,bookmaker,market,team,odds,point,home_team,away_team,vig_prob,fair_prb,avg_fair_prb
0,CBB,d6497d18b189addaaea01350456a471a,2025-11-14 18:00:00 CST,FanDuel,spreads,Fordham Rams,1.94,4.5,Iona Gaels,Fordham Rams,0.515464,0.484,0.484
1,CBB,d6497d18b189addaaea01350456a471a,2025-11-14 18:00:00 CST,DraftKings,spreads,Fordham Rams,1.8,5.5,Iona Gaels,Fordham Rams,0.555556,0.52,0.52
2,CBB,9a78ff271734f93551602ae855d197ec,2025-11-14 18:32:13 CST,DraftKings,spreads,UAB Blazers,1.77,2.5,UAB Blazers,High Point Panthers,0.564972,0.5305,0.51925
3,CBB,9a78ff271734f93551602ae855d197ec,2025-11-14 18:32:13 CST,FanDuel,spreads,UAB Blazers,1.85,2.5,UAB Blazers,High Point Panthers,0.540541,0.508,0.51925
4,CBB,1df632fd747b044023ea9c6d4539f6a6,2025-11-14 18:33:06 CST,DraftKings,spreads,New Orleans Privateers,1.8,2.5,Tulane Green Wave,New Orleans Privateers,0.555556,0.52,0.52065
5,CBB,1df632fd747b044023ea9c6d4539f6a6,2025-11-14 18:33:06 CST,FanDuel,spreads,New Orleans Privateers,1.8,2.5,Tulane Green Wave,New Orleans Privateers,0.555556,0.5213,0.52065
6,CBB,ca0a4e03bb2dddcc0cdee0dfa0c093ba,2025-11-14 19:00:00 CST,FanDuel,spreads,UCF Knights,1.91,10.5,Texas A&M Aggies,UCF Knights,0.52356,0.5,0.50525
7,CBB,ca0a4e03bb2dddcc0cdee0dfa0c093ba,2025-11-14 19:00:00 CST,DraftKings,spreads,UCF Knights,1.87,10.5,Texas A&M Aggies,UCF Knights,0.534759,0.5105,0.50525
8,CBB,ca0a4e03bb2dddcc0cdee0dfa0c093ba,2025-11-14 19:00:00 CST,Pinnacle,spreads,UCF Knights,1.93,10.0,Texas A&M Aggies,UCF Knights,0.518135,0.5,0.5
9,CBB,1adbd0d0bc41165eaacb74c30117c9d9,2025-11-14 19:00:01 CST,FanDuel,spreads,Xavier Musketeers,1.94,15.5,Iowa Hawkeyes,Xavier Musketeers,0.515464,0.4921,0.497333


In [531]:
kalshi_spreads_df

Unnamed: 0,ticker,title,status,event_start_time,yes_bid,yes_ask,no_bid,no_ask,yes_spread,no_spread,liquidity_dollars,volume_24h,points,team
0,KXNCAAMBSPREAD-25NOV14ALBYURI-URI7,Rhode Island wins by over 7.5 Points?,active,2025-11-28T19:00:00-05:00,0.04,0.92,0.08,0.96,0.88,0.88,691.23,3.0,7.5,Rhode Island
1,KXNCAAMBSPREAD-25NOV14ALBYURI-URI4,Rhode Island wins by over 4.5 Points?,active,2025-11-28T19:00:00-05:00,0.12,0.97,0.03,0.88,0.85,0.85,695.80,5.0,4.5,Rhode Island
2,KXNCAAMBSPREAD-25NOV14ALBYURI-URI28,Rhode Island wins by over 28.5 Points?,active,2025-11-28T19:00:00-05:00,0.03,0.51,0.49,0.97,0.48,0.48,650.79,0.0,28.5,Rhode Island
3,KXNCAAMBSPREAD-25NOV14ALBYURI-URI25,Rhode Island wins by over 25.5 Points?,active,2025-11-28T19:00:00-05:00,0.04,0.96,0.04,0.96,0.92,0.92,643.26,125.0,25.5,Rhode Island
4,KXNCAAMBSPREAD-25NOV14ALBYURI-URI22,Rhode Island wins by over 22.5 Points?,active,2025-11-28T19:00:00-05:00,0.27,0.35,0.65,0.73,0.08,0.08,905.58,125.0,22.5,Rhode Island
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,KXNCAAMBSPREAD-25NOV14CSBMISS-MISS24,Ole Miss wins by over 24.5 Points?,active,2025-11-28T19:30:00-05:00,0.04,0.95,0.05,0.96,0.91,0.91,677.84,2185.0,24.5,Ole Miss
212,KXNCAAMBSPREAD-25NOV14CSBMISS-MISS21,Ole Miss wins by over 21.5 Points?,active,2025-11-28T19:30:00-05:00,0.03,0.97,0.03,0.97,0.94,0.94,587.05,0.0,21.5,Ole Miss
213,KXNCAAMBSPREAD-25NOV14CSBMISS-MISS18,Ole Miss wins by over 18.5 Points?,active,2025-11-28T19:30:00-05:00,0.03,0.97,0.03,0.97,0.94,0.94,587.57,0.0,18.5,Ole Miss
214,KXNCAAMBSPREAD-25NOV14CSBMISS-MISS15,Ole Miss wins by over 15.5 Points?,active,2025-11-28T19:30:00-05:00,0.03,0.97,0.03,0.97,0.94,0.94,599.92,0.0,15.5,Ole Miss


In [532]:
# Specify the columns to extract
kalshi_cols = ['ticker', 'event_start_time', 'yes_bid', 'yes_ask', 'no_bid', 'no_ask', 'home_team', 'away_team']
odds_cols = ['market', 'team', 'home_team', 'away_team', 'avg_fair_prb']

# Rename overlapping columns in odds to prevent clashes
odds_subset = odds_winners_df[odds_cols].rename(columns={
    'home_team': 'odds_home_team',
    'away_team': 'odds_away_team'
})

kalshi_subset = kalshi_winners_df[kalshi_cols]

combined_rows = []

# Loop through Kalshi rows
for _, kalshi_row in kalshi_subset.iterrows():
    kalshi_home = kalshi_row['home_team']
    for _, odds_row in odds_subset.iterrows():
        odds_home = odds_row['odds_home_team']
        if kalshi_home in odds_home:
            combined_row = pd.concat([kalshi_row, odds_row])
            combined_rows.append(combined_row)

# Create final DataFrame
combined_df = pd.DataFrame(combined_rows)
combined_df = combined_df.reset_index(drop=True)
filtered_df = combined_df[(combined_df.index % 4) .isin([1, 2])]


In [533]:
combined_df.loc[combined_df['home_team'] == 'Tulane']

Unnamed: 0,ticker,event_start_time,yes_bid,yes_ask,no_bid,no_ask,home_team,away_team,market,team,odds_home_team,odds_away_team,avg_fair_prb
60,KXNCAAMBGAME-25NOV14UNOTULN-UNO,2025-11-28T19:30:00-05:00,0.36,0.37,0.63,0.64,Tulane,New Orleans,h2h,New Orleans Privateers,Tulane Green Wave,New Orleans Privateers,0.41455
61,KXNCAAMBGAME-25NOV14UNOTULN-UNO,2025-11-28T19:30:00-05:00,0.36,0.37,0.63,0.64,Tulane,New Orleans,h2h,Tulane Green Wave,Tulane Green Wave,New Orleans Privateers,0.58545
62,KXNCAAMBGAME-25NOV14UNOTULN-TULN,2025-11-28T19:30:00-05:00,0.63,0.64,0.36,0.37,Tulane,New Orleans,h2h,New Orleans Privateers,Tulane Green Wave,New Orleans Privateers,0.41455
63,KXNCAAMBGAME-25NOV14UNOTULN-TULN,2025-11-28T19:30:00-05:00,0.63,0.64,0.36,0.37,Tulane,New Orleans,h2h,Tulane Green Wave,Tulane Green Wave,New Orleans Privateers,0.58545


In [538]:
# Specify the columns to extract
kalshi_cols = ['ticker', 'event_start_time', 'yes_bid', 'yes_ask', 'no_bid', 'no_ask', 'team', 'points']
odds_cols = ['market', 'team', 'home_team', 'away_team', 'avg_fair_prb', 'point']

# Rename overlapping columns in odds to prevent clashes
odds_subset = odds_spreads_df[odds_cols].rename(columns={
    'team': 'odds_team'
})

kalshi_subset = kalshi_spreads_df[kalshi_cols]

combined_rows = []

# Loop through Kalshi rows
for _, kalshi_row in kalshi_subset.iterrows():
    kalshi_home = kalshi_row['team']
    for _, odds_row in odds_subset.iterrows():
        odds_home = odds_row['odds_team']
        if (kalshi_home in odds_home) and (kalshi_row['points'] == odds_row['point']): #the point spread should be the same
            combined_row = pd.concat([kalshi_row, odds_row])
            combined_rows.append(combined_row)

# Create final DataFrame
combined_df = pd.DataFrame(combined_rows)
combined_df = combined_df.reset_index(drop=True)
filtered_df = combined_df[(combined_df.index % 4) .isin([1, 2])]
