In [238]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import regex as re
import math
from collections import defaultdict

from rapidfuzz.fuzz import ratio

things to look at: how accurate is the prematch probability when it ends (if a team has over 50% chance to win, how often do they actually win the game?)

if my fair probability is wildely different than the midprice, should I weigh it less or find a new fair probability? How often is my direction right?

need more data from other sportsbooks to calculate fair probability for spread markets 

In [239]:
date = '2025-11-22'
odds_sport = 'nba' #cbb, cfb, nba, nfl
kalshi_sport = 'nba' #ncaab, ncaaf, nba, nfl

In [240]:
odds_df = pd.read_csv(f"../data_collection/updated_scripts/oddsapi_outputs/{date}/{odds_sport}_odds.csv")
odds_df.drop(columns=['league'], inplace=True)
odds_df.rename(columns={'price': 'odds'}, inplace=True)
odds_df['vig_prob'] = 1 / odds_df['odds']

def remove_vig_probs(df):
    df = df.copy()
    df['fair_prb'] = pd.NA

    grouped = df.groupby(['game_id', 'bookmaker', 'market'])

    for _, group in grouped:
        if len(group) < 2:
            continue
        probs = group['vig_prob']
        total = probs.sum()
        if total == 0:
            continue
        fair_probs = (probs / total).round(4)
        df.loc[group.index, 'fair_prb'] = fair_probs

    return df

odds_df = remove_vig_probs(odds_df)


odds_winners_df = odds_df[odds_df['market'] == 'h2h'].copy()
odds_spreads_df = odds_df[odds_df['market'] == 'spreads'].copy()
odds_spreads_df = odds_spreads_df.loc[(odds_spreads_df['point'].notna()) & (odds_spreads_df['point'] > 0)]
odds_totals_df  = odds_df[odds_df['market'] == 'totals'].copy()

# Average per-team fair probabilities across DraftKings/FanDuel/Pinnacle for winners_df
mask = odds_winners_df['fair_prb'].notna()
avg_by_team = (
    odds_winners_df.loc[mask]
    .groupby(['game_id', 'team'])['fair_prb']
    .transform('mean')
    .round(4)
)
odds_winners_df.loc[mask, 'avg_fair_prb'] = avg_by_team
odds_winners_df.loc[~mask, 'avg_fair_prb'] = pd.NA

#Average fair probabilities for spreads for same game, point spread, and team
mask = odds_spreads_df['fair_prb'].notna()
avg_by_point = (
    odds_spreads_df.loc[mask]
    .groupby(['game_id', 'point', 'team'])['fair_prb']
    .transform('mean')
    .round(4)
)
odds_spreads_df['avg_fair_prb'] = avg_by_point

In [241]:
kalshi_winners_df = pd.read_csv(f"../data_collection/updated_scripts/kalshi_data_logs/{date}/{kalshi_sport}_winners.csv")
kalshi_totals_df = pd.read_csv(f"../data_collection/updated_scripts/kalshi_data_logs/{date}/{kalshi_sport}_totals.csv")
kalshi_spreads_df = pd.read_csv(f"../data_collection/updated_scripts/kalshi_data_logs/{date}/{kalshi_sport}_spreads.csv")

if kalshi_sport == 'ncaaf':
    kalshi_spreads_df['points'] = kalshi_spreads_df['title'].str.extract(r'over ([\d.]+) points\?').astype(float)
elif kalshi_sport == 'ncaab':
    kalshi_spreads_df['points'] = kalshi_spreads_df['title'].str.extract(r'over ([\d.]+) Points\?').astype(float)


columns_to_drop = ['timestamp', 'market_type', 'yes_bid2', 'yes_ask2', 'no_bid2', 'no_ask2', 'yes_depth_bids', 'yes_depth_asks', 'no_depth_bids', 'no_depth_asks']
kalshi_winners_df.drop(columns=columns_to_drop, inplace=True)
kalshi_spreads_df.drop(columns=columns_to_drop, inplace=True)
kalshi_totals_df.drop(columns=columns_to_drop, inplace=True)

In [242]:
kalshi_spreads_df.head()

Unnamed: 0,ticker,title,status,event_start_time,yes_bid,yes_ask,no_bid,no_ask,yes_spread,no_spread,liquidity_dollars,volume_24h
0,KXNBASPREAD-25NOV22SACDEN-SAC4,Sacramento wins by over 4.5 Points?,active,2025-12-06T22:00:00-05:00,0.12,0.18,0.82,0.88,0.06,0.06,7346.2,0.0
1,KXNBASPREAD-25NOV22SACDEN-SAC1,Sacramento wins by over 1.5 Points?,active,2025-12-06T22:00:00-05:00,0.14,0.22,0.78,0.86,0.08,0.08,6502.75,0.0
2,KXNBASPREAD-25NOV22SACDEN-DEN8,Denver wins by over 8.5 Points?,active,2025-12-06T22:00:00-05:00,0.59,0.6,0.4,0.41,0.01,0.01,24108.78,3430.0
3,KXNBASPREAD-25NOV22SACDEN-DEN5,Denver wins by over 5.5 Points?,active,2025-12-06T22:00:00-05:00,0.66,0.69,0.31,0.34,0.03,0.03,6151.95,103.0
4,KXNBASPREAD-25NOV22SACDEN-DEN26,Denver wins by over 26.5 Points?,active,2025-12-06T22:00:00-05:00,0.12,0.18,0.82,0.88,0.06,0.06,7353.6,0.0


In [243]:
#get names from kalshi_winners_df
def extract_teams_from_winners(title):
    title = title.replace(" Winner?", "")
    if " at " in title:
        right, left = title.split(" at ", 1)
    elif " vs " in title:
        right, left = title.split(" vs ", 1)
    else:
        return pd.Series([None, None])  
    left = re.sub(r'\bSt\.$', 'St', left.strip())
    right = re.sub(r'\bSt\.$', 'St', right.strip())
    return pd.Series([left, right])

kalshi_winners_df[['home_team', 'away_team']] = kalshi_winners_df['title'].apply(extract_teams_from_winners)
unique_rows = kalshi_winners_df.drop_duplicates(subset=['home_team', 'away_team'])
flat_teams = pd.unique(unique_rows[['home_team', 'away_team']].values.ravel())
kalshi_winners_teams = flat_teams.tolist()

#get names from kalshi_totals_df
def extract_team_from_totals(title):
    title = title.replace(": Total Points", "")
    if " at " in title:
        left = title.split(" at ", 1)[0].strip()
        left = re.sub(r'\bSt\.$', 'St', left)
        return left
    return None

kalshi_totals_df['away_team'] = kalshi_totals_df['title'].apply(extract_team_from_totals)
kalshi_totals_teams = kalshi_totals_df['away_team'].dropna().drop_duplicates().tolist()

#get names from kalshi_spreads_df
def extract_team_from_spreads(title):
    if " wins by " in title:
        team = title.split(" wins by ", 1)[0].strip()
        team = re.sub(r'\bSt\.$', 'St', team)
        return team
    return None

kalshi_spreads_df['team'] = kalshi_spreads_df['title'].apply(extract_team_from_spreads)
unique_teams_spread = kalshi_spreads_df['team'].drop_duplicates()
kalshi_spreads_teams = unique_teams_spread.tolist()

In [244]:
kalshi_winners_teams

['Dallas',
 'Memphis',
 'Denver',
 'Sacramento',
 'Chicago',
 'Washington',
 'Milwaukee',
 'Detroit',
 'New Orleans',
 'Atlanta',
 'Orlando',
 'New York K']

In [245]:
odds_teams_by_market = odds_df.groupby('market')['team'].unique().to_dict()

def fuzzy_match_kalshi_to_odds(kalshi_teams, odds_team_names):
    matched_kalshi = []
    matched_odds = []
    candidates_dict = defaultdict(list)

    kalshi_sorted = sorted(kalshi_teams, key=lambda x: x[0] if x else '')
    remaining_odds = sorted(odds_team_names.tolist().copy(), reverse=True)

    for kalshi_name in kalshi_sorted:
        candidates = []
        for odds_name in remaining_odds:
            if kalshi_name in odds_name:
                candidates.append(odds_name)
        if len(candidates) == 1:
            candidates_dict[candidates[0]].append(kalshi_name)
        elif len(candidates) > 1:
            best_fit = candidates[0]
            best_ratio = ratio(best_fit, kalshi_name)
            for name in candidates:
                curr_ratio = ratio(name, kalshi_name)
                if curr_ratio > best_ratio:
                    best_fit = name
                    best_ratio = curr_ratio
            candidates_dict[best_fit].append(kalshi_name)
    
    for odd, kalsh in candidates_dict.items():
        best_fit = kalsh[0]
        best_ratio = ratio(best_fit, odd)
        if len(kalsh) > 1:
            for name in kalsh:
                curr_ratio = ratio(name, odd)
                if curr_ratio > best_ratio:
                    best_fit = name
                    best_ratio = curr_ratio
        matched_odds.append(odd)
        matched_kalshi.append(best_fit)


    return matched_kalshi, matched_odds


# Winners / h2h
matched_kalshi_h2h, matched_odds_h2h = fuzzy_match_kalshi_to_odds(
    kalshi_winners_teams,
    odds_teams_by_market.get('h2h', [])
)

# Spreads
matched_kalshi_spreads, matched_odds_spreads = fuzzy_match_kalshi_to_odds(
    kalshi_spreads_teams,
    odds_teams_by_market.get('spreads', [])
)

# Totals (match only Over/Under)
totals_odds_df = odds_df[odds_df['market'] == 'totals']
odds_totals_teams = pd.unique(totals_odds_df[['home_team', 'away_team']].values.ravel())
matched_kalshi_totals, matched_odds_totals = fuzzy_match_kalshi_to_odds(
    kalshi_totals_teams,
    odds_totals_teams
)

matched_names = {
    'h2h': {
        'kalshi': matched_kalshi_h2h,
        'odds': matched_odds_h2h
    },
    'spreads': {
        'kalshi': matched_kalshi_spreads,
        'odds': matched_odds_spreads
    },
    'totals': {
        'kalshi': matched_kalshi_totals,
        'odds': matched_odds_totals
    }
}


In [246]:
assert(len(matched_names['h2h']['kalshi']) == len(matched_names['h2h']['odds']))
assert(len(matched_names['spreads']['kalshi']) == len(matched_names['spreads']['odds']))
assert(len(matched_names['totals']['kalshi']) == len(matched_names['totals']['odds']))

In [247]:
odds_winners_df = odds_winners_df[
    odds_winners_df['home_team'].isin(matched_names['h2h']['odds']) |
    odds_winners_df['away_team'].isin(matched_names['h2h']['odds'])
].drop_duplicates(subset='team').reset_index(drop=True)

kalshi_winners_df = kalshi_winners_df[
    kalshi_winners_df['home_team'].isin(matched_names['h2h']['kalshi']) |
    kalshi_winners_df['away_team'].isin(matched_names['h2h']['kalshi'])
].reset_index(drop=True)

odds_spreads_df = odds_spreads_df[odds_spreads_df['team'].isin(matched_names['spreads']['odds'])].reset_index(drop=True)
kalshi_spreads_df = kalshi_spreads_df[kalshi_spreads_df['team'].isin(matched_names['spreads']['kalshi'])].reset_index(drop=True)

odds_totals_df = odds_totals_df[
    odds_totals_df['home_team'].isin(matched_names['totals']['odds']) |
    odds_totals_df['away_team'].isin(matched_names['totals']['odds'])
].reset_index(drop=True)
kalshi_totals_df = kalshi_totals_df[kalshi_totals_df['away_team'].isin(matched_names['totals']['kalshi'])].reset_index(drop=True)


In [248]:
# Specify the columns to extract
kalshi_cols = ['ticker', 'yes_bid', 'yes_ask', 'home_team', 'away_team']
odds_cols = ['market', 'start_time', 'team', 'home_team', 'away_team', 'avg_fair_prb']

# Rename overlapping columns in odds to prevent clashes
odds_subset = odds_winners_df[odds_cols].rename(columns={
    'home_team': 'odds_home_team',
    'away_team': 'odds_away_team'
})

kalshi_subset = kalshi_winners_df[kalshi_cols].rename(columns={
    'home_team': 'kalshi_home_team',
    'away_team': 'kalshi_away_team'
})

seen_tickers = set()
seen_rows = {}
combined_rows = []

# Loop through Kalshi rows
for _, kalshi_row in kalshi_subset.iterrows():
    kalshi_home = kalshi_row['kalshi_home_team']
    for _, odds_row in odds_subset.iterrows():
        odds_home = odds_row['odds_home_team']
        if kalshi_home in odds_home:
            ticker = kalshi_row['ticker']
            if ticker not in seen_tickers:
                seen_tickers.add(ticker)
                seen_rows[ticker] = odds_row
                continue
            else:
                curr_prb = odds_row['avg_fair_prb']
                prev_prb = seen_rows[ticker]['avg_fair_prb']
                midpoint = (kalshi_row['yes_bid'] + kalshi_row['yes_ask']) / 2
                if ((curr_prb - midpoint) ** 2) < ((prev_prb - midpoint) ** 2):
                    combined_row = pd.concat([kalshi_row, odds_row])
                else:
                    combined_row = pd.concat([kalshi_row, seen_rows[ticker]])
                combined_rows.append(combined_row)
                break

combined_winners_df = pd.DataFrame(combined_rows)
combined_winners_df = combined_winners_df.reset_index(drop=True)

In [249]:
combined_winners_df

Unnamed: 0,ticker,yes_bid,yes_ask,kalshi_home_team,kalshi_away_team,market,start_time,team,odds_home_team,odds_away_team,avg_fair_prb
0,KXNBAGAME-25NOV22MEMDAL-MEM,0.48,0.49,Dallas,Memphis,h2h,2025-11-22 19:40:00 CST,Memphis Grizzlies,Dallas Mavericks,Memphis Grizzlies,0.490467
1,KXNBAGAME-25NOV22MEMDAL-DAL,0.5,0.52,Dallas,Memphis,h2h,2025-11-22 19:40:00 CST,Dallas Mavericks,Dallas Mavericks,Memphis Grizzlies,0.509533
2,KXNBAGAME-25NOV22SACDEN-SAC,0.17,0.18,Denver,Sacramento,h2h,2025-11-22 21:10:00 CST,Sacramento Kings,Denver Nuggets,Sacramento Kings,0.1903
3,KXNBAGAME-25NOV22SACDEN-DEN,0.82,0.83,Denver,Sacramento,h2h,2025-11-22 21:10:00 CST,Denver Nuggets,Denver Nuggets,Sacramento Kings,0.8097
4,KXNBAGAME-25NOV22WASCHI-WAS,0.14,0.15,Chicago,Washington,h2h,2025-11-22 19:10:00 CST,Washington Wizards,Chicago Bulls,Washington Wizards,0.164933
5,KXNBAGAME-25NOV22WASCHI-CHI,0.85,0.86,Chicago,Washington,h2h,2025-11-22 19:10:00 CST,Chicago Bulls,Chicago Bulls,Washington Wizards,0.835067
6,KXNBAGAME-25NOV22DETMIL-MIL,0.24,0.25,Milwaukee,Detroit,h2h,2025-11-22 19:10:00 CST,Milwaukee Bucks,Milwaukee Bucks,Detroit Pistons,0.2568
7,KXNBAGAME-25NOV22DETMIL-DET,0.75,0.76,Milwaukee,Detroit,h2h,2025-11-22 19:10:00 CST,Detroit Pistons,Milwaukee Bucks,Detroit Pistons,0.7432
8,KXNBAGAME-25NOV22ATLNOP-NOP,0.2,0.21,New Orleans,Atlanta,h2h,2025-11-22 18:10:00 CST,New Orleans Pelicans,New Orleans Pelicans,Atlanta Hawks,0.216567
9,KXNBAGAME-25NOV22ATLNOP-ATL,0.79,0.81,New Orleans,Atlanta,h2h,2025-11-22 18:10:00 CST,Atlanta Hawks,New Orleans Pelicans,Atlanta Hawks,0.783433


In [250]:
EDGE = 0.01
KELLY_UPPERBOUND = 0.25
BANKROLL = 300.00


edge_winners_df = combined_winners_df.loc[(combined_winners_df['avg_fair_prb'] >= combined_winners_df['yes_ask'] + EDGE) |
                        (combined_winners_df['avg_fair_prb'] <= combined_winners_df['yes_bid'] - EDGE)].reset_index(drop=True)

midprice = (edge_winners_df['yes_bid'] + edge_winners_df['yes_ask']) / 2

edge_winners_df['raw_kelly'] = (
    (edge_winners_df['avg_fair_prb'] - midprice) /
    (1 - midprice)
).clip(lower=0)

total_kelly = edge_winners_df['raw_kelly'].sum()
edge_winners_df['raw_kelly'] = pd.DataFrame({
    'original': edge_winners_df['raw_kelly'],
    'normalized': (edge_winners_df['raw_kelly'] / total_kelly).round(2)
}).min(axis=1)

# Define the real_kelly logic
def scale_kelly(row):
    k = row['raw_kelly']
    p = row['avg_fair_prb']
    
    if k == 0 or pd.isna(k):
        return 0
    if 0.1 <= p < 0.25:
        return min(0.25 * k, KELLY_UPPERBOUND)
    elif 0.25 <= p < 0.5:
        return min(0.5 * k, KELLY_UPPERBOUND)
    elif 0.5 <= p < 0.75:
        return min(0.75 * k, KELLY_UPPERBOUND)
    elif 0.75 <= p < 0.9:
        return min(k, KELLY_UPPERBOUND)
    else:
        return 0  # fallback if out of range

# Apply to the DataFrame
edge_winners_df['real_kelly'] = edge_winners_df.apply(scale_kelly, axis=1).round(2)
edge_winners_df['optimal_bet'] = edge_winners_df['real_kelly'] * BANKROLL

num_contracts = edge_winners_df['optimal_bet'] // edge_winners_df['yes_bid']
edge_winners_df['num_contracts'] = num_contracts
trading_cost = np.ceil(0.0175 * num_contracts * edge_winners_df['yes_bid'] * (1 - edge_winners_df['yes_bid']))
edge_winners_df['trading_cost'] = trading_cost
profit = (1 - edge_winners_df['yes_bid']) *  num_contracts - trading_cost
edge_winners_df['profit'] = profit
edge_winners_df['ev'] = (profit * edge_winners_df['avg_fair_prb'] - edge_winners_df['optimal_bet'] * (1 - edge_winners_df['avg_fair_prb'])).round(2)
filtered_winners_df = edge_winners_df.loc[edge_winners_df['ev'] > 0].reset_index(drop=True)

In [251]:
filtered_winners_df

Unnamed: 0,ticker,yes_bid,yes_ask,kalshi_home_team,kalshi_away_team,market,start_time,team,odds_home_team,odds_away_team,avg_fair_prb,raw_kelly,real_kelly,optimal_bet,num_contracts,trading_cost,profit,ev
0,KXNBAGAME-25NOV22WASCHI-WAS,0.14,0.15,Chicago,Washington,h2h,2025-11-22 19:10:00 CST,Washington Wizards,Chicago Bulls,Washington Wizards,0.164933,0.023314,0.01,3.0,21.0,1.0,17.06,0.31


In [252]:
total_loss = np.sum(filtered_winners_df['optimal_bet'])
total_profit = np.sum(filtered_winners_df['profit'])
total_ev = np.sum(filtered_winners_df['ev'])
print(f"{odds_sport} h2h portfolio summary:\n")
print(f"Max Loss: -{total_loss:.2f}")
print(f"Max Profit: {total_profit:.2f}")
print(f"Portfolio EV: {total_ev:.2f}")


nba h2h portfolio summary:

Max Loss: -3.00
Max Profit: 17.06
Portfolio EV: 0.31


In [253]:
kalshi_cols = ['ticker', 'yes_bid', 'yes_ask', 'team', 'points']
odds_cols = ['market', 'start_time', 'team', 'home_team', 'away_team', 'avg_fair_prb', 'point']

odds_subset = odds_spreads_df[odds_cols].rename(columns={
    'home_team': 'odds_home_team',
    'away_team': 'odds_away_team',
    'team': 'odds_team'
})

kalshi_subset = kalshi_spreads_df[kalshi_cols]

combined_rows = []

for _, kalshi_row in kalshi_subset.iterrows():
    kalshi_home = kalshi_row['team']
    for _, odds_row in odds_subset.iterrows():
        odds_home = odds_row['odds_team']
        if (kalshi_home in odds_home) and (kalshi_row['points'] == odds_row['point']): 
            combined_row = pd.concat([kalshi_row, odds_row])
            combined_rows.append(combined_row)

combined_spreads_df = pd.DataFrame(combined_rows).drop_duplicates(subset='ticker') #only works because oddsapi only pulls odds 
combined_spreads_df = combined_spreads_df.reset_index(drop=True)                    #for only one point line for each bookmaker 



KeyError: "['points'] not in index"

In [None]:
combined_spreads_df

Unnamed: 0,ticker,yes_bid,yes_ask,team,points,market,start_time,odds_team,odds_home_team,odds_away_team,avg_fair_prb,point
0,KXNCAAFSPREAD-25NOV22UNMAFA-UNM14,0.17,0.26,New Mexico,14.5,spreads,2025-11-22 14:03:02 CST,New Mexico State Aggies,UTEP Miners,New Mexico State Aggies,0.5266,14.5
1,KXNCAAFSPREAD-25NOV22UNMAFA-UNM13,0.2,0.29,New Mexico,13.5,spreads,2025-11-22 14:03:02 CST,New Mexico State Aggies,UTEP Miners,New Mexico State Aggies,0.5,13.5
2,KXNCAAFSPREAD-25NOV22UNMAFA-AFA3,0.26,0.35,Air Force,3.5,spreads,2025-11-22 18:00:00 CST,Air Force Falcons,Air Force Falcons,New Mexico Lobos,0.503867,3.5
3,KXNCAAFSPREAD-25NOV22JVSTFIU-JVST6,0.16,0.47,Jacksonville St,6.5,spreads,2025-11-22 14:33:17 CST,Jacksonville State Gamecocks,Florida International Panthers,Jacksonville State Gamecocks,0.484,6.5
4,KXNCAAFSPREAD-25NOV22JVSTFIU-JVST3,0.22,0.99,Jacksonville St,3.5,spreads,2025-11-22 14:33:17 CST,Jacksonville State Gamecocks,Florida International Panthers,Jacksonville State Gamecocks,0.5107,3.5
5,KXNCAAFSPREAD-25NOV22USUFRES-USU2,0.36,0.42,Utah St,2.5,spreads,2025-11-22 21:30:00 CST,Utah State Aggies,Fresno State Bulldogs,Utah State Aggies,0.4883,2.5
6,KXNCAAFSPREAD-25NOV22ASUCOLO-COLO6,0.13,0.23,Colorado,6.5,spreads,2025-11-22 19:00:00 CST,Colorado Buffaloes,Colorado Buffaloes,Arizona State Sun Devils,0.4856,6.5
7,KXNCAAFSPREAD-25NOV22BYUCIN-CIN2,0.36,0.4,Cincinnati,2.5,spreads,2025-11-22 19:00:00 CST,Cincinnati Bearcats,Cincinnati Bearcats,BYU Cougars,0.4892,2.5
8,KXNCAAFSPREAD-25NOV22TENNFLA-FLA3,0.29,0.35,Florida,3.5,spreads,2025-11-22 18:30:00 CST,Florida Gators,Florida Gators,Tennessee Volunteers,0.51815,3.5
9,KXNCAAFSPREAD-25NOV22CALSTAN-STAN3,0.24,0.33,Stanford,3.5,spreads,2025-11-22 18:30:00 CST,Stanford Cardinal,Stanford Cardinal,California Golden Bears,0.48505,3.5
