In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pathlib
import glob
import os
import sys
import re
sys.path.append('../nba-algo/')
from data_collection import get_season_lineups
from data_collection import match_past_raptor
from data_collection import isLatin
from unidecode import unidecode
import datetime
import math, statistics

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexstratton2121/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
def create_sequence(df, team):
    # Uses final_updated.csv
    team_df = df.loc[(df["Team"] == team) | (df["XXTeam"] == team)]
    cur = []
    for i, row in team_df.iterrows():
        if ((row["Team"] == team) & (row["Temp"] > row["XXTemp"])):
            cur.append(1)
        elif ((row["XXTeam"] == team) & (row["XXTemp"] > row["Temp"])):
            cur.append(1)
        else:
            cur.append(0)
        
    return cur

In [27]:
def get_runs(sequence):
    idx = 0
    run_count = 1
    while idx < (len(sequence) - 1):
        if sequence[idx] != sequence[idx+1]:
            run_count += 1
        idx += 1

    return run_count

In [20]:
def calc_var(df, cur_idx, home_away, stat):
    # home_away == "Team" or "XXTeam"
    team = df.iloc[cur_idx][home_away]
    # team_df = df.iloc[:cur_idx].loc[(df.iloc[:cur_idx]["Team"] == team) | (df.iloc[:cur_idx]["XXTeam"] == team)]
    team_df = df.iloc[:cur_idx].loc[df.iloc[:cur_idx][home_away] == team]
    if len(team_df) < 2:
        return None
    
    return statistics.variance(team_df[stat])

In [44]:
def ww_variance(win_count: int, loss_count: int) -> float:
    numerator = (2 * win_count * loss_count) * (
        ((2 * win_count * loss_count) - win_count - loss_count))
    denominator = ((win_count + loss_count) ** 2) * (win_count + loss_count - 1)

    try:
        return numerator / denominator
    except ZeroDivisionError:
        return -1

In [52]:
def ww_test(df, cur_idx, home_away):
    # home_away == "Team" or "XXTeam"
    sequence = create_sequence(df.iloc[:cur_idx], df.iloc[cur_idx][home_away])
    run_count = get_runs(sequence)
    win_count = sequence.count(1)
    loss_count = len(sequence) - win_count
    try:
        mean = ((2 * win_count * loss_count) / len(sequence)) + 1
    except ZeroDivisionError:
        return None
    
    variance = ww_variance(win_count, loss_count)
    if variance <= 0:
        return None
    
    z_score = (run_count - mean) / math.sqrt(variance)
    
    return z_score

In [128]:
# This function gives a ranking for a team's offense and defense when they are home and when they are away.

def stat_ranking(df, index, stat_type, season, off_def, home_away):
    column = list(df.columns).index(home_away)
    data = df[df['Season'] == season]
    data.reset_index(inplace=True, drop=True)
    team = data.iloc[index, column]
    previous_rows = data.iloc[:index]
    
    if off_def == 'off':
        reverse_val = True
    elif off_def == 'def':
        reverse_val = False
        
    averages = sorted([(item[stat_type].mean(), item[home_away].iloc[0]) for key, item in previous_rows.groupby([home_away])],
                      key=lambda x: x[0], reverse=reverse_val)
    try:
        return int([x[1] for x in averages].index(team) + 1)
    except:
        return None

In [4]:
# This function gives a ranking for a team's offense and defense including both home and away games.

def total_ranking(df, index, season, off_def, home_away):
    data = df[df['Season'] == season]
    data.reset_index(inplace=True, drop=True)
    previous_rows = data.iloc[:index]
    team_stats = [dict(stats=previous_rows[(previous_rows['HomeTeam'] == team) | (previous_rows['XXAwayTeam'] == team)],
                       team_name=team) for team in teams]
    current_team = data.iloc[index, home_away]
    
    if off_def == 'off':
        reverse_val = True
        stat_1, stat_2 = 'Points', 'XXPoints'
    elif off_def == 'def':
        reverse_val = False
        stat_1, stat_2 = 'XXPoints', 'Points'
      
    averages = []
    for club in team_stats:
        total_pts = 0
        for idx, row in club['stats'].iterrows():
            if row['HomeTeam'] == club['team_name']:
                total_pts += row[stat_1]
            elif row['XXAwayTeam'] == club['team_name']:
                total_pts += row[stat_2]
        
        try: 
            averages.append(dict(avg=total_pts/len(club['stats']), team_name=club['team_name']))
        except:
            return None
        
    averages = sorted(averages, key=lambda x: x['avg'], reverse=reverse_val)
    
    return [i for i, v in enumerate(averages) if current_team in v.values()][0] + 1

In [5]:
def sched_stren(df, index, home_away):
    team = df.iloc[index, home_away]
    subset = df.iloc[:index]
    subset = subset[(subset['Team'] == team) | (subset['XXTeam'] == team)]
    # The next line gets all the games the team played against a top 10 defensive team and how many points they scored in those games.
    try:
        pts_for = pd.concat([subset[(subset['XXTotalDefRank'] <= 10) & (subset['Team'] == team)]['Temp'], subset[(subset['TotalDefRank'] <= 10) & (subset['XXTeam'] == team)]['XXTemp']])
        avg_pts_for = pts_for.mean()
        games_for = subset.loc[pts_for.index.values]
        win_pct_for = len(games_for[((games_for['Team'] == team) & (games_for['W_L'] == 1)) | ((games_for['XXTeam'] == team) & (games_for['XXW_L'] == 1))]) / len(games_for)
        pts_against = pd.concat([subset[(subset['XXTotalOffRank'] <= 10) & (subset['Team'] == team)]['Temp'], subset[(subset['TotalOffRank'] <= 10) & (subset['XXTeam'] == team)]['XXTemp']])
        avg_pts_against = pts_against.mean()
        games_against = subset.loc[pts_against.index.values]
        win_pct_against = len(games_against[((games_against['Team'] == team) & (games_against['W_L'] == 1)) | ((games_against['XXTeam'] == team) & (games_against['XXW_L'] == 1))]) / len(games_against)
        
        return [avg_pts_for, win_pct_for, avg_pts_against, win_pct_against]
    
    except:
        return [None, None, None, None]

In [6]:
def win_pct(df, index, home_away, quantile):
    team = df.iloc[index, home_away]
    subset = df.iloc[:index]
    try:
        if home_away == 4:
            opponent = df.iloc[index, 129]
            games = subset[(subset['Team'] == team) | (subset['XXTeam'] == team)]
            win_pct = len(games[(games['Team'] == team) & (games['W_L'] == 1)]) / len(games[games['Team'] == team])
            opp_games = subset[(subset['Team'] == opponent) | (subset['XXTeam'] == opponent)]
            
        elif home_away == 129:
            opponent = df.iloc[index, 4]
            games = subset[(subset['Team'] == team) | (subset['XXTeam'] == team)]
            win_pct = len(games[(games['XXTeam'] == team) & (games['XXW_L'] == 1)]) / len(games[games['XXTeam'] == team])
            opp_games = subset[(subset['Team'] == opponent) | (subset['XXTeam'] == opponent)]
            
        pt_dif_vs_opp = opp_games[opp_games['Team'] == team]['Temp'].sum() + opp_games[opp_games['XXTeam'] == team]['XXTemp'].sum() - games[games['Team'] == opponent]['Temp'].sum() - games[games['XXTeam'] == opponent]['XXTemp'].sum()
        l5 = games.iloc[-5:]
        away_games_l5 = len(l5[l5['XXTeam'] == team])
        pt_dif_vs_topx = games[(games['Team'] == team) & (games['XXElo'] >= subset['XXElo'].quantile(quantile))]['Temp'].sum() + games[(games['XXTeam'] == team) & (games['Elo'] >= subset['Elo'].quantile(quantile))]['XXTemp'].sum() - games[(games['Team'] == team) & (games['XXElo'] >= subset['XXElo'].quantile(quantile))]['XXTemp'].sum() -  games[(games['XXTeam'] == team) & (games['Elo'] >= subset['Elo'].quantile(quantile))]['Temp'].sum()
    
        return [win_pct, pt_dif_vs_opp, away_games_l5, pt_dif_vs_topx]
    except:
        return [None, None, None, None]

In [30]:
data_by_season[0]["Date"] = [datetime.datetime.strptime(x, "%Y-%m-%d") for x in data_by_season[0]["Date"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_by_season[0]["Date"] = [datetime.datetime.strptime(x, "%Y-%m-%d") for x in data_by_season[0]["Date"]]


In [108]:
def three_rating(row, home_away):
    if home_away == 1:
        fg3_val = row["NonHeaveFg3Pct"] * ((row["FG3A"] - row["HeaveAttempts"] + abs(std_val)) - avg_3_count)
        try:
            dist_val = 1 / (row["Avg3ptShotDistance"] * .10)
        except ZeroDivisionError:
            dist_val = 0
            
        c3_val = row["Corner3PctAssisted"] * row["Corner3FGA"]
    elif home_away == 0:
        fg3_val = row["XXNonHeaveFg3Pct"] * ((row["XXFG3A"] - row["XXHeaveAttempts"] + abs(std_val)) - avg_3_count)
        try:
            dist_val = 1 / (row["XXAvg3ptShotDistance"] * .10)
        except ZeroDivisionError:
            dist_val = 0
        c3_val = row["XXCorner3PctAssisted"] * row["XXCorner3FGA"]

    return fg3_val + dist_val + (c3_val * .5)

In [46]:
data_by_season[0]

Unnamed: 0,Date,GameID,Season,HomeAway,HomeTeam,OffPoss,Points,FG2M,FG2A,Fg2Pct,...,XXLostBallTurnovers,XXLostBallOutOfBoundsTurnovers,XXBadPassTurnovers,XXBadPassOutOfBoundsTurnovers,XXTravels,XX3SecondViolations,XXStepOutOfBoundsTurnovers,XXOffensiveGoaltends,3Rating,XX3Rating
15585,2013-10-29,2013-10-29CHIvMIA,13-14,1,MIA,97,107,26,52,0.500000,...,5,1,5,1,1,0,1,0,25.108497,15.883651
15586,2013-10-29,2013-10-29INDvORL,13-14,1,IND,94,97,27,54,0.500000,...,2,0,2,2,5,0,0,0,15.592066,20.044453
15587,2013-10-29,2013-10-29LACvLAL,13-14,1,LAL,99,116,28,64,0.437500,...,4,1,4,3,1,0,1,0,31.190257,18.792450
15588,2013-10-30,2013-10-30ATLvDAL,13-14,1,DAL,103,118,33,53,0.622642,...,2,2,5,2,2,0,1,0,24.510467,18.613650
15589,2013-10-30,2013-10-30BKNvCLE,13-14,1,CLE,92,98,30,69,0.434783,...,3,0,4,2,2,1,0,0,11.267557,20.676176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,2014-04-16,2014-04-16LALvSAS,13-14,1,SAS,102,100,31,69,0.449275,...,0,1,3,0,0,0,0,0,23.208831,24.702685
16811,2014-04-16,2014-04-16MIAvPHI,13-14,1,MIA,93,87,20,43,0.465116,...,3,4,5,1,1,1,0,0,23.798195,22.394850
16812,2014-04-16,2014-04-16MINvUTA,13-14,1,MIN,117,130,32,72,0.444444,...,6,1,6,0,0,0,1,0,18.408230,14.608511
16813,2014-04-16,2014-04-16NYKvTOR,13-14,1,NYK,93,95,29,64,0.453125,...,6,3,6,2,2,0,1,0,17.603226,19.308854


In [113]:
def fg3_ranking(df, index, season, off_def, home_away):
    data = df[df['Season'] == season]
    data.reset_index(inplace=True, drop=True)
    previous_rows = data.iloc[:index]
    team_stats = [dict(stats=previous_rows[(previous_rows['HomeTeam'] == team) | (previous_rows['XXAwayTeam'] == team)],
                       team_name=team) for team in teams]
    current_team = data.iloc[index, home_away]
    
    if off_def == 'off':
        reverse_val = True
        stat_1, stat_2 = "Fg3Rating", "XXFg3Rating"
    elif off_def == 'def':
        reverse_val = False
        stat_1, stat_2 = "XXFg3Rating", "Fg3Rating"
      
    averages = []
    for club in team_stats:
        total_rating = 0
        for idx, row in club['stats'].iterrows():
            if row['HomeTeam'] == club['team_name']:
                total_rating += row[stat_1]
            elif row['XXAwayTeam'] == club['team_name']:
                total_rating += row[stat_2]
        
        try: 
            averages.append(dict(avg=total_rating/len(club['stats']), team_name=club['team_name']))
        except:
            return None
        
    averages = sorted(averages, key=lambda x: x['avg'], reverse=reverse_val)
    
    return [i for i, v in enumerate(averages) if current_team in v.values()][0] + 1

In [133]:
def fg3_stren(df, index, home_away):
    team = df.iloc[index, home_away]
    subset = df.iloc[:index]
    subset = subset[(subset['Team'] == team) | (subset['XXTeam'] == team)]
    # The next line gets all the games the team played against a top 10 defensive team and how many points they scored in those games.
    try:
        pts_for = pd.concat([subset[(subset['XXFg3DefRank'] <= 10) & (subset['Team'] == team)]['Temp'], subset[(subset['Fg3DefRank'] <= 10) & (subset['XXTeam'] == team)]['XXTemp']])
        avg_pts_for = pts_for.mean()
        games_for = subset.loc[pts_for.index.values]
        win_pct_for = len(games_for[((games_for['Team'] == team) & (games_for['W_L'] == 1)) | ((games_for['XXTeam'] == team) & (games_for['XXW_L'] == 1))]) / len(games_for)
        pts_against = pd.concat([subset[(subset['XXFg3OffRank'] <= 10) & (subset['Team'] == team)]['Temp'], subset[(subset['Fg3OffRank'] <= 10) & (subset['XXTeam'] == team)]['XXTemp']])
        avg_pts_against = pts_against.mean()
        games_against = subset.loc[pts_against.index.values]
        win_pct_against = len(games_against[((games_against['Team'] == team) & (games_against['W_L'] == 1)) | ((games_against['XXTeam'] == team) & (games_against['XXW_L'] == 1))]) / len(games_against)
        
        return [avg_pts_for, win_pct_for, avg_pts_against, win_pct_against]
    
    except:
        return [None, None, None, None]

In [246]:
def back2back(df, home_away):
    if home_away == "Team":
        col = -2
    else:
        col = -1
    vals = [0 for _ in range(len(df))]
    for i in range(1, len(df)):
        team = df.iloc[i][home_away]
        date = df.iloc[i]["Date"]
        idx = i - 1
        while idx != 0:
            if df.iloc[idx]["Team"] == team or df.iloc[idx]["XXTeam"] == team:
                  break
                    
            idx -= 1
            
        if (date - df.iloc[idx]["Date"]).days == 1 and idx != 0:
                vals[i] = 1
                
    return vals

In [123]:
data = pd.read_csv(str(Path.cwd()) + '/cleanData.csv')
data["Fg3Rating"] = [three_rating(row, 1) for _, row in data.iterrows()]
data["XXFg3Rating"] = [three_rating(row, 0) for _, row in data.iterrows()]
teams = data.loc[:, 'HomeTeam'].unique()
seasons = ['13-14', '14-15', '15-16', '16-17', '17-18', '18-19', '19-20', '20-21', '21-22']

In [158]:
subset = data.iloc[15585:, :]
avg_3_count = ((subset["FG3A"].mean() + subset["XXFG3A"].mean()) / 2) - ((subset["HeaveAttempts"].mean() + subset["XXHeaveAttempts"].mean()) / 2)
std_val = min([min([row["FG3A"] - row["HeaveAttempts"], row["XXFG3A"] - row["XXHeaveAttempts"]]) - avg_3_count for _, row in data.iterrows()])

In [294]:
final = pd.read_csv(str(Path.cwd()) + '/averagesFinal.csv')
final = final.assign(Temp=data['Points'], XXTemp=data['XXPoints'])
final["Date"] = [datetime.datetime.strptime(x, "%Y-%m-%d") for x in final["Date"]]
final["Fg3Rating"] = [three_rating(row, 1) for _, row in final.iterrows()]
final["XXFg3Rating"] = [three_rating(row, 0) for _, row in final.iterrows()]
final['Order'] = final.index

In [295]:
elev_data = pd.read_excel("nba_elevation.xlsx")
final = pd.merge(final, elev_data, how="inner", on="Team")
final = final.sort_values(by="Order")
final.set_index("Order", drop=True, inplace=True)
elev_dif = []
for i, row in final.iterrows():
    home_elev = elev_data[elev_data["Team"] == row["Team"]]["Elevation"].item()
    away_elev = elev_data[elev_data["Team"] == row["XXTeam"]]["Elevation"].item()
    elev_dif.append(home_elev - away_elev)

final = final.assign(ElevDif=elev_dif)
final.drop("Elevation", axis=1, inplace=True)

In [290]:
elev_data[elev_data["Team"] == home]["Elevation"].item()

4265

In [296]:
final

Unnamed: 0_level_0,Date,GameID,Season,H/A,Team,Wins,WinLast10,Elo,EloLast10,OffPoss,...,XXBadPassOutOfBoundsTurnovers,XXTravels,XX3SecondViolations,XXStepOutOfBoundsTurnovers,XXOffensiveGoaltends,Temp,XXTemp,Fg3Rating,XXFg3Rating,XXElevDif
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2000-10-31,2000-10-31ATLvCHA,00-01,1,ATL,0,0,1367,0,0.0,...,0.0,0.0,0.0,0.0,0.0,82,106,0.000000,0.000000,827
1,2000-10-31,2000-10-31BKNvCLE,00-01,1,BKN,0,0,1446,0,0.0,...,0.0,0.0,0.0,0.0,0.0,82,86,0.000000,0.000000,-619
2,2000-10-31,2000-10-31CHIvSAC,00-01,1,CHI,0,0,1327,0,0.0,...,0.0,0.0,0.0,0.0,0.0,81,100,0.000000,0.000000,564
3,2000-10-31,2000-10-31DALvMIL,00-01,1,DAL,0,0,1541,0,0.0,...,0.0,0.0,0.0,0.0,0.0,97,93,0.000000,0.000000,-190
4,2000-10-31,2000-10-31DETvTOR,00-01,1,TOR,0,0,1487,0,0.0,...,0.0,0.0,0.0,0.0,0.0,95,104,0.000000,0.000000,-397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26329,2022-04-10,2022-04-10LACvOKC,21-22,1,LAC,41,5,1525,34,97.6,...,1.5,1.1,0.0,0.6,0.0,138,88,18.171025,18.611936,-958
26330,2022-04-10,2022-04-10MIAvORL,21-22,1,ORL,21,3,1288,-26,99.8,...,1.5,0.6,0.1,0.6,0.1,125,111,16.553572,19.552197,98
26331,2022-04-10,2022-04-10NYKvTOR,21-22,1,NYK,36,6,1500,18,94.8,...,1.0,0.5,0.0,0.2,0.0,105,94,20.705304,18.036349,-223
26332,2022-04-10,2022-04-10PHXvSAC,21-22,1,PHX,64,7,1690,-35,101.7,...,1.8,0.3,0.2,0.1,0.0,109,116,14.963831,14.041859,1066


In [161]:
final_by_season = [final[final['Season'] == season] for season in seasons]
data_by_season = [data[data['Season'] == season] for season in seasons]

In [None]:
for i in range(len(final_by_season)):
    homeb2b = back2back(final_by_season[i], "Team")
    awayb2b = back2back(final_by_season[i], "XXTeam")
    final_by_season[i] = final_by_season[i].assign(Back2Back=homeb2b, XXBack2Back=awayb2b)

In [129]:
for idx, df in enumerate(final_by_season):
    # Adds home team's offense ranking when at home
    # Adds home team's defense ranking while away
    # Adds away team's offense ranking when at home
    # Adds away team's defense ranking while away
    final_by_season[idx] = df.assign(OffRank=[stat_ranking(
        data, index, 'Points', seasons[idx], 'off', 'HomeTeam') for index in range(len(df))],
                                    DefRank=[stat_ranking(
        data, index, 'XXPoints', seasons[idx], 'def', 'HomeTeam') for index in range(len(df))],
                                    XXOffRank=[stat_ranking(
        data, index, 'XXPoints', seasons[idx], 'off', 'XXAwayTeam') for index in range(len(df))],
                                    XXDefRank=[stat_ranking(
        data, index, 'Points', seasons[idx], 'def', 'XXAwayTeam') for index in range(len(df))])
    
    final_by_season[idx]['W_L'] = np.where(data_by_season[idx]['Points'] > data_by_season[idx]['XXPoints'], 1, 0)
    final_by_season[idx]['XXW_L'] = np.where(data_by_season[idx]['Points'] < data_by_season[idx]['XXPoints'], 1, 0)

In [130]:
for idx, df in enumerate(final_by_season):
    # Adds home team's offensive 3pt ranking
    # Adds home team's defensive 3pt ranking
    # Adds away team's offensive 3pt ranking
    # Adds away team's defensive 3pt ranking
    final_by_season[idx] = df.assign(Fg3OffRank=[fg3_ranking(
        data, index, seasons[idx], 'off', list(data.columns).index('HomeTeam')) for index in range(len(df))],
                                    Fg3DefRank=[fg3_ranking(
        data, index, seasons[idx], 'def', list(data.columns).index('HomeTeam')) for index in range(len(df))],
                                    XXFg3OffRank=[fg3_ranking(
        data, index, seasons[idx], 'off', list(data.columns).index('XXAwayTeam')) for index in range(len(df))],
                                    XXFg3DefRank=[fg3_ranking(
        data, index, seasons[idx], 'def', list(data.columns).index('XXAwayTeam')) for index in range(len(df))])
    
    final_by_season[idx] = final_by_season[idx].drop(["Fg3Rating", "XXFg3Rating"], axis=1)

In [137]:
# Adds home team's average points scored vs top 10 fg3 defenses
# Adds home team's average points allowed vs top 10 fg3 offenses
# Adds away team's average points scored vs top 10 fg3 defenses
# Adds away team's average points allowed vs top 10 fg3 offenses
for idx, df in enumerate(final_by_season):
    final_by_season[idx] = df.assign(
        PtsScoredTop10Fg3Def=[fg3_stren(
            df, index, list(df.columns).index('Team'))[0] for index in range(len(df))],
        PtsAllowedTop10Fg3Off=[fg3_stren(
            df, index, list(df.columns).index('Team'))[2] for index in range(len(df))],
        XXPtsScoredTop10Fg3Def=[fg3_stren(
            df, index, list(df.columns).index('XXTeam'))[0] for index in range(len(df))],
        XXPtsAllowedTop10Fg3Off=[fg3_stren(
            df, index, list(df.columns).index('XXTeam'))[2] for index in range(len(df))])
    
# Adds home team's W% vs top 10 fg3 defenses
# Adds home team's W% vs top 10 fg3 offenses
# Adds away team's W% vs top 10 fg3 defenses
# Adds away team's W% vs top 10 fg3 offenses
for idx, df in enumerate(final_by_season):
    final_by_season[idx] = df.assign(
        WinPctTop10Fg3Def=[fg3_stren(
            df, index, list(df.columns).index('Team'))[1] for index in range(len(df))],
        WinPctTop10Fg3Off=[fg3_stren(
            df, index, list(df.columns).index('Team'))[3] for index in range(len(df))],
        XXWinPctTop10Fg3Def=[fg3_stren(
            df, index, list(df.columns).index('XXTeam'))[1] for index in range(len(df))],
        XXWinPctTop10Fg3Off=[fg3_stren(
            df, index, list(df.columns).index('XXTeam'))[3] for index in range(len(df))])

In [None]:
for idx, df in enumerate(final_by_season):
    # Adds home team's offense ranking
    # Adds home team's defense ranking
    # Adds away team's offense ranking
    # Adds away team's defense ranking
    final_by_season[idx] = df.assign(TotalOffRank=[total_ranking(
        data, index, seasons[idx], 'off', list(data.columns).index('HomeTeam')) for index in range(len(df))],
                                    TotalDefRank=[total_ranking(
        data, index, seasons[idx], 'def', list(data.columns).index('HomeTeam')) for index in range(len(df))],
                                    XXTotalOffRank=[total_ranking(
        data, index, seasons[idx], 'off', list(data.columns).index('XXAwayTeam')) for index in range(len(df))],
                                    XXTotalDefRank=[total_ranking(
        data, index, seasons[idx], 'def', list(data.columns).index('XXAwayTeam')) for index in range(len(df))])

In [12]:
# Adds home team's average points scored vs top 10 defenses
# Adds home team's average points allowed vs top 10 offenses
# Adds away team's average points scored vs top 10 defenses
# Adds away team's average points allowed vs top 10 offenses
for idx, df in enumerate(final_by_season):
    final_by_season[idx] = df.assign(
        PtsScoredTop10Def=[sched_stren(
            df, index, list(df.columns).index('Team'))[0] for index in range(len(df))],
        PtsAllowedTop10Off=[sched_stren(
            df, index, list(df.columns).index('Team'))[2] for index in range(len(df))],
        XXPtsScoredTop10Def=[sched_stren(
            df, index, list(df.columns).index('XXTeam'))[0] for index in range(len(df))],
        XXPtsAllowedTop10Off=[sched_stren(
            df, index, list(df.columns).index('XXTeam'))[2] for index in range(len(df))])

In [13]:
# Adds home team's W% vs top 10 defenses
# Adds home team's W% vs top 10 offenses
# Adds away team's W% vs top 10 defenses
# Adds away team's W% vs top 10 offenses
for idx, df in enumerate(final_by_season):
    final_by_season[idx] = df.assign(
        WinPctTop10Def=[sched_stren(
            df, index, list(df.columns).index('Team'))[1] for index in range(len(df))],
        WinPctTop10Off=[sched_stren(
            df, index, list(df.columns).index('Team'))[3] for index in range(len(df))],
        XXWinPctTop10Def=[sched_stren(
            df, index, list(df.columns).index('XXTeam'))[1] for index in range(len(df))],
        XXWinPctTop10Off=[sched_stren(
            df, index, list(df.columns).index('XXTeam'))[3] for index in range(len(df))])

In [14]:
for idx, df in enumerate(final_by_season):
    final_by_season[idx] = df.assign(
        HomeWinPct = [win_pct(df, index, list(df.columns).index('Team'), .5)[0] for index in range(len(df))],
        XXAwayWinPct = [win_pct(df, index, list(df.columns).index('XXTeam'), .5)[0] for index in range(len(df))],
        PtDifvsOppTeam = [win_pct(df, index, list(df.columns).index('Team'), .5)[1] for index in range(len(df))],
        XXPtDifvsOppTeam = [win_pct(df, index, list(df.columns).index('XXTeam'), .5)[1] for index in range(len(df))],
        NumAwayLast5 = [win_pct(df, index, list(df.columns).index('Team'), .5)[2] for index in range(len(df))],
        XXNumAwayLast5 = [win_pct(df, index, list(df.columns).index('XXTeam'), .5)[2] for index in range(len(df))],
        PtDifvsTop50 = [win_pct(df, index, list(df.columns).index('Team'), .5)[3] for index in range(len(df))],
        XXPtDifvsTop50 = [win_pct(df, index, list(df.columns).index('XXTeam'), .5)[3] for index in range(len(df))], 
        PtDifvsTop75 = [win_pct(df, index, list(df.columns).index('Team'), .75)[3] for index in range(len(df))],
        XXPtDifvsTop75 = [win_pct(df, index, list(df.columns).index('XXTeam'), .75)[3] for index in range(len(df))])

In [15]:
final_df = pd.concat(final_by_season, axis=0)
final_df.to_csv(str(Path.cwd()) + '/final_updated.csv')

In [16]:
# This function finds the rating for each player.
def get_value(row, df):
    if len(df.loc[(df['player_name'] == row['Starters']) & (df['season'] == row['FG%']) & (df['team'] == row['FGA']), 'rating']) == 0:
        return -1
    else:
        return df.loc[(df['player_name'] == row['Starters']) & (df['season'] == row['FG%']) & (df['team'] == row['FGA']), 'rating'].item()

In [17]:
# This function sums the adjusted raptor rating for the 5 highest rated players who played in a particular game.
def adj_raptor(row, lineups, home_away):
    ratings = list(lineups.loc[(lineups['FG'] == row[home_away]) & (lineups['FTA'] != ('Did Not Dress' or 'Did Not Play' or 'Not With Team' or 'Player Suspended')), 'MP'])
    return sum(sorted(ratings, reverse=True)[:5])

In [18]:
# Reading the data in here because the chunks above take a long time to run. I needed to find the running average \
# for each game which requires looping over every row.
final = pd.read_csv(str(Path.cwd()) + '/final_updated.csv')
final = final.drop(['Unnamed: 0'], axis=1)
final.loc[final.Team == 'PHX', 'Team'] = 'PHO'
final.loc[final.Team == 'BKN', 'Team'] = 'BRK'
final.loc[final.XXTeam == 'BKN', 'XXTeam'] = 'BRK'
final.loc[final.XXTeam == 'PHX', 'XXTeam'] = 'PHO'
final['TeamID'] = final['Date'].str.cat(final['Team'])
final['XXTeamID'] = final['Date'].str.cat(final['XXTeam'])

In [56]:
seasons = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
season_lineups = [get_season_lineups(season) for season in seasons]

[WDM] - Downloading: 100%|█████████████████| 8.29M/8.29M [00:01<00:00, 6.86MB/s]


In [59]:
for i, season in enumerate(season_lineups):
    for idx, lineup in enumerate(season):
        season[idx] = lineup.assign(Age=seasons[i])
        
lineups = [lineup for season in season_lineups for lineup in season]
lineups = pd.concat(lineups, axis=0)
lineups["Unnamed: 1"] = lineups['Unnamed: 1'].str.replace('[^\w\s]','', regex=True)
lineups['Unnamed: 1'] = lineups['Unnamed: 1'].apply(unidecode)
lineups.loc[lineups['Rk'] == 'CHO', 'Rk'] = 'CHA'

KeyError: 'Unnamed: 1'

In [62]:
lineups.to_excel("lineup_data_all.xlsx")

In [21]:
raptor = pd.read_csv(str(Path.cwd()) + '/modern_RAPTOR_by_team.csv')
raptor = raptor.loc[raptor['season_type'] == 'RS']
# Here I add the absolute value of the lowest raptor rating to every rating to make them all positive.
# This is so I can account for some players who played very few minutes having very high ratings.
ratings = pd.DataFrame(data={'player_name': raptor['player_name'], 'season': raptor['season'], 'team': raptor['team'], 'rating': (raptor['raptor_total'] + abs(min(raptor['raptor_total']))) * raptor['mp']})
# I divide by number of games a player played and not total number of games for their team so that players' ratings \
# are not determined by how healthy they were in a particular season.
ratings['rating'] = ratings.apply(
    lambda row: row['rating'] / lineups.loc[(lineups['Unnamed: 1'] == row['player_name']) & (lineups['Age'] == row['season']) & (lineups['Rk'] == row['team']), 'G'].item() if len(lineups.loc[(lineups['Unnamed: 1'] == row['player_name']) & (lineups['Age'] == row['season']) & (lineups['Rk'] == row['team']), 'G']) != 0 else None, axis=1)

In [32]:
data_path = str(Path.cwd()) + '/lineups'
files = sorted(pathlib.Path(data_path).glob('*.xlsx'))
data_list = [pd.read_excel(str(file)) for file in files]

In [41]:
# data_path = str(Path.cwd()) + '/lineups'
# files = sorted(pathlib.Path(data_path).glob('*.xlsx'))
# data_list = [pd.read_excel(str(file)) for file in files]
data_list[0].columns, data_list[1].columns, data_list[8].columns = data_list[2].columns, data_list[2].columns, data_list[2].columns
data_list[0].drop([0, 1], axis=0, inplace=True)
data_list[8].drop([0, 1], axis=0, inplace=True)
# Here I format the game lineup data so that I can match each players rating and then assign that rating.
for idx, df in enumerate(data_list):
    data_list[idx].reset_index(drop=True, inplace=True)
    data_list[idx].drop(['Unnamed: 0'], axis=1, inplace=True)
    data_list[idx]['Starters'] = data_list[idx]['Starters'].str.replace('[^\w\s]','', regex=True)
    data_list[idx]['Starters'] = data_list[idx]['Starters'].apply(unidecode)
    data_list[idx]['FG'] = data_list[idx]['FG'].str.replace(r'CHO', 'CHA', regex=True)
    data_list[idx]['FG%'] = seasons[idx]
    data_list[idx]['MP'] = data_list[idx].apply(lambda row: get_value(row, ratings), axis=1)

In [42]:
lineups = pd.concat(data_list, axis=0)
final['AdjRaptor'] = final.apply(lambda row: adj_raptor(row, lineups, 'TeamID'), axis=1)
final['XXAdjRaptor'] = final.apply(lambda row: adj_raptor(row, lineups, 'XXTeamID'), axis=1)

In [45]:
final.to_csv(str(Path.cwd()) + '/final_updated.csv')

In [7]:
data = pd.read_csv("final_updated.csv")
data = [data.loc[data["Season"] == season] for season in data.Season.unique()]

In [12]:
def assign_conf(row, home_away):
    WEST = ["LAL", "DAL", "HOU", "SAC", "GSW", "NOP", "SAS", "MIN", "UTA", "PHO", "LAC", "DEN",
            "MEM", "POR", "OKC"]
    EAST = [team for team in teams if team not in WEST]
    
    if row[home_away] in WEST:
        return 'W'
    else:
        return 'E'

In [54]:
for idx, season in enumerate(data):
    data[idx] = season.assign(
        WW_ZScore=[ww_test(
            season, index, "Team") for index in range(len(season))],
        XXWW_ZScore=[ww_test(
            season, index, "XXTeam") for index in range(len(season))])

In [26]:
# Variance of the home team's ppg at home
# Variance of away team's ppg while away
for idx, season in enumerate(data):
    data[idx] = season.assign(PointsVar = [calc_var(season, idx, "Team", "Temp") for idx in range(len(season))],
                             XXPointsVar = [calc_var(season, idx, "XXTeam", "XXTemp") for idx in range(len(season))],
                             PointsAgstVar = [calc_var(season, idx, "Team", "XXTemp") for idx in range(len(season))],
                             XXPointsAgstVar = [calc_var(season, idx, "XXTeam", "Temp") for idx in range(len(season))])

In [46]:
 for idx, season in enumerate(data):
    teams = season.Team.unique()
    data[idx] = season.assign(Conference = [assign_conf(season.iloc[i], "Team") for i in range(len(season))],
                              XXConference = [assign_conf(season.iloc[i], "XXTeam") for i in range(len(season))])
    data[idx] = season.assign(Seed = [calc_records(season, i, "Team", "Conference") for i in range(len(season))],
                              XXSeed = [calc_records(season, i, "XXTeam", "XXConference") for i in range(len(season))])

In [45]:
def calc_records(data, index, home_away, conf):
    df = data.iloc[:index]
    results = {}
    for team in data.Team.unique():
        wins = 0
        team_df = df[(df["Team"] == team) | (df["XXTeam"] == team)]
        if len(team_df) == 0:
            return None

        cur = team_df.iloc[-1]
        results[team] = cur.Wins
        
    WEST = ["LAL", "DAL", "HOU", "SAC", "GSW", "NOP", "SAS", "MIN", "UTA", "PHO", "LAC", "DEN",
            "MEM", "POR", "OKC"]

    cur_conf = data.iloc[index][conf]
    if cur_conf == 'W':
        west_records = sorted([(k, itm) for k, itm in results.items() if k in WEST], key=lambda x: x[1], reverse=True)
        return [x[0] for x in west_records].index(data.iloc[index][home_away])
    
    EAST = [team for team in data.Team.unique() if team not in WEST]
    east_records = sorted([(k, itm) for k, itm in results.items() if k in EAST], key=lambda x: x[1], reverse=True)
    return [x[0] for x in east_records].index(data.iloc[index][home_away])

In [48]:
data[0]

Unnamed: 0,Date,GameID,Season,H/A,Team,ML,Wins,WinLast10,Elo,EloLast10,...,PtDifvsTop75,XXPtDifvsTop75,TeamID,XXTeamID,AdjRaptor,XXAdjRaptor,Conference,XXConference,Seed,XXSeed
0,2013-10-29,2013-10-29CHIvMIA,13-14,1,MIA,-220,0,0,1681,0,...,,,2013-10-29MIA,2013-10-29CHI,13249.874611,14407.776447,E,E,,
1,2013-10-29,2013-10-29INDvORL,13-14,1,IND,-1025,0,0,1568,0,...,,,2013-10-29IND,2013-10-29ORL,13712.995591,12779.554394,E,E,,
2,2013-10-29,2013-10-29LACvLAL,13-14,1,LAL,435,0,0,1547,0,...,,,2013-10-29LAL,2013-10-29LAC,12335.559495,14003.297345,W,W,,
3,2013-10-30,2013-10-30ATLvDAL,13-14,1,DAL,-245,0,0,1529,0,...,,,2013-10-30DAL,2013-10-30ATL,12844.096168,13650.735892,W,E,,
4,2013-10-30,2013-10-30BKNvCLE,13-14,1,CLE,135,0,0,1368,0,...,,,2013-10-30CLE,2013-10-30BRK,12285.246269,12471.903115,E,E,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2014-04-16,2014-04-16LALvSAS,13-14,1,SAS,-465,62,7,1713,3,...,59.0,-356.0,2014-04-16SAS,2014-04-16LAL,11040.539531,11713.812964,W,W,1.0,13.0
1226,2014-04-16,2014-04-16MIAvPHI,13-14,1,MIA,-243,54,5,1612,-32,...,7.0,-315.0,2014-04-16MIA,2014-04-16PHI,11014.322493,10810.614231,E,E,3.0,12.0
1227,2014-04-16,2014-04-16MINvUTA,13-14,1,MIN,-900,40,4,1542,-10,...,-143.0,-243.0,2014-04-16MIN,2014-04-16UTA,12729.233203,12392.574617,W,W,6.0,14.0
1228,2014-04-16,2014-04-16NYKvTOR,13-14,1,NYK,210,36,7,1521,63,...,-169.0,-65.0,2014-04-16NYK,2014-04-16TOR,9633.580100,13069.222983,E,E,4.0,3.0


In [49]:
def games_remaining(df, index, home_away):
    team = df.iloc[index][home_away]
    data = df.iloc[:index]
    team_df = data[(data["Team"] == team) | (data["XXTeam"] == team)]
    gp = len(team_df)
    return len(df[(df["Team"] == team) | (df["XXTeam"] == team)]) - gp

In [50]:
 for idx, season in enumerate(data):
    data[idx] = season.assign(GRemaining = [games_remaining(season, i, "Team") for i in range(len(season))],
                              XXGRemaining = [games_remaining(season, i, "XXTeam") for i in range(len(season))])

In [51]:
data[0]

Unnamed: 0,Date,GameID,Season,H/A,Team,ML,Wins,WinLast10,Elo,EloLast10,...,TeamID,XXTeamID,AdjRaptor,XXAdjRaptor,Conference,XXConference,Seed,XXSeed,GRemaining,XXGRemaining
0,2013-10-29,2013-10-29CHIvMIA,13-14,1,MIA,-220,0,0,1681,0,...,2013-10-29MIA,2013-10-29CHI,13249.874611,14407.776447,E,E,,,82,82
1,2013-10-29,2013-10-29INDvORL,13-14,1,IND,-1025,0,0,1568,0,...,2013-10-29IND,2013-10-29ORL,13712.995591,12779.554394,E,E,,,82,82
2,2013-10-29,2013-10-29LACvLAL,13-14,1,LAL,435,0,0,1547,0,...,2013-10-29LAL,2013-10-29LAC,12335.559495,14003.297345,W,W,,,82,82
3,2013-10-30,2013-10-30ATLvDAL,13-14,1,DAL,-245,0,0,1529,0,...,2013-10-30DAL,2013-10-30ATL,12844.096168,13650.735892,W,E,,,82,82
4,2013-10-30,2013-10-30BKNvCLE,13-14,1,CLE,135,0,0,1368,0,...,2013-10-30CLE,2013-10-30BRK,12285.246269,12471.903115,E,E,,,82,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,2014-04-16,2014-04-16LALvSAS,13-14,1,SAS,-465,62,7,1713,3,...,2014-04-16SAS,2014-04-16LAL,11040.539531,11713.812964,W,W,1.0,13.0,1,1
1226,2014-04-16,2014-04-16MIAvPHI,13-14,1,MIA,-243,54,5,1612,-32,...,2014-04-16MIA,2014-04-16PHI,11014.322493,10810.614231,E,E,3.0,12.0,1,1
1227,2014-04-16,2014-04-16MINvUTA,13-14,1,MIN,-900,40,4,1542,-10,...,2014-04-16MIN,2014-04-16UTA,12729.233203,12392.574617,W,W,6.0,14.0,1,1
1228,2014-04-16,2014-04-16NYKvTOR,13-14,1,NYK,210,36,7,1521,63,...,2014-04-16NYK,2014-04-16TOR,9633.580100,13069.222983,E,E,4.0,3.0,1,1
