In [2]:
import pandas as pd
import numpy as np

In [3]:
# This function gives a ranking for a team's offense and defense when they are home and when they are away.

def stat_ranking(df, index, stat_type, season, off_def, home_away):
    """Rank one team on a per-game stat using only earlier games of a season.

    The rows of ``season`` are taken from ``df`` in their existing order and
    renumbered from 0. The team found in column ``home_away`` of row ``index``
    is ranked against every team that appears in that same column in rows
    ``0 .. index - 1``, by the mean of ``stat_type`` over those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Game rows; must contain ``'Season'``, ``stat_type`` and ``home_away``.
    index : int
        0-based position of the game of interest within the season.
    stat_type : str
        Name of the column that is averaged per team.
    season : str
        Value matched against ``df['Season']``.
    off_def : {'off', 'def'}
        ``'off'`` puts the highest average at rank 1, ``'def'`` the lowest.
    home_away : str
        Name of the team column used both to pick the team and to group by.

    Returns
    -------
    int or None
        1-based rank, or ``None`` when the team has no earlier row in the
        ``home_away`` column (which includes ``index == 0``).

    Raises
    ------
    ValueError
        If ``off_def`` is not ``'off'``/``'def'`` or ``home_away`` is not a
        column of ``df``.
    """
    if off_def == 'off':
        highest_first = True
    elif off_def == 'def':
        highest_first = False
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError("off_def must be 'off' or 'def', got {!r}".format(off_def))

    column = list(df.columns).index(home_away)
    season_games = df[df['Season'] == season].reset_index(drop=True)
    team = season_games.iloc[index, column]
    earlier_games = season_games.iloc[:index]

    # sorted() is stable, so teams with equal averages keep groupby's key order.
    averages = sorted(
        ((games[stat_type].mean(), name)
         for name, games in earlier_games.groupby(home_away)),
        key=lambda pair: pair[0],
        reverse=highest_first,
    )
    ranked_teams = [name for _, name in averages]
    try:
        return ranked_teams.index(team) + 1
    except ValueError:
        # The team has not appeared in this column yet, so it has no rank.
        return None

In [4]:
# This function gives a ranking for a team's offense and defense including both home and away games.

def total_ranking(df, index, season, off_def, home_away):
    """Rank a team on points per game over all its earlier games of a season.

    The rows of ``season`` are taken from ``df`` in their existing order and
    renumbered from 0. Using rows ``0 .. index - 1`` only, every team in the
    notebook-level ``teams`` collection gets an average over the games where
    it is ``'HomeTeam'`` or ``'XXAwayTeam'``:

    * ``'off'``: ``'Points'`` when it is the home team, ``'XXPoints'`` when
      away; the highest average is rank 1.
    * ``'def'``: ``'XXPoints'`` when it is the home team, ``'Points'`` when
      away; the lowest average is rank 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Game rows with ``'Season'``, ``'HomeTeam'``, ``'XXAwayTeam'``,
        ``'Points'`` and ``'XXPoints'`` columns.
    index : int
        0-based position of the game of interest within the season.
    season : str
        Value matched against ``df['Season']``.
    off_def : {'off', 'def'}
        Which ranking to produce.
    home_away : int
        Positional index of the column that holds the team to rank.

    Returns
    -------
    int or None
        1-based rank of the team, or ``None`` while at least one entry of
        ``teams`` has no earlier game in the season.

    Raises
    ------
    ValueError
        If ``off_def`` is neither ``'off'`` nor ``'def'``.
    IndexError
        If the team of interest is not contained in ``teams``.

    Notes
    -----
    Reads the global ``teams``; it must be defined before this is called.
    NOTE(review): an entry of ``teams`` that never plays in ``season`` makes
    every call for that season return ``None`` - confirm that is intended.
    """
    if off_def == 'off':
        highest_first = True
        home_stat, away_stat = 'Points', 'XXPoints'
    elif off_def == 'def':
        highest_first = False
        home_stat, away_stat = 'XXPoints', 'Points'
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError("off_def must be 'off' or 'def', got {!r}".format(off_def))

    season_games = df[df['Season'] == season].reset_index(drop=True)
    current_team = season_games.iloc[index, home_away]
    earlier_games = season_games.iloc[:index]
    home_teams = earlier_games['HomeTeam']
    away_teams = earlier_games['XXAwayTeam']

    averages = []
    for team in teams:
        at_home = home_teams == team
        on_road = (away_teams == team) & ~at_home
        games_played = int(at_home.sum()) + int(on_road.sum())
        if games_played == 0:
            # No average can be formed for this team yet, so no ranking exists.
            return None
        # skipna=False keeps a missing score visible as NaN instead of dropping it.
        total_pts = (earlier_games.loc[at_home, home_stat].sum(skipna=False)
                     + earlier_games.loc[on_road, away_stat].sum(skipna=False))
        averages.append((total_pts / games_played, team))

    # list.sort is stable: equal averages keep the order of ``teams``.
    averages.sort(key=lambda pair: pair[0], reverse=highest_first)
    positions = [pos for pos, (_, name) in enumerate(averages) if name == current_team]
    return positions[0] + 1

In [5]:
def sched_stren(df, index, home_away):
    """Summarise a team's earlier results against top-10 defenses and offenses.

    Only rows ``0 .. index - 1`` of ``df`` are used. A game counts as "against
    a top-10 defense" when the opponent's total defensive rank is <= 10
    (``'XXTotalDefRank'`` when the team is ``'Team'``, ``'TotalDefRank'`` when
    it is ``'XXTeam'``); "against a top-10 offense" uses the ``...OffRank``
    columns the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``'Team'``, ``'XXTeam'``, ``'Temp'``, ``'XXTemp'``,
        ``'W_L'``, ``'XXW_L'`` and the four ``[XX]Total{Off,Def}Rank`` columns.
        ``'Temp'`` holds the points of ``'Team'`` and ``'XXTemp'`` those of
        ``'XXTeam'``.
    index : int
        0-based position of the game of interest.
    home_away : int
        Positional index of the column that holds the team of interest.

    Returns
    -------
    list
        ``[avg_pts_for, win_pct_for, avg_pts_against, win_pct_against]``:
        average points scored and win fraction versus top-10 defenses, then
        average points allowed and win fraction versus top-10 offenses. Each
        pair is ``None, None`` when the team has no qualifying earlier game.
    """
    team = df.iloc[index, home_away]
    earlier = df.iloc[:index]
    home_games = earlier[earlier['Team'] == team]
    away_games = earlier[earlier['XXTeam'] == team]

    def _vs_top10(home_rank_col, away_rank_col, home_pts_col, away_pts_col):
        # Average points and win fraction over games whose opponent rank is <= 10.
        # to_numeric(errors='coerce') turns missing ranks (None) into NaN so the
        # comparison is simply False instead of raising on an all-None column.
        home_sel = home_games[
            pd.to_numeric(home_games[home_rank_col], errors='coerce') <= 10]
        away_sel = away_games[
            pd.to_numeric(away_games[away_rank_col], errors='coerce') <= 10]
        n_games = len(home_sel) + len(away_sel)
        if n_games == 0:
            return None, None
        points = pd.concat([home_sel[home_pts_col], away_sel[away_pts_col]])
        wins = int((home_sel['W_L'] == 1).sum()) + int((away_sel['XXW_L'] == 1).sum())
        return points.mean(), wins / n_games

    # Points scored: the team's own column ('Temp' at home, 'XXTemp' away).
    avg_pts_for, win_pct_for = _vs_top10(
        'XXTotalDefRank', 'TotalDefRank', 'Temp', 'XXTemp')
    # Points allowed: the opponent's column ('XXTemp' at home, 'Temp' away).
    # The earlier version read the team's own points here as well.
    avg_pts_against, win_pct_against = _vs_top10(
        'XXTotalOffRank', 'TotalOffRank', 'XXTemp', 'Temp')

    return [avg_pts_for, win_pct_for, avg_pts_against, win_pct_against]

In [6]:
def win_pct(df, index, home_away, quantile):
    """Compute form features for one team from the rows before ``index``.

    The side is taken from the name of the column at position ``home_away``:
    ``'Team'`` (home) or ``'XXTeam'`` (away). The opponent is the value of the
    other of these two columns in row ``index``. ``'Temp'`` holds the points
    of ``'Team'`` and ``'XXTemp'`` those of ``'XXTeam'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``'Team'``, ``'XXTeam'``, ``'Temp'``, ``'XXTemp'``,
        ``'W_L'``, ``'XXW_L'``, ``'Elo'`` and ``'XXElo'`` columns.
    index : int
        0-based position of the game of interest.
    home_away : int
        Positional index of the ``'Team'`` or ``'XXTeam'`` column.
    quantile : float
        Quantile of the earlier ``'Elo'`` / ``'XXElo'`` values used as the
        threshold for a "strong" opponent.

    Returns
    -------
    list
        ``[win_pct, pt_dif_vs_opp, away_games_l5, pt_dif_vs_topx]``:

        * fraction of the team's earlier games on this side (home games for
          ``'Team'``, away games for ``'XXTeam'``) that it won,
        * its summed point differential in earlier games against the opponent,
        * how many of its last five earlier games were away games,
        * its summed point differential in earlier games against opponents
          whose Elo is at or above the ``quantile`` threshold.

        All four are ``None`` when the team has no earlier game on this side.

    Raises
    ------
    ValueError
        If the column at ``home_away`` is neither ``'Team'`` nor ``'XXTeam'``.
    """
    # Resolve the side by column name instead of fixed positions (4 / 129),
    # which silently produced all-None rows whenever the layout changed.
    side = df.columns[home_away]
    if side == 'Team':
        opp_side, result_col = 'XXTeam', 'W_L'
    elif side == 'XXTeam':
        opp_side, result_col = 'Team', 'XXW_L'
    else:
        raise ValueError(
            "home_away must point at the 'Team' or 'XXTeam' column, got {!r}".format(side))

    team = df.iloc[index, home_away]
    opponent = df[opp_side].iloc[index]
    earlier = df.iloc[:index]
    home_games = earlier[earlier['Team'] == team]
    away_games = earlier[earlier['XXTeam'] == team]

    side_games = home_games if side == 'Team' else away_games
    if len(side_games) == 0:
        # No game on this side yet: the win fraction is undefined.
        return [None, None, None, None]
    side_win_pct = int((side_games[result_col] == 1).sum()) / len(side_games)

    # Head-to-head games against the opponent of row ``index``.
    home_vs_opp = home_games[home_games['XXTeam'] == opponent]
    away_vs_opp = away_games[away_games['Team'] == opponent]
    pt_dif_vs_opp = (home_vs_opp['Temp'].sum() + away_vs_opp['XXTemp'].sum()
                     - away_vs_opp['Temp'].sum() - home_vs_opp['XXTemp'].sum())

    all_games = earlier[(earlier['Team'] == team) | (earlier['XXTeam'] == team)]
    last_five = all_games.iloc[-5:]
    away_games_l5 = int((last_five['XXTeam'] == team).sum())

    # Thresholds come from all earlier rows, not only this team's games.
    away_elo_cut = earlier['XXElo'].quantile(quantile)
    home_elo_cut = earlier['Elo'].quantile(quantile)
    strong_at_home = home_games[home_games['XXElo'] >= away_elo_cut]
    strong_on_road = away_games[away_games['Elo'] >= home_elo_cut]
    pt_dif_vs_topx = (strong_at_home['Temp'].sum() + strong_on_road['XXTemp'].sum()
                      - strong_at_home['XXTemp'].sum() - strong_on_road['Temp'].sum())

    return [side_win_pct, pt_dif_vs_opp, away_games_l5, pt_dif_vs_topx]

In [7]:
# Game-level source table; every distinct 'HomeTeam' value becomes a team to rank.
data = pd.read_csv(r'/Users/User/Desktop/lineup_data/cleanData.csv')
teams = data['HomeTeam'].unique()
# Season labels processed below, in order.
# NOTE(review): '20-21' is not listed - confirm that is intentional.
seasons = [
    '13-14', '14-15', '15-16', '16-17',
    '17-18', '18-19', '19-20', '21-22',
]

In [8]:
# Load the feature table and attach both score columns from `data`
# ('Points' as 'Temp', 'XXPoints' as 'XXTemp'); assign aligns them on the row index.
final = (
    pd.read_csv(r'/Users/User/Desktop/lineup_data/averagesFinal.csv')
    .assign(Temp=data['Points'], XXTemp=data['XXPoints'])
)

In [9]:
# Split both tables into one frame per season, in the order of `seasons`.
final_by_season = []
data_by_season = []
for season in seasons:
    final_by_season.append(final[final['Season'] == season])
    data_by_season.append(data[data['Season'] == season])

In [10]:
# Per season, add split rankings computed from earlier games only:
#   OffRank   - home team, mean 'Points' over games where it is 'HomeTeam' (highest first)
#   DefRank   - home team, mean 'XXPoints' over games where it is 'HomeTeam' (lowest first)
#   XXOffRank - away team, mean 'XXPoints' over games where it is 'XXAwayTeam' (highest first)
#   XXDefRank - away team, mean 'Points' over games where it is 'XXAwayTeam' (lowest first)
# plus the result flags:
#   W_L   - 1 when 'Points' > 'XXPoints', else 0
#   XXW_L - 1 when 'Points' < 'XXPoints', else 0
for idx, df in enumerate(final_by_season):
    season = seasons[idx]
    row_positions = range(len(df))
    home_pts = data_by_season[idx]['Points']
    away_pts = data_by_season[idx]['XXPoints']

    final_by_season[idx] = df.assign(
        OffRank=[stat_ranking(data, pos, 'Points', season, 'off', 'HomeTeam')
                 for pos in row_positions],
        DefRank=[stat_ranking(data, pos, 'XXPoints', season, 'def', 'HomeTeam')
                 for pos in row_positions],
        XXOffRank=[stat_ranking(data, pos, 'XXPoints', season, 'off', 'XXAwayTeam')
                   for pos in row_positions],
        XXDefRank=[stat_ranking(data, pos, 'Points', season, 'def', 'XXAwayTeam')
                   for pos in row_positions],
        # np.where yields plain arrays, so these are assigned by position.
        W_L=np.where(home_pts > away_pts, 1, 0),
        XXW_L=np.where(home_pts < away_pts, 1, 0),
    )

In [11]:
# Per season, add overall (home + away games combined) rankings:
#   TotalOffRank / TotalDefRank     - offense / defense rank of the home team
#   XXTotalOffRank / XXTotalDefRank - offense / defense rank of the away team
home_team_col = list(data.columns).index('HomeTeam')
away_team_col = list(data.columns).index('XXAwayTeam')

for idx, df in enumerate(final_by_season):
    season = seasons[idx]
    row_positions = range(len(df))
    final_by_season[idx] = df.assign(
        TotalOffRank=[total_ranking(data, pos, season, 'off', home_team_col)
                      for pos in row_positions],
        TotalDefRank=[total_ranking(data, pos, season, 'def', home_team_col)
                      for pos in row_positions],
        XXTotalOffRank=[total_ranking(data, pos, season, 'off', away_team_col)
                        for pos in row_positions],
        XXTotalDefRank=[total_ranking(data, pos, season, 'def', away_team_col)
                        for pos in row_positions],
    )

In [12]:
# Per season, add average-points features versus top-10 opponents:
#   PtsScoredTop10Def    - home team, element [0] of sched_stren (vs top 10 defenses)
#   PtsAllowedTop10Off   - home team, element [2] of sched_stren (vs top 10 offenses)
#   XXPtsScoredTop10Def  - away team, element [0]
#   XXPtsAllowedTop10Off - away team, element [2]
for idx, df in enumerate(final_by_season):
    column_names = list(df.columns)
    home_col = column_names.index('Team')
    away_col = column_names.index('XXTeam')
    # sched_stren returns all four values at once, so one call per row and
    # side is enough; the original called it again for every output column.
    home_stats = [sched_stren(df, index, home_col) for index in range(len(df))]
    away_stats = [sched_stren(df, index, away_col) for index in range(len(df))]
    final_by_season[idx] = df.assign(
        PtsScoredTop10Def=[stats[0] for stats in home_stats],
        PtsAllowedTop10Off=[stats[2] for stats in home_stats],
        XXPtsScoredTop10Def=[stats[0] for stats in away_stats],
        XXPtsAllowedTop10Off=[stats[2] for stats in away_stats])

In [13]:
# Per season, add win-fraction features versus top-10 opponents:
#   WinPctTop10Def   - home team, element [1] of sched_stren (vs top 10 defenses)
#   WinPctTop10Off   - home team, element [3] of sched_stren (vs top 10 offenses)
#   XXWinPctTop10Def - away team, element [1]
#   XXWinPctTop10Off - away team, element [3]
for idx, df in enumerate(final_by_season):
    column_names = list(df.columns)
    home_col = column_names.index('Team')
    away_col = column_names.index('XXTeam')
    # sched_stren returns all four values at once, so one call per row and
    # side is enough; the original called it again for every output column.
    home_stats = [sched_stren(df, index, home_col) for index in range(len(df))]
    away_stats = [sched_stren(df, index, away_col) for index in range(len(df))]
    final_by_season[idx] = df.assign(
        WinPctTop10Def=[stats[1] for stats in home_stats],
        WinPctTop10Off=[stats[3] for stats in home_stats],
        XXWinPctTop10Def=[stats[1] for stats in away_stats],
        XXWinPctTop10Off=[stats[3] for stats in away_stats])

In [14]:
# Per season, add the win_pct features for the home ('Team') and away
# ('XXTeam') side. win_pct returns
# [win fraction, point diff vs opponent, away games in last 5, point diff vs strong teams];
# only the last element depends on the quantile, so four calls per row cover
# all ten columns (the original made ten).
for idx, df in enumerate(final_by_season):
    column_names = list(df.columns)
    home_col = column_names.index('Team')
    away_col = column_names.index('XXTeam')
    row_positions = range(len(df))

    home_q50 = [win_pct(df, index, home_col, .5) for index in row_positions]
    away_q50 = [win_pct(df, index, away_col, .5) for index in row_positions]
    home_q75 = [win_pct(df, index, home_col, .75) for index in row_positions]
    away_q75 = [win_pct(df, index, away_col, .75) for index in row_positions]

    final_by_season[idx] = df.assign(
        HomeWinPct=[stats[0] for stats in home_q50],
        XXAwayWinPct=[stats[0] for stats in away_q50],
        PtDifvsOppTeam=[stats[1] for stats in home_q50],
        XXPtDifvsOppTeam=[stats[1] for stats in away_q50],
        NumAwayLast5=[stats[2] for stats in home_q50],
        XXNumAwayLast5=[stats[2] for stats in away_q50],
        PtDifvsTop50=[stats[3] for stats in home_q50],
        XXPtDifvsTop50=[stats[3] for stats in away_q50],
        PtDifvsTop75=[stats[3] for stats in home_q75],
        XXPtDifvsTop75=[stats[3] for stats in away_q75])

In [15]:
# Stack the per-season frames row-wise into one frame (original row labels are kept).
final_df = pd.concat(final_by_season, axis=0)

In [16]:
# Show the combined frame as the cell output.
final_df

Unnamed: 0,Date,GameID,Season,H/A,Team,Wins,WinLast10,Elo,EloLast10,OffPoss,...,HomeWinPct,XXAwayWinPct,PtDifvsOppTeam,XXPtDifvsOppTeam,NumAwayLast5,XXNumAwayLast5,PtDifvsTop50,XXPtDifvsTop50,PtDifvsTop75,XXPtDifvsTop75
15585,2013-10-29,2013-10-29CHIvMIA,13-14,1,MIA,0,0,1681,0,0.0,...,,,,,,,,,,
15586,2013-10-29,2013-10-29INDvORL,13-14,1,IND,0,0,1568,0,0.0,...,,,,,,,,,,
15587,2013-10-29,2013-10-29LACvLAL,13-14,1,LAL,0,0,1547,0,0.0,...,,,,,,,,,,
15588,2013-10-30,2013-10-30ATLvDAL,13-14,1,DAL,0,0,1529,0,0.0,...,,,,,,,,,,
15589,2013-10-30,2013-10-30BKNvCLE,13-14,1,CLE,0,0,1368,0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26329,2022-04-10,2022-04-10LACvOKC,21-22,1,LAC,41,5,1525,34,97.6,...,0.600,0.300,4.0,-4.0,2.0,2.0,-108.0,-352.0,-126.0,-233.0
26330,2022-04-10,2022-04-10MIAvORL,21-22,1,ORL,21,3,1288,-26,99.8,...,0.275,0.600,-37.0,37.0,2.0,3.0,-425.0,135.0,-207.0,2.0
26331,2022-04-10,2022-04-10NYKvTOR,21-22,1,NYK,36,6,1500,18,94.8,...,0.400,0.600,-27.0,27.0,2.0,1.0,-110.0,21.0,-65.0,61.0
26332,2022-04-10,2022-04-10PHXvSAC,21-22,1,PHX,64,7,1690,-35,101.7,...,0.800,0.325,5.0,-5.0,4.0,3.0,128.0,-342.0,85.0,-177.0


In [17]:
# Persist the combined frame; with to_csv's defaults the row index is written as the first column.
final_df.to_csv(r'/Users/User/Desktop/lineup_data/final_updated.csv')