In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import pathlib
import glob
import os
import sys
import re
sys.path.append('../Data_Collection_Cleaning/')
from data_collection import get_season_lineups
from data_collection import match_past_raptor
from data_collection import isLatin
from unidecode import unidecode

In [2]:
def stat_ranking(df, index, stat_type, season, off_def, home_away):
    """Rank a team on one stat using only the season's earlier games.

    The team is read from column ``home_away`` at row ``index`` of the rows of
    ``df`` belonging to ``season``. Every earlier row of that season is grouped
    by the same ``home_away`` column, ``stat_type`` is averaged per group, and
    the groups are ordered by that average.

    Args:
        df: Game table with a 'Season' column plus ``home_away`` and ``stat_type``.
        index: 0-based position of the game within the season's rows.
        stat_type: Column whose per-team mean is ranked.
        season: Value of 'Season' selecting the rows to use.
        off_def: 'off' ranks the highest mean first, 'def' ranks the lowest first.
        home_away: Name of the column holding the team to group by and look up.

    Returns:
        The 1-based rank as an int, or None when the team does not appear in
        any earlier row (so it has no average yet).

    Raises:
        ValueError: If ``off_def`` is neither 'off' nor 'def'.
    """
    if off_def == 'off':
        reverse_val = True
    elif off_def == 'def':
        reverse_val = False
    else:
        raise ValueError("off_def must be 'off' or 'def', got %r" % (off_def,))

    # Build a fresh frame instead of resetting the index of a filtered slice in place.
    data = df[df['Season'] == season].reset_index(drop=True)
    team = data[home_away].iloc[index]
    previous_rows = data.iloc[:index]

    # Groups come back sorted by team name; sorted() is stable, so ties keep that order.
    means = previous_rows.groupby(home_away)[stat_type].mean()
    ordered = sorted(means.items(), key=lambda pair: pair[1], reverse=reverse_val)
    ranked_teams = [name for name, _ in ordered]

    if team not in ranked_teams:
        return None
    return ranked_teams.index(team) + 1

In [3]:
def total_ranking(df, index, season, off_def, home_away):
    """Rank a team on points per game over home and away games combined.

    Uses only the rows of ``season`` that precede position ``index``. For each
    entry of the module-level ``teams`` sequence the function sums 'Points' for
    rows where the team is 'HomeTeam' and 'XXPoints' for rows where it is
    'XXAwayTeam' (swapped for ``off_def='def'``), divides by the number of
    those rows, and orders the teams by the result.

    Args:
        df: Game table with 'Season', 'HomeTeam', 'XXAwayTeam', 'Points' and
            'XXPoints' columns.
        index: 0-based position of the game within the season's rows.
        season: Value of 'Season' selecting the rows to use.
        off_def: 'off' ranks the highest average first, 'def' the lowest first.
        home_away: Positional index of the column holding the team to rank.

    Returns:
        The 1-based rank as an int, or None if any entry of ``teams`` has no
        earlier game in the season (no average can be formed for it).

    Raises:
        ValueError: If ``off_def`` is neither 'off' nor 'def'.
    """
    if off_def == 'off':
        reverse_val = True
        stat_1, stat_2 = 'Points', 'XXPoints'
    elif off_def == 'def':
        reverse_val = False
        stat_1, stat_2 = 'XXPoints', 'Points'
    else:
        raise ValueError("off_def must be 'off' or 'def', got %r" % (off_def,))

    # Build a fresh frame instead of resetting the index of a filtered slice in place.
    data = df[df['Season'] == season].reset_index(drop=True)
    previous_rows = data.iloc[:index]
    current_team = data.iloc[index, home_away]

    averages = []
    for team in teams:
        at_home = previous_rows['HomeTeam'] == team
        # A row counts once even if a team were listed on both sides.
        on_road = (previous_rows['XXAwayTeam'] == team) & ~at_home
        games_played = int(at_home.sum() + on_road.sum())
        if games_played == 0:
            # Explicit guard replacing the old bare except around the division.
            return None
        # skipna=False keeps a missing score visible as NaN, as the row loop did.
        total_pts = (previous_rows.loc[at_home, stat_1].sum(skipna=False)
                     + previous_rows.loc[on_road, stat_2].sum(skipna=False))
        averages.append((total_pts / games_played, team))

    averages.sort(key=lambda pair: pair[0], reverse=reverse_val)
    return [pos for pos, (_, name) in enumerate(averages) if name == current_team][0] + 1

In [4]:
def sched_stren(df, index, home_away):
    """Summarise a team's earlier results against top-10 ranked opponents.

    The team is read from row ``index`` at column position ``home_away``; only
    rows before ``index`` are considered. An opponent counts as top-10 when its
    rank column is <= 10: the 'XXTotal*Rank' column when the team is in 'Team',
    the 'Total*Rank' column when the team is in 'XXTeam'.

    Args:
        df: Frame with 'Team', 'XXTeam', 'Temp', 'XXTemp', 'W_L', 'XXW_L',
            'TotalOffRank', 'TotalDefRank', 'XXTotalOffRank', 'XXTotalDefRank'.
        index: Positional row of the game being described.
        home_away: Positional index of the column holding the team.

    Returns:
        ``[avg_pts_for, win_pct_for, avg_pts_against, win_pct_against]``.
        The first pair covers games against top-10 defenses (team's own points),
        the second pair games against top-10 offenses (opponent's points).
        A pair is ``None, None`` when the team has no such earlier game; the
        other pair is still reported.
    """
    team = df.iloc[index, home_away]
    prior = df.iloc[:index]
    is_home = prior['Team'] == team
    is_away = prior['XXTeam'] == team
    won = (is_home & (prior['W_L'] == 1)) | (is_away & (prior['XXW_L'] == 1))

    def _vs_top10(home_rank_col, away_rank_col):
        # Masks of the team's home/away games whose opponent rank is <= 10.
        # to_numeric keeps unranked rows (None/NaN) from breaking the comparison.
        opp_rank_when_home = pd.to_numeric(prior[away_rank_col], errors='coerce')
        opp_rank_when_away = pd.to_numeric(prior[home_rank_col], errors='coerce')
        return is_home & (opp_rank_when_home <= 10), is_away & (opp_rank_when_away <= 10)

    def _avg_and_win_pct(home_mask, away_mask, home_pts_col, away_pts_col):
        # Mean of the chosen points column and share of wins over the masked games.
        n_games = int(home_mask.sum() + away_mask.sum())
        if n_games == 0:
            return None, None
        pts = pd.concat([prior.loc[home_mask, home_pts_col],
                         prior.loc[away_mask, away_pts_col]])
        wins = int((won & (home_mask | away_mask)).sum())
        return pts.mean(), wins / n_games

    # Points scored: 'Temp' is the 'Team' side's score, 'XXTemp' the 'XXTeam' side's.
    home_mask, away_mask = _vs_top10('TotalDefRank', 'XXTotalDefRank')
    avg_pts_for, win_pct_for = _avg_and_win_pct(home_mask, away_mask, 'Temp', 'XXTemp')

    # Points allowed: take the opponent's score, i.e. the opposite column.
    # (Previously this read the team's own score, so "allowed" was really "scored".)
    home_mask, away_mask = _vs_top10('TotalOffRank', 'XXTotalOffRank')
    avg_pts_against, win_pct_against = _avg_and_win_pct(home_mask, away_mask, 'XXTemp', 'Temp')

    return [avg_pts_for, win_pct_for, avg_pts_against, win_pct_against]

In [5]:
def win_pct(df, index, home_away, quantile):
    """Compute form features for one side of a game from earlier rows only.

    ``home_away`` is a positional column index. The side is decided by that
    column's name ('Team' or 'XXTeam') rather than by fixed positions, so the
    function keeps working when the frame's column layout changes.

    Args:
        df: Frame with 'Team', 'XXTeam', 'Temp', 'XXTemp', 'W_L', 'XXW_L',
            'Elo' and 'XXElo' columns.
        index: Positional row of the game being described.
        home_away: Positional index of the 'Team' or 'XXTeam' column.
        quantile: Quantile (0-1) of the earlier 'Elo'/'XXElo' values used as
            the cut-off for a strong opponent.

    Returns:
        ``[win_pct, pt_dif_vs_opp, away_games_l5, pt_dif_vs_topx]``:
        win share in earlier games on the same side (as 'Team' or as 'XXTeam');
        points for minus against in earlier games versus this game's opponent;
        number of the team's last five games played as 'XXTeam';
        points for minus against versus opponents at or above the Elo cut-off.
        All four are None when the team has no earlier game on this side, or
        when ``home_away`` does not point at 'Team' or 'XXTeam'.
    """
    side = df.columns[home_away]
    if side == 'Team':
        opp_side, result_col = 'XXTeam', 'W_L'
    elif side == 'XXTeam':
        opp_side, result_col = 'Team', 'XXW_L'
    else:
        return [None, None, None, None]

    team = df.iloc[index, home_away]
    opponent = df[opp_side].iloc[index]
    subset = df.iloc[:index]

    games = subset[(subset['Team'] == team) | (subset['XXTeam'] == team)]
    side_games = games[games[side] == team]
    if len(side_games) == 0:
        # No earlier game on this side: keep the original all-None result.
        return [None, None, None, None]
    side_win_pct = int((side_games[result_col] == 1).sum()) / len(side_games)

    opp_games = subset[(subset['Team'] == opponent) | (subset['XXTeam'] == opponent)]
    pt_dif_vs_opp = (opp_games[opp_games['Team'] == team]['Temp'].sum()
                     + opp_games[opp_games['XXTeam'] == team]['XXTemp'].sum()
                     - games[games['Team'] == opponent]['Temp'].sum()
                     - games[games['XXTeam'] == opponent]['XXTemp'].sum())

    l5 = games.iloc[-5:]
    away_games_l5 = len(l5[l5['XXTeam'] == team])

    # Cut-offs are taken over all earlier rows, not just this team's games.
    elo_cut = subset['Elo'].quantile(quantile)
    xx_elo_cut = subset['XXElo'].quantile(quantile)
    home_vs_top = games[(games['Team'] == team) & (games['XXElo'] >= xx_elo_cut)]
    away_vs_top = games[(games['XXTeam'] == team) & (games['Elo'] >= elo_cut)]
    pt_dif_vs_topx = (home_vs_top['Temp'].sum() + away_vs_top['XXTemp'].sum()
                      - home_vs_top['XXTemp'].sum() - away_vs_top['Temp'].sum())

    return [side_win_pct, pt_dif_vs_opp, away_games_l5, pt_dif_vs_topx]

In [8]:
# Load the cleaned game table, then derive the team list (from the 'HomeTeam'
# column) and the season labels that the ranking cells iterate over.
clean_data_path = str(Path.cwd()) + '/lineup_data/cleanData.csv'
data = pd.read_csv(clean_data_path)
teams = data['HomeTeam'].unique()
seasons = [
    '13-14', '14-15', '15-16',
    '16-17', '17-18', '18-19',
    '19-20', '20-21', '21-22',
]

In [10]:
# Read the averages table and attach both sides' scores from `data` as the
# temporary 'Temp' / 'XXTemp' columns (aligned on the row index).
averages_path = str(Path.cwd()) + '/lineup_data/averagesFinal.csv'
final = pd.read_csv(averages_path).assign(
    Temp=data['Points'],
    XXTemp=data['XXPoints'],
)

In [11]:
# One frame per season label, in the same order as `seasons`.
final_by_season = []
data_by_season = []
for season in seasons:
    final_by_season.append(final[final['Season'] == season])
    data_by_season.append(data[data['Season'] == season])

In [12]:
for idx, df in enumerate(final_by_season):
    season = seasons[idx]
    game_positions = range(len(df))

    # OffRank:   'Points' averaged per 'HomeTeam', highest first.
    # DefRank:   'XXPoints' averaged per 'HomeTeam', lowest first.
    # XXOffRank: 'XXPoints' averaged per 'XXAwayTeam', highest first.
    # XXDefRank: 'Points' averaged per 'XXAwayTeam', lowest first.
    ranked = df.assign(
        OffRank=[stat_ranking(data, index, 'Points', season, 'off', 'HomeTeam')
                 for index in game_positions],
        DefRank=[stat_ranking(data, index, 'XXPoints', season, 'def', 'HomeTeam')
                 for index in game_positions],
        XXOffRank=[stat_ranking(data, index, 'XXPoints', season, 'off', 'XXAwayTeam')
                   for index in game_positions],
        XXDefRank=[stat_ranking(data, index, 'Points', season, 'def', 'XXAwayTeam')
                   for index in game_positions],
    )

    # Win flags for each side; both are 0 when the two scores are equal.
    season_games = data_by_season[idx]
    ranked['W_L'] = np.where(season_games['Points'] > season_games['XXPoints'], 1, 0)
    ranked['XXW_L'] = np.where(season_games['Points'] < season_games['XXPoints'], 1, 0)
    final_by_season[idx] = ranked

In [13]:
# Column positions that total_ranking uses to read the team being ranked.
home_team_col = list(data.columns).index('HomeTeam')
away_team_col = list(data.columns).index('XXAwayTeam')

for idx, df in enumerate(final_by_season):
    season = seasons[idx]
    game_positions = range(len(df))
    # Combined home+away offense and defense ranks, for the 'HomeTeam' side
    # (Total*) and the 'XXAwayTeam' side (XXTotal*).
    final_by_season[idx] = df.assign(
        TotalOffRank=[total_ranking(data, index, season, 'off', home_team_col)
                      for index in game_positions],
        TotalDefRank=[total_ranking(data, index, season, 'def', home_team_col)
                      for index in game_positions],
        XXTotalOffRank=[total_ranking(data, index, season, 'off', away_team_col)
                        for index in game_positions],
        XXTotalDefRank=[total_ranking(data, index, season, 'def', away_team_col)
                        for index in game_positions],
    )

In [14]:
# Adds, for the 'Team' side and the 'XXTeam' side of every game:
#   average points scored vs top 10 defenses   (sched_stren result [0])
#   average points allowed vs top 10 offenses  (sched_stren result [2])
# sched_stren is evaluated once per row and side and both values are taken from
# that single result, instead of recomputing it for each output column.
for idx, df in enumerate(final_by_season):
    team_col = list(df.columns).index('Team')
    xx_team_col = list(df.columns).index('XXTeam')
    home_stats = [sched_stren(df, index, team_col) for index in range(len(df))]
    away_stats = [sched_stren(df, index, xx_team_col) for index in range(len(df))]
    final_by_season[idx] = df.assign(
        PtsScoredTop10Def=[stats[0] for stats in home_stats],
        PtsAllowedTop10Off=[stats[2] for stats in home_stats],
        XXPtsScoredTop10Def=[stats[0] for stats in away_stats],
        XXPtsAllowedTop10Off=[stats[2] for stats in away_stats])

In [15]:
# Adds, for the 'Team' side and the 'XXTeam' side of every game:
#   W% vs top 10 defenses  (sched_stren result [1])
#   W% vs top 10 offenses  (sched_stren result [3])
# sched_stren is evaluated once per row and side and both values are taken from
# that single result, instead of recomputing it for each output column.
for idx, df in enumerate(final_by_season):
    team_col = list(df.columns).index('Team')
    xx_team_col = list(df.columns).index('XXTeam')
    home_stats = [sched_stren(df, index, team_col) for index in range(len(df))]
    away_stats = [sched_stren(df, index, xx_team_col) for index in range(len(df))]
    final_by_season[idx] = df.assign(
        WinPctTop10Def=[stats[1] for stats in home_stats],
        WinPctTop10Off=[stats[3] for stats in home_stats],
        XXWinPctTop10Def=[stats[1] for stats in away_stats],
        XXWinPctTop10Off=[stats[3] for stats in away_stats])

In [16]:
# Adds win share, point differential vs the current opponent, away games in the
# last five, and point differential vs opponents above the 50% / 75% Elo
# quantile, for both sides of every game.
# win_pct returns all four values at once, so it is evaluated once per
# (row, side, quantile) instead of once per output column.
for idx, df in enumerate(final_by_season):
    team_col = list(df.columns).index('Team')
    xx_team_col = list(df.columns).index('XXTeam')
    game_positions = range(len(df))
    home_q50 = [win_pct(df, index, team_col, .5) for index in game_positions]
    away_q50 = [win_pct(df, index, xx_team_col, .5) for index in game_positions]
    home_q75 = [win_pct(df, index, team_col, .75) for index in game_positions]
    away_q75 = [win_pct(df, index, xx_team_col, .75) for index in game_positions]
    final_by_season[idx] = df.assign(
        HomeWinPct=[stats[0] for stats in home_q50],
        XXAwayWinPct=[stats[0] for stats in away_q50],
        PtDifvsOppTeam=[stats[1] for stats in home_q50],
        XXPtDifvsOppTeam=[stats[1] for stats in away_q50],
        NumAwayLast5=[stats[2] for stats in home_q50],
        XXNumAwayLast5=[stats[2] for stats in away_q50],
        PtDifvsTop50=[stats[3] for stats in home_q50],
        XXPtDifvsTop50=[stats[3] for stats in away_q50],
        PtDifvsTop75=[stats[3] for stats in home_q75],
        XXPtDifvsTop75=[stats[3] for stats in away_q75])

In [17]:
# Stack the per-season frames back into one table, row-wise.
final_df = pd.concat(final_by_season, axis='index')

In [20]:
# Persist the feature table; the row index is written as the first CSV column.
final_updated_path = str(Path.cwd()) + '/lineup_data/final_updated.csv'
final_df.to_csv(final_updated_path)

In [3]:
def get_value(row, df):
    """Look up the rating that matches one lineup row.

    A match requires ``df['player_name'] == row['Starters']``,
    ``df['season'] == row['FG%']`` and ``df['team'] == row['FGA']``.

    Args:
        row: Mapping/Series with 'Starters', 'FG%' and 'FGA' entries.
        df: Ratings frame with 'player_name', 'season', 'team' and 'rating'.

    Returns:
        The single matching 'rating' value, or -1 when nothing matches.
        ``.item()`` raises ValueError if more than one row matches.
    """
    same_player = df['player_name'] == row['Starters']
    same_season = df['season'] == row['FG%']
    same_team = df['team'] == row['FGA']
    matches = df.loc[same_player & same_season & same_team, 'rating']
    if len(matches) == 0:
        return -1
    return matches.item()

In [4]:
def adj_raptor(row, lineups, home_away):
    """Sum the five highest ratings among the players available for one game.

    Args:
        row: Game row; ``row[home_away]`` is the game/team id to match.
        lineups: Frame whose 'FG' column holds that id, whose 'FTA' column
            holds the player's status text, and whose 'MP' column holds the
            player's rating (as filled in from ``get_value``).
        home_away: Key of ``row`` holding the id ('TeamID' or 'XXTeamID').

    Returns:
        Sum of the (up to) five largest 'MP' values for the matching rows whose
        status is not one of the inactive markers; 0 when no row matches.
    """
    # `x != ('a' or 'b' or ...)` only ever compared against the first string,
    # so the other three statuses were never excluded. isin() checks all four.
    inactive_statuses = ['Did Not Dress', 'Did Not Play', 'Not With Team', 'Player Suspended']
    available = (lineups['FG'] == row[home_away]) & ~lineups['FTA'].isin(inactive_statuses)
    ratings = list(lineups.loc[available, 'MP'])
    return sum(sorted(ratings, reverse=True)[:5])

In [5]:
# The feature cells above are slow (they loop over every row to build running
# averages), so their saved output is reloaded here instead of recomputed.
final_updated_path = str(Path.cwd()) + '/lineup_data/final_updated.csv'
final = pd.read_csv(final_updated_path).drop(columns=['Unnamed: 0'])

# Rewrite two team abbreviations on both sides of each game.
for team_col in ('Team', 'XXTeam'):
    for old_code, new_code in (('PHX', 'PHO'), ('BKN', 'BRK')):
        final.loc[final[team_col] == old_code, team_col] = new_code

# Game/team keys: the 'Date' string immediately followed by the team code.
final['TeamID'] = final['Date'].str.cat(final['Team'])
final['XXTeamID'] = final['Date'].str.cat(final['XXTeam'])

In [6]:
# NOTE: `seasons` is rebound here from the 'YY-YY' labels used earlier to
# integer years, one per call to get_season_lineups.
seasons = list(range(2014, 2023))
season_lineups = []
for season in seasons:
    season_lineups.append(get_season_lineups(season))












In [7]:
# Tag every lineup frame with its season year in the 'Age' column.
for i, season in enumerate(season_lineups):
    for idx, lineup in enumerate(season):
        season[idx] = lineup.assign(Age=seasons[i])

# Flatten all seasons into one frame.
lineups = pd.concat([lineup for season in season_lineups for lineup in season], axis=0)

# Normalise the 'Unnamed: 1' text: strip punctuation, then transliterate to ASCII.
# The pattern is a raw string so \w and \s are not read as (invalid) string escapes.
lineups['Unnamed: 1'] = lineups['Unnamed: 1'].str.replace(r'[^\w\s]', '', regex=True)
lineups['Unnamed: 1'] = lineups['Unnamed: 1'].apply(unidecode)

# Use 'CHA' instead of 'CHO' in the 'Rk' column.
lineups.loc[lineups['Rk'] == 'CHO', 'Rk'] = 'CHA'

In [8]:
raptor_path = str(Path.cwd()) + '/modern_RAPTOR_by_team.csv'
raptor = pd.read_csv(raptor_path)
raptor = raptor.loc[raptor['season_type'] == 'RS']

# Shift every raptor_total up by the absolute value of the lowest one, then
# weight by minutes ('mp') so that high ratings earned in very few minutes
# contribute little.
raptor_shift = abs(min(raptor['raptor_total']))
ratings = pd.DataFrame(data={
    'player_name': raptor['player_name'],
    'season': raptor['season'],
    'team': raptor['team'],
    'rating': (raptor['raptor_total'] + raptor_shift) * raptor['mp'],
})


def _rating_per_game(row):
    """Divide a rating by the matching 'G' value in `lineups`; None if no match."""
    games_played = lineups.loc[
        (lineups['Unnamed: 1'] == row['player_name'])
        & (lineups['Age'] == row['season'])
        & (lineups['Rk'] == row['team']),
        'G',
    ]
    if len(games_played) != 0:
        return row['rating'] / games_played.item()
    return None


# Dividing by the player's own games (not the team's total) keeps a rating from
# depending on how healthy the player was in that season.
ratings['rating'] = ratings.apply(_rating_per_game, axis=1)

In [13]:
data_path = str(Path.cwd()) + '/lineup_data/lineups'
# glob() yields files in an unspecified order, but the code below addresses
# files by position (0, 1, 8) and pairs file idx with seasons[idx]. Sorting by
# path makes that pairing deterministic.
# NOTE(review): assumes the file names sort in season order - confirm.
lineup_files = sorted(pathlib.Path(data_path).glob('*.xlsx'))
data_list = [pd.read_excel(str(file)) for file in lineup_files]

# Files 0, 1 and 8 take their column names from file 2; the first two rows of
# files 0 and 8 are dropped.
reference_columns = data_list[2].columns
for position in (0, 1, 8):
    data_list[position].columns = reference_columns
data_list[0] = data_list[0].drop([0, 1], axis=0)
data_list[8] = data_list[8].drop([0, 1], axis=0)

# Here I format the game lineup data so that I can match each player's rating
# and then assign that rating.
for idx, df in enumerate(data_list):
    df = df.reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
    # Raw string so \w and \s are not read as (invalid) string escapes.
    df['Starters'] = df['Starters'].str.replace(r'[^\w\s]', '', regex=True)
    df['Starters'] = df['Starters'].apply(unidecode)
    df['FG'] = df['FG'].str.replace(r'CHO', 'CHA', regex=True)
    # 'FG%' is reused to carry the season year and 'MP' the looked-up rating;
    # get_value reads 'Starters', 'FG%' and 'FGA' from each row.
    df['FG%'] = seasons[idx]
    df['MP'] = df.apply(lambda row: get_value(row, ratings), axis=1)
    data_list[idx] = df

In [14]:
# Combine the per-file game lineups, then score each side of every game with
# adj_raptor, keyed by that side's game/team id column.
lineups = pd.concat(data_list, axis='index')
for rating_col, id_col in (('AdjRaptor', 'TeamID'), ('XXAdjRaptor', 'XXTeamID')):
    final[rating_col] = final.apply(
        lambda row, key=id_col: adj_raptor(row, lineups, key), axis=1)

In [17]:
# Write the table back to the same CSV it was loaded from (index included).
final_output_path = str(Path.cwd()) + '/lineup_data/final_updated.csv'
final.to_csv(final_output_path)