# Detailed player analysis

In [None]:
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime

import plotly 
import plotly.graph_objects as go
import time

sys.path.insert(0, "./../../src/")
from data_loader import load_data, load_player_data, player_data_preprocessing

# Load the raw player table from the project data directory.
player_df = pd.read_csv("../../data/raw/player-data/player_info.csv")
# "Unnamed: 0" is presumably the index column written by an earlier to_csv call;
# it carries no information needed by the analysis below.
player_df = player_df.drop(columns=["Unnamed: 0"])

# Keep only the seasons whose starting year (the part of "Season" before the
# dash) is earlier than 2019.
season_start_year = player_df["Season"].str.split("-").str[0].astype(int)
# Take an explicit copy of the filtered rows: later cells add new columns to
# player_df, and assigning into a filtered selection would otherwise trigger
# pandas' SettingWithCopyWarning.
player_df = player_df[season_start_year < 2019].copy()
player_df

# Distribution of stays in the NBA

In [None]:
# Count the non-null "Season" entries of every player and order the players
# from the longest to the shortest stay.
seasons_per_player = player_df.groupby("Player ID")[["Season"]].count()
grouped_player = seasons_per_player.sort_values(by="Season", ascending=False)
grouped_player = grouped_player.rename(columns={"Season": "season_count"})

average_stay = grouped_player["season_count"].mean()
print(f"Average stay in NBA is: {average_stay}")

# Histogram of the number of seasons per player.
stay_layout = dict(
    title_text="Distribution of player stays in NBA",
    xaxis_title="Seasons in NBA",
    yaxis_title="Count",
)
fig_hw = go.Figure(data=[go.Histogram(x=grouped_player["season_count"])])
fig_hw.update_layout(**stay_layout)
fig_hw.show()

In [None]:
# Attach player names to the season counts (grouped_player is already ordered
# by season_count, descending) and keep the first 13 distinct
# (player, season_count) pairs. Note: despite its name, the variable holds 13 rows.
players_with_season_counts = grouped_player.reset_index().merge(player_df, on="Player ID")
top_10_longest_in_nba = (
    players_with_season_counts[["Player", "season_count"]].drop_duplicates().iloc[:13]
)

# Bar chart: one bar per player, height = number of seasons.
longest_stay_bars = go.Bar(
    x=top_10_longest_in_nba["Player"],
    y=top_10_longest_in_nba["season_count"],
)
fig_countries = go.Figure(data=[longest_stay_bars])
fig_countries.update_layout(
    title_text="Top 13 players with the most seasons played (2000-2019)",
    yaxis_title="Count",
    xaxis_title="Player",
)
fig_countries.update_traces(textposition="outside")
fig_countries.show()

display(top_10_longest_in_nba)

In [None]:
# Names of the players shown in the bar chart, as a plain Python list.
top_10_longest_in_nba["Player"].tolist()

## Club changes 
Finding the players with the most club changes and the clubs with the highest player count.  
   
**Note:** club changes in the middle of a season cannot be taken into account, since we only have one data point per season.

In [None]:
unique_teams = player_df['Team'].unique()
print(f"Number of unique teams {len(unique_teams)}: {unique_teams}")


def _count_club_changes(teams):
    """Return how many times two consecutive entries of *teams* differ."""
    return sum(prev_club != club for prev_club, club in zip(teams, teams[1:]))


# Collect every player's teams as a list. The rows are sorted by season first
# (stable, so ties keep their file order) so that each list is chronological;
# otherwise the change count would depend on the row order of the CSV file.
player_grouped_clubs = (
    player_df.sort_values("Season", kind="stable")
    .groupby("Player ID")
    .agg({"Team": list})
)

# Sanity check: every player must have at least one team entry. The offending
# player ids (the index of the grouped frame) are reported in the exception.
players_without_club = player_grouped_clubs.index[
    player_grouped_clubs["Team"].map(len) < 1
]
if len(players_without_club) > 0:
    raise Exception("Player was not in any club!!!", list(players_without_club))

# Number of times the team differs between two consecutive recorded seasons.
player_grouped_clubs["Club changes"] = (
    player_grouped_clubs["Team"].map(_count_club_changes).astype(int)
)
display(player_grouped_clubs)

Histogram of club changes.

In [None]:
# Summary statistics of the per-player club change counts.
club_changes_per_player = player_grouped_clubs["Club changes"]
print("Median club changes: ", club_changes_per_player.median())
print(club_changes_per_player.describe())

# Histogram of the club change counts, with a small gap between the bars.
fig_clubs = go.Figure(data=[go.Histogram(x=club_changes_per_player)])
fig_clubs.update_layout(
    title_text="Distribution of club changes",
    yaxis_title="Count",
    xaxis_title="Club changes (0 means that they stayed in the same club they began their career in)",
    bargap=0.2,
)
fig_clubs.show()

Players with most club changes.

In [None]:
# Attach player names to the club change counts; one row per distinct
# (Player, Player ID, Club changes) combination.
club_changes_with_names = player_grouped_clubs.reset_index().merge(player_df, on="Player ID")
merged_club_changes = club_changes_with_names[
    ["Player", "Player ID", "Club changes"]
].drop_duplicates()
sorted_by_club_change = merged_club_changes.sort_values(by="Club changes", ascending=False)

# Author's note: the first 14 rows cover all players that changed more than 8 clubs.
display(sorted_by_club_change.iloc[:14])
# To export the table: sorted_by_club_change[:14].to_csv("temp_table.csv", index=False)

In [None]:
# All season rows of the player with id 1730.
player_df.loc[player_df["Player ID"] == 1730]

# Finding clubs with the most and least players in each season

In [None]:
unique_teams = player_df["Team"].unique()
print(f"Number of unique teams {len(unique_teams)}: {unique_teams}")

# Group by team AND season, because we want the player count of every club
# in every season; count() gives the non-null entries per group.
players_by_team_season = player_df.groupby(["Team", "Season"])
clubs_grouped = players_by_team_season[["Player ID", "Player"]].count()
clubs_grouped

In [None]:
# Histogram of the (team, season) player counts, with a small gap between bars.
club_size_histogram = go.Histogram(x=clubs_grouped["Player ID"])
fig_club_size = go.Figure(data=[club_size_histogram])
fig_club_size.update_layout(
    title_text="Number of player per club in all seasons",
    yaxis_title="Count",
    xaxis_title="Number of players in club",
    bargap=0.2,
)
fig_club_size.show()

Clubs that had the most and the fewest players in any single season.

In [None]:
# The 3 (team, season) groups with the most players and the 8 with the fewest.
most_populated_clubs = clubs_grouped.nlargest(3, "Player ID")
least_populated_clubs = clubs_grouped.nsmallest(8, "Player ID")

print("Clubs with highest player count")
display(most_populated_clubs)
print("Clubs with lowest player count")
display(least_populated_clubs)
# To export the table: clubs_grouped.nsmallest(8, "Player ID")["Player"].to_csv("temp_table.csv")

# Summary statistics of the player count over all (team, season) groups.
players_per_club = clubs_grouped["Player ID"]
print("Median players in club", players_per_club.median())
print(players_per_club.describe())

# Average player count in clubs through the seasons 

In [None]:
# Author's note: the median is 15 in all seasons.
# Aggregate the per-club player counts by season ("Season" is an index level
# of clubs_grouped).
players_per_season = clubs_grouped["Player ID"].groupby("Season")
players_in_club_per_year_median = players_per_season.median()
players_in_club_per_year_mean = players_per_season.mean()

# Insert "20" after the dash of each season label for the x axis.
for per_season_stat in (players_in_club_per_year_median, players_in_club_per_year_mean):
    per_season_stat.index = per_season_stat.index.str.replace("-", "-20")

# One line per statistic.
season_traces = [
    go.Scatter(x=stat.index, y=stat.values, name=label)
    for label, stat in (
        ("Median", players_in_club_per_year_median),
        ("Mean", players_in_club_per_year_mean),
    )
]
fig_club_changes = go.Figure(data=season_traces)
fig_club_changes.update_layout(
    title_text="Players per club (median-mean)",
    yaxis_title="Players in club",
    xaxis_title="Season",
)
# Start the y axis at zero.
fig_club_changes.update_yaxes(rangemode="tozero")
fig_club_changes.show()

In [None]:
# TODO: mention in the write-up that a player can change clubs mid-season.
# TODO: look at the correlation between club changes and player statistics.
player_df.loc[player_df["Player"] == "Mike James"]

## Correlation between club changes and points scored
Column names meaning:   
GP - Games Played   
PTS - Points     
REB - Rebounds   
AST - Assists   
NetRtg - Net Rating   
OREB% - Offensive Rebound Percent    
DREB% - Defensive Rebound Percent   
USG% - Usage Percent   
TS% - True Shooting Percentage   
AST% - Assist Percent  

In [None]:
# Season label with "20" inserted after the dash, e.g. "2000-01" -> "2000-2001".
player_df["Season_20"] = player_df["Season"].str.replace("-", "-20")

# Nobody can change clubs in the first season we have data for.
FIRST_SEASON = "2000-2001"


def _previous_season(season_20):
    """Return the label of the season before *season_20* ("2001-2002" -> "2000-2001")."""
    season_years = season_20.split("-")
    return str(int(season_years[0]) - 1) + "-" + str(int(season_years[1]) - 1)


# Team of every (player, season) pair, built once so that the previous season's
# team is a dictionary lookup instead of a scan over the whole frame for every
# row. If a pair occurs more than once, the first row in player_df wins.
team_by_player_season = (
    player_df.drop_duplicates(subset=["Player ID", "Season_20"])
    .set_index(["Player ID", "Season_20"])["Team"]
    .to_dict()
)

# Boolean column: True if the player was present in the directly preceding
# season and played for a different team there.
changed_club_flags = []
for player_id, season_20, team in zip(
    player_df["Player ID"], player_df["Season_20"], player_df["Team"]
):
    if season_20 == FIRST_SEASON:
        changed_club_flags.append(False)
        continue

    prev_key = (player_id, _previous_season(season_20))
    changed_club_flags.append(
        prev_key in team_by_player_season
        and str(team_by_player_season[prev_key]) != str(team)
    )

player_df["changed_club"] = changed_club_flags
player_df

In [None]:
# Statistics columns plotted for every player; add 'NetRtg' to the list if needed.
metrics_col_arr = ["GP", "PTS", "REB", "AST"]
# Rows of the single player with id 1630; uncomment the display to inspect them.
only_player_rows = player_df.loc[player_df["Player ID"] == 1630]
# display(only_player_rows)

def plot_player_performance(df, player_ids, metrics=None):
    """Plot per-season statistics of each player and mark club changes.

    For every id in *player_ids* one figure is shown: one line per metric
    column over the player's seasons, plus a dashed vertical line at every
    season whose "changed_club" value is true.

    Args:
        df: DataFrame with the columns "Player ID", "Player", "Season_20",
            "changed_club" and every metric column.
        player_ids: Iterable of "Player ID" values to plot.
        metrics: Optional list of column names to plot; defaults to the
            notebook-level ``metrics_col_arr``.

    Player ids that do not occur in *df* are reported and skipped.
    """
    for p_id in player_ids:
        # Sort chronologically so the lines are drawn in season order
        # regardless of the row order of df.
        player_rows = df[df["Player ID"] == p_id].sort_values("Season_20", kind="stable")
        if player_rows.empty:
            # Previously this case failed with an IndexError on .iloc[0].
            print(f"Player id {p_id} not found, skipping")
            continue

        player_name = player_rows["Player"].iloc[0]
        print(f"Player id {p_id}, name {player_name}")

        metric_columns = metrics_col_arr if metrics is None else metrics
        metric_traces = [
            go.Scatter(x=player_rows["Season_20"], y=player_rows[column], name=column)
            for column in metric_columns
        ]
        fig = go.Figure(metric_traces)

        # Dashed vertical line at every season in which the club changed.
        changed_mask = player_rows["changed_club"].astype(bool)
        for season_str in player_rows.loc[changed_mask, "Season_20"].values:
            fig.add_vline(x=season_str, line_dash="dash", opacity=0.75)

        fig.update_layout(
            title_text=f"{player_name} statistics through the seasons, with marked club changes",
            yaxis_title="Value",
            xaxis_title="Season",
        )
        fig.update_yaxes(rangemode="tozero")
        fig.show()
        
# Ids of the 14 players with the most club changes, followed by three
# hand-picked ids (presumably the superstar ids printed by the lookup cell
# further down; verify against its output).
most_changes_ids = list(sorted_by_club_change["Player ID"].iloc[:14].values)
top_14_player_ids = most_changes_ids + [2544, 201939, 202695]
plot_player_performance(player_df, top_14_player_ids)

In [None]:
# Plan only (not implemented here):
# We want to know how a player's score increased or decreased when a club
# change happened. Idea 1: for each player, wherever changed_club is true,
# compare the scores before the change with the scores after it.
# Idea 2: take the difference between consecutive (sorted) values and compare
# the rows where changed_club is false with the rows where it is true.

# Look up the ids of the superstars.
superstar_names = ["LeBron James", "Stephen Curry", "Kawhi Leonard"]
is_superstar = player_df["Player"].isin(superstar_names)
display(player_df.loc[is_superstar, "Player ID"].unique())

In [None]:
# Per player: chronological lists of seasons, club-change flags and points.
grouped_arrs_df = (
    player_df.sort_values("Season_20")
    .groupby(["Player ID"])
    .agg({"Season_20": list, "changed_club": list, "PTS": list})
)


def _mean_or_nan(values):
    """Return the arithmetic mean of *values*, or NaN when the list is empty."""
    return sum(values) / len(values) if values else np.nan


# Average points per player, split into the seasons that started with a club
# change and the seasons without one. The mean is taken over the number of
# seasons in each category; a category with no seasons at all stays NaN.
# (The previous "> 1" guards also discarded players with exactly one season in
# a category, contradicting the stated intent of "any seasons played".)
club_change_means = []
no_club_change_means = []
for changed_flags, season_points in zip(grouped_arrs_df["changed_club"], grouped_arrs_df["PTS"]):
    changed_pts = [pts for changed, pts in zip(changed_flags, season_points) if changed]
    stayed_pts = [pts for changed, pts in zip(changed_flags, season_points) if not changed]
    club_change_means.append(_mean_or_nan(changed_pts))
    no_club_change_means.append(_mean_or_nan(stayed_pts))

grouped_arrs_df["club_change_pts"] = club_change_means
grouped_arrs_df["no_club_change_pts"] = no_club_change_means
display(grouped_arrs_df)

In [None]:
# Summary statistics of the two per-player point averages.
for pts_column in ("club_change_pts", "no_club_change_pts"):
    display(grouped_arrs_df[pts_column].describe())

print("Combined average points")
# NOTE(review): this is the SUM of the two per-player averages (NaN when either
# one is missing), not an average over both - confirm that this is intended.
combined_pts = grouped_arrs_df["club_change_pts"] + grouped_arrs_df["no_club_change_pts"]
display(combined_pts.describe())

In [None]:
# One box per category: average points in seasons without / with a club change.
box_plot_arr = [
    go.Box(y=grouped_arrs_df[pts_column], name=box_label)
    for pts_column, box_label in (
        ("no_club_change_pts", "Stayed in club"),
        ("club_change_pts", "Changed club"),
    )
]

fig_club_changes = go.Figure(data=box_plot_arr)
fig_club_changes.update_layout(
    title_text="Impact of club change on point average by players",
    yaxis_title="Points scored",
)
# Start the y axis at zero.
fig_club_changes.update_yaxes(rangemode="tozero")
fig_club_changes.show()