# Detailed player analysis

In [None]:
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime

import plotly 
import plotly.graph_objects as go
import time

sys.path.insert(0, "./../../src/")
from data_loader import load_data, load_player_data, player_data_preprocessing

# Read the raw per-season player table. The CSV carries a stray "Unnamed: 0"
# column (presumably a saved index - TODO confirm), which is dropped right away.
player_df = pd.read_csv("../../data/raw/player-data/player_info.csv").drop(
    columns=["Unnamed: 0"]
)

# The text before the "-" in "Season" is parsed as an integer year; only rows
# whose value is below 2019 are kept for the rest of the analysis.
season_start_year = player_df["Season"].str.split("-").str[0].astype(int)
player_df = player_df[season_start_year < 2019]
player_df

# Distribution of stays in the NBA

In [None]:
# Count the season rows recorded for each player and order the players from
# the highest count to the lowest.
seasons_per_player = player_df[["Season", "Player ID"]].groupby("Player ID").count()
grouped_player = seasons_per_player.sort_values(by="Season", ascending=False).rename(
    columns={"Season": "season_count"}
)

average_stay = grouped_player["season_count"].mean()
print(f"Average stay in NBA is: {average_stay}")

# Histogram of how many seasons each player has on record.
stay_histogram = go.Histogram(x=grouped_player["season_count"])
fig_hw = go.Figure([stay_histogram])
fig_hw.update_layout(
    title_text="Distribution of player stays in NBA",
    xaxis_title="Seasons in NBA",
    yaxis_title="Count",
)
fig_hw.show()

In [None]:
# Attach player names to the per-player season counts. The merge yields one
# row per (player, season), so duplicates are removed before taking the first
# 13 rows (grouped_player is already ordered by season count, descending).
# NOTE: despite its name, this frame holds 13 players, not 10.
players_with_counts = grouped_player.reset_index().merge(player_df, on="Player ID")
top_10_longest_in_nba = (
    players_with_counts[["Player", "season_count"]].drop_duplicates().head(13)
)

# Bar chart: one bar per player, bar height = number of seasons on record.
seasons_bar = go.Bar(
    x=top_10_longest_in_nba["Player"],
    y=top_10_longest_in_nba["season_count"],
)
fig_countries = go.Figure([seasons_bar])
fig_countries.update_layout(
    title_text="Top 13 players with the most seasons played (2000-2019)",
    xaxis_title="Player",
    yaxis_title="Count",
)
fig_countries.update_traces(textposition='outside')
fig_countries.show()

display(top_10_longest_in_nba)

In [None]:
# Names of the players shown in the bar chart, as a plain Python list.
top_10_longest_in_nba["Player"].tolist()

## Club changes 
Finding the players with the most club changes and the clubs with the highest player count.  
   
**Note:** club changes that happen mid-season cannot be taken into account, since we only have one data point per season.

In [None]:
unique_teams = player_df['Team'].unique()
print(f"Number of unique teams {len(unique_teams)}: {unique_teams}")


def _count_club_changes(teams):
    """Return how many times consecutive entries of *teams* differ.

    Args:
        teams: List of team labels for one player, in the order they occurred.

    Returns:
        The number of positions where a team differs from the one before it
        (0 for a single-entry list).
    """
    return sum(previous != current for previous, current in zip(teams, teams[1:]))


# Counting changes between consecutive entries only makes sense when the rows
# are in chronological order, so sort by season first. "Season" strings begin
# with the starting year, so a plain string sort is chronological; the stable
# sort keeps the existing relative order of rows that share a season.
player_grouped_clubs = (
    player_df.sort_values("Season", kind="mergesort")
    .groupby("Player ID")
    .agg({"Team": list})
)
# display(player_grouped_clubs)

club_changes = []
for player_id, teams in player_grouped_clubs["Team"].items():
    if not teams:
        # The grouped frame only has a "Team" column, so the player is
        # identified by the group key (Player ID).
        raise Exception("Player was not in any club!!!", player_id)
    club_changes.append(_count_club_changes(teams))

# Assign all counts at once as an integer column.
player_grouped_clubs["Club changes"] = pd.Series(
    club_changes, index=player_grouped_clubs.index, dtype=int
)
display(player_grouped_clubs)

Histogram of club changes.

In [None]:
# Summary statistics of the per-player club-change counts.
club_change_counts = player_grouped_clubs["Club changes"]
print("Median club changes: ", club_change_counts.median())
print(club_change_counts.describe())

# Histogram of the same counts; bargap leaves a visible gap between bars.
club_change_histogram = go.Histogram(x=player_grouped_clubs["Club changes"])
fig_clubs = go.Figure([club_change_histogram])
fig_clubs.update_layout(
    title_text="Distribution of club changes per player",
    xaxis_title="Club changes (0 means that they stayed in the same club they began their career in)",
    yaxis_title="Count",
    bargap=0.2,
)
fig_clubs.show()


Players with the most club changes.

In [None]:
# Join the per-player club-change counts back onto the season rows to pick up
# player names, then collapse to one row per (Player, Player ID, Club changes).
club_changes_with_names = player_grouped_clubs.reset_index().merge(
    player_df, on="Player ID"
)
merged_club_changes = club_changes_with_names[
    ["Player", "Player ID", "Club changes"]
].drop_duplicates()
sorted_by_club_change = merged_club_changes.sort_values("Club changes", ascending=False)

# Show the first 14 rows; per the original author's note, that covers all the
# players with more than 8 club changes.
display(sorted_by_club_change.head(14))

# Finding clubs with the most and fewest players in each season

In [None]:
unique_teams = player_df["Team"].unique()
team_total = len(unique_teams)
print(f"Number of unique teams {team_total}: {unique_teams}")

# Group by team and season together, because the player count is wanted per
# team for every individual season. count() tallies the non-null entries in
# each selected column.
counted_columns = ["Player ID", "Player"]
clubs_grouped = player_df.groupby(["Team", "Season"])[counted_columns].count()
clubs_grouped

In [None]:
# Histogram of roster sizes: one observation per (team, season) pair.
club_size_histogram = go.Histogram(x=clubs_grouped["Player ID"])
fig_club_size = go.Figure([club_size_histogram])
fig_club_size.update_layout(
    title_text="Number of player in one club in all seasons",
    xaxis_title="Number of players in club",
    yaxis_title="Count",
    bargap=0.2,
)
fig_club_size.show()

Clubs that had the most and the fewest players in any single season.

In [None]:
# The ten (team, season) pairs with the most players, then the ten with the
# fewest, ranked by the "Player ID" count.
print("Clubs with highest player count")
display(clubs_grouped.nlargest(10, "Player ID"))
print("Clubs with lowest player count")
display(clubs_grouped.nsmallest(10, "Player ID"))

# Overall summary of the per-(team, season) player counts.
players_per_club = clubs_grouped["Player ID"]
print("Median players in club", players_per_club.median())
print(players_per_club.describe())