## Player data analysis

## 1. Demographic data
1. Average age through the seasons, with stacked bar charts
2. Average weight and height, plus normal distributions for each season
3. Origin of players by country

In [None]:
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime

import plotly 
import plotly.graph_objects as go
import time

sys.path.insert(0, "./../../src/")
from data_loader import load_data, load_player_data

# Load the merged player data.
# Alternative (disabled): rebuild the frame through the project loader and rename its columns:
#   player_df = load_player_data().reset_index()
#   player_df.columns = ["Player ID", "season_name", "player_name", "Age", "Height", "Weight", "College",
#          "Country", "Draft Year", "Draft Number", "Draft Number", "games_played",
#          "fg_made", "fg_missed", "3PT_made", "3PT_missed", "ft_made",
#          "ft_missed", "points", "rebounds", "assists", "turnover", "fouls"]
# NOTE(review): "Draft Number" appears twice in that list; one of the two is presumably a
# different draft column. Confirm the intended names before re-enabling this path.

# perf_counter() is a monotonic, high-resolution clock, so the measured duration cannot be
# distorted by system clock adjustments the way a time.time() difference can.
s_time = time.perf_counter()

# read it from file
player_df = pd.read_csv("merged_player_data_v1.csv")
print(f"Execution took {time.perf_counter() - s_time} seconds")

In [None]:
# Preprocessing: convert imperial measurements to metric and drop rows with missing measurements.
# NOTE: this cell rewrites player_df in place, so it is not idempotent; reload the data before
# running it a second time.
# find nan values: player_df[pd.to_numeric(player_df["Weight"], errors="coerce").isna()]

# Pounds to kg. A "-" entry marks a missing weight and is mapped to 0 so it can be filtered below.
player_df["Weight"] = player_df["Weight"].replace("-", "0")
player_df["Weight"] = (player_df["Weight"].astype(int) * 0.45359237).astype(int)

# Feet-inches strings ("6-7") to cm. Empty parts (e.g. from a bare "-") are treated as 0.
# expand=True splits straight into columns instead of building one pd.Series per row.
split_data = player_df["Height"].str.split("-", expand=True).replace("", "0").astype(float)
player_df["Height"] = (split_data[0] * 30.48 + split_data[1] * 2.54).astype(int)

rows_before = len(player_df.index)
# Remove rows where height OR weight is zero, since either one skews the averages.
# Keeping a row therefore requires BOTH values to be non-zero (hence "&", not "|").
player_df = player_df[(player_df["Height"] != 0) & (player_df["Weight"] != 0)]
print(f"Removed {rows_before - len(player_df.index)} rows with 0 values!")

display(player_df)

## Average player information

In [None]:
# Per-season averages of the selected demographic columns.
selected_cols = ["Age", "Height", "Weight", "games_played", "season_name"]
grouped_by_season = player_df[selected_cols].groupby("season_name")
season_avg = grouped_by_season.mean()
# season_sum = grouped_by_season.sum()

# Expand "YYYY-YY" season labels to "YYYY-YYYY" for the chart axes. The end year is derived
# from the start year rather than by inserting a fixed "20", which would turn "1998-99" into
# "1998-2099". Labels that do not match the "YYYY-YY" pattern are left unchanged.
season_avg.index = season_avg.index.str.replace(
    r"^(\d{4})-\d{2}$",
    lambda match: f"{match.group(1)}-{int(match.group(1)) + 1}",
    regex=True,
)
display(season_avg)

In [None]:
# Overall mean and standard deviation of the per-season averages, then one bar chart per column.
print("Averages ")
print(season_avg.mean(), "\n\nStd", season_avg.std())
# fig_sub = make_subplots(rows=1, cols=3)


def _season_bar_chart(column, title, y_label):
    """Build a bar chart of ``season_avg[column]`` per season with the given title and y-axis label."""
    chart = go.Figure([go.Bar(x=season_avg.index, y=season_avg[column])])
    chart.update_layout(title_text=title, xaxis_title="Season", yaxis_title=y_label)
    return chart


fig_avg_age = _season_bar_chart(
    "Age", "Bar chart of average age through the seasons", "Age in years"
)
fig_avg_age.show()

fig_avg_h = _season_bar_chart(
    "Height", "Bar chart of average height through the seasons", "Height in cm"
)
fig_avg_h.show()

fig_avg_w = _season_bar_chart(
    "Weight", "Bar chart of average weight through the seasons", "Weight in kg"
)
fig_avg_w.show()

fig_avg_g = _season_bar_chart(
    "games_played", "Bar chart of average games played through the seasons", "Number of games played"
)
fig_avg_g.show()

## Scatter of height / weight correlations

In [None]:
# Height against weight for the 2018-19 season; each marker carries the player's name as its text.
df_2018_s = player_df[player_df["season_name"] == "2018-19"]
# display(df_2018_s)
fig_hw = go.Figure([
    go.Scatter(
        x=df_2018_s["Weight"],
        y=df_2018_s["Height"],
        text=df_2018_s["player_name"],
        mode="markers",
    )
])
# The plot shows height vs. weight (not age), so the title says so; units match the
# kg / cm conversion done in the preprocessing cell.
fig_hw.update_layout(
    title_text="Height to weight correlation for season 2018-19",
    xaxis_title="Weight in kg",
    yaxis_title="Height in cm",
)
fig_hw.show()

In [None]:
# Distribution of age, weight and height for the 2018-19 season, one histogram per variable.
# Each figure gets a title naming the variable it actually shows, and a separate name
# (fig_hist) is used so the height/weight scatter stored in fig_hw is not overwritten.
for column, x_label in [
    ("Age", "Age in years"),
    ("Weight", "Weight in kg"),
    ("Height", "Height in cm"),
]:
    fig_hist = go.Figure([go.Histogram(x=df_2018_s[column])])
    fig_hist.update_layout(
        title_text=f"{column} distribution for season 2018-19",
        xaxis_title=x_label,
        yaxis_title="Count",
    )
    fig_hist.show()

## Correlations between performance and the demographic variables (age, height, weight)

In [None]:
import matplotlib.pyplot as plt

# Correlate numeric columns only: non-numeric columns such as player_name cannot be correlated,
# and pandas >= 2.0 raises on them instead of silently dropping them.
# The matrix is computed once and reused for both the plot and the table.
corr_matrix = player_df.select_dtypes(include="number").corr()

plt.matshow(corr_matrix)
# Label both axes with the column names so individual cells of the matrix can be identified.
tick_positions = range(len(corr_matrix.columns))
plt.xticks(tick_positions, corr_matrix.columns, rotation=90)
plt.yticks(tick_positions, corr_matrix.columns)
plt.colorbar()
plt.show()

display(corr_matrix)