## Player data analysis

## 1. Demographic data
1. Average age through the seasons and stacked bar charts
2. Average weight and height, + normal distributions for each season
3. Players' countries of origin

In [None]:
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime

import plotly 
import plotly.graph_objects as go
import time

sys.path.insert(0, "./../../src/")
from data_loader import load_data, load_player_data, player_data_preprocessing

# Build the player table through the shared src helper (the preprocessing
# used to live in this notebook) and report how long the call took.
s_time = time.time()
player_df = player_data_preprocessing()
elapsed_seconds = time.time() - s_time

print(f"Execution took {elapsed_seconds} seconds")

# Quick look at the first rows to sanity-check the result.
display(player_df.head())

In [None]:
# Write the preprocessed table to disk, then list the available columns.
output_csv_path = "player_info_v3.csv"
player_df.to_csv(output_csv_path)
display(player_df.columns)

## Average player information

In [None]:
# Per-season mean of the demographic / activity columns.
metric_cols = ["Age", "Height", "Weight", "games_played"]
selected_cols = metric_cols + ["season_name"]
grouped_by_season = player_df[selected_cols].groupby("season_name")
season_avg = grouped_by_season.mean()
# (grouped_by_season.sum() would give the per-season totals instead.)

# Expand the short season label, e.g. "2018-19" becomes "2018-2019".
season_labels = season_avg.index.str.replace("-", "-20")
season_avg.index = season_labels
display(season_avg)

In [None]:
# Mean and standard deviation of the per-season averages, column by column.
overall_mean = season_avg.mean()
overall_std = season_avg.std()
print("Averages ")
print(overall_mean, "\n\nStd", overall_std)

# Disabled on purpose: the four per-season bar charts (average age, height,
# weight and games played) are wrapped in a string literal so they are not
# executed. Original author's note: hidden because the bar charts don't change.
# The trailing `;` keeps the notebook from echoing the string as cell output.
"""fig_avg_age = go.Figure([go.Bar(x=season_avg.index, y=season_avg["Age"])])
fig_avg_age.update_layout(
    title_text="Bar chart of average age through the seasons", 
    xaxis_title="Season",
    yaxis_title="Age in years"
)
fig_avg_age.show()

fig_avg_h = go.Figure([go.Bar(x=season_avg.index, y=season_avg["Height"])])
fig_avg_h.update_layout(
    title_text="Bar chart of average height through the seasons", 
    xaxis_title="Season",
    yaxis_title="Height in cm"
)
fig_avg_h.show()

fig_avg_w = go.Figure([go.Bar(x=season_avg.index, y=season_avg["Weight"])])
fig_avg_w.update_layout(
    title_text="Bar chart of average weight through the seasons", 
    xaxis_title="Season",
    yaxis_title="Weight in kg"
)
fig_avg_w.show()

fig_avg_g = go.Figure([go.Bar(x=season_avg.index, y=season_avg["games_played"])])
fig_avg_g.update_layout(
    title_text="Bar chart of average games played through the seasons", 
    xaxis_title="Season",
    yaxis_title="Number of games played"
)
fig_avg_g.show()""";

## Scatter plot of height vs. weight

In [None]:
# Restrict the analysis to a single season.
is_2018_season = player_df["season_name"] == "2018-19"
df_2018_s = player_df[is_2018_season]

# Marker size scaled by points: the top scorer maps to 20, zero points to 5.
points_2018 = df_2018_s["points"]
size_col = (points_2018 / points_2018.max()) * 15 + 5

# Colour palette shared by the plots below.
colors_arr = [
    '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A',
    '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52',
]
# Countries that get their own trace / colour.
countries_arr = ["USA", "Australia", "Canada", "France"]
def SetColor(x):
    """Return the palette colour for country name ``x``.

    ``x`` is stripped of surrounding whitespace and compared against the
    first four entries of ``countries_arr``; the colour at the matching
    position in ``colors_arr`` is returned. Names that match none of them
    fall through and yield ``None``.
    """
    country = x.strip()
    highlighted_count = 4  # only the first four countries have a colour
    for position in range(highlighted_count):
        if country == countries_arr[position]:
            return colors_arr[position]
    return None

# Country -> colour lookup for the highlighted countries; every other
# country is drawn in a single "Other" trace using colors_arr[4].
color_dict = {"USA": colors_arr[0],  "Australia": colors_arr[1], "Canada": colors_arr[2],
             "France": colors_arr[3]}


def _add_jittered_trace(fig, df, name, color):
    """Add one weight/height scatter trace for the rows of ``df`` to ``fig``.

    Each axis gets its OWN zero-mean standard-normal jitter. Reusing a single
    noise vector for both x and y (as before) shifts every point along the
    diagonal only, which visually fakes a height/weight correlation instead
    of merely separating overlapping markers.

    Parameters:
        fig: plotly figure the trace is appended to.
        df: player rows with "Weight", "Height", "player_name", "points".
        name: legend label of the trace.
        color: marker colour for every point of the trace.
    """
    n_points = len(df)
    weight_jitter = np.random.randn(n_points)
    height_jitter = np.random.randn(n_points)

    fig.add_trace(go.Scatter(
        x=df["Weight"] + weight_jitter,
        y=df["Height"] + height_jitter,
        text=df["player_name"].astype(str) + " - Points:" + df["points"].astype(str),
        marker=dict(size=5, color=color),
        mode='markers',
        name=name,
        showlegend=True,
    ))


fig_hw = go.Figure()

# One trace per highlighted country so each gets its own legend entry.
for country_name in countries_arr:
    df_2018_country = df_2018_s[df_2018_s["Country"] == country_name]
    # All rows of this trace share one country, so a single colour suffices;
    # .get() keeps the old "no colour" fallback for countries without an entry.
    _add_jittered_trace(fig_hw, df_2018_country, country_name,
                        color_dict.get(country_name))

# Everything that is not one of the highlighted countries.
df_2018_others = df_2018_s[~df_2018_s["Country"].isin(countries_arr)]
_add_jittered_trace(fig_hw, df_2018_others, "Other", colors_arr[4])

fig_hw.update_layout(
    # The axes show height against weight (the old title said "Age").
    title_text="Height and weight of the players for season 2018-19 (with random noise)",
    xaxis_title="Weight (kg)",
    yaxis_title="Height (cm)"
)
fig_hw.show()

In [None]:
# One histogram per demographic column for the 2018-19 season.
# Each entry: (column, unit shown on the x axis, tick spacing).
histogram_specs = (
    ("Age", "years", 1),
    ("Weight", "kg", 4),
    ("Height", "cm", 2.54),
)

for column, unit, tick_step in histogram_specs:
    values = df_2018_s[column]
    fig_hw = go.Figure([go.Histogram(x=values)])
    fig_hw.update_layout(
        title_text=f"{column} distribution for season 2018-19",
        xaxis_title=f"{column} ({unit})",
        yaxis_title="Count",
    )
    # Start the ticks at the smallest observed value.
    fig_hw.update_xaxes(tick0=values.min(), dtick=tick_step)
    fig_hw.show()

## Countries the players are from

In [None]:
# Number of 2018-19 rows per country, largest first. The "season_name"
# column of the count() result is used as the per-country tally.
per_country_counts = df_2018_s.groupby("Country").count()
countries_grouped = per_country_counts.sort_values("season_name", ascending=False)

country_bars = go.Bar(
    x=countries_grouped.index,
    y=countries_grouped["season_name"],
)
fig_countries = go.Figure([country_bars])

countries_layout = dict(
    title_text="Player countries",
    yaxis_title="Count",
    xaxis_title="Country",
    xaxis_tickangle=50,  # tilt the country names
)
fig_countries.update_layout(**countries_layout)
fig_countries.show()

## Correlations between performance and the mentioned variables

In [None]:
# Scatter plots of points scored against height and against weight for the
# 2018-19 season. The hover text is identical for both, so build it once.
hover_text = (
    df_2018_s["player_name"].astype(str)
    + " - Points:"
    + df_2018_s["points"].astype(str)
)

# Each entry: (column, x-axis title, figure title, tick spacing).
# Height keeps the 2.54 cm (one inch) spacing. Weight previously reused that
# value by copy-paste; it now uses the same 4 kg spacing as the weight
# histogram so the two weight axes are consistent.
scatter_specs = (
    ("Height", "Height (cm)", "Ratio between height and points", 2.54),
    ("Weight", "Weight (kg)", "Ratio between weight and points", 4),
)

for column, axis_title, figure_title, tick_step in scatter_specs:
    fig_hw = go.Figure([go.Scatter(
        x=df_2018_s[column],
        y=df_2018_s["points"],
        text=hover_text,
        mode='markers',
    )])
    fig_hw.update_layout(
        title_text=figure_title,
        xaxis_title=axis_title,
        yaxis_title="Points",
    )
    # Start the ticks at the smallest observed value.
    fig_hw.update_xaxes(tick0=df_2018_s[column].min(), dtick=tick_step)
    fig_hw.show()

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between the numeric player attributes.
# player_df also holds text columns (player_name, Country, season_name);
# DataFrame.corr() no longer drops those silently on pandas >= 2.0 and raises
# instead, so select the numeric/boolean columns explicitly. select_dtypes
# works on old and new pandas alike.
numeric_player_df = player_df.select_dtypes(include=["number", "bool"])

# Compute the matrix once and reuse it for both the heat map and the table.
corr_matrix = numeric_player_df.corr()

plt.matshow(corr_matrix)
plt.show()

display(corr_matrix)