## Data Wrangling

[Basketball Reference](https://www.basketball-reference.com/) (much like Baseball and Hockey Reference) is an amazing website with regular and advanced statistics for basketball players. I found the awesome [basketball_reference_web_scraper](https://github.com/jaebradley/basketball_reference_web_scraper) Python package by @jaebradley and contributors that makes scraping the site easy via API and has equally awesome [documentation](https://jaebradley.github.io/basketball_reference_web_scraper/) and examples.

In [None]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType
import pandas as pd
from time import sleep
import plotly.express as px
from IPython.display import display, HTML

In [None]:
# Get regular and advanced player season stats
# Builds `player_stats_combined`: one row per player/team/season entry with the
# regular season totals plus the advanced-only columns appended side by side.
from io import StringIO  # local import so this cell runs on its own

years = [2021]
response_reg = []
response_adv = []
for year in years:
    try:
        # Assumes the scraper client returns the JSON payload as a str when no
        # output file is given; wrapping it in StringIO avoids the pandas
        # deprecation of passing literal JSON strings to read_json.
        # regular stats
        data_reg = pd.read_json(StringIO(
            client.players_season_totals(season_end_year=year,
                                         output_type=OutputType.JSON)))
        # advanced stats
        data_adv = pd.read_json(StringIO(
            client.players_advanced_season_totals(season_end_year=year,
                                                  output_type=OutputType.JSON)))
    except Exception as exc:
        # Best effort: skip a season that cannot be fetched, but say so
        # instead of silently swallowing the error.
        print(f'Skipping season {year}: {exc!r}')
        continue
    # Only record a season once BOTH tables were fetched, so the two lists
    # always stay in step with each other.
    data_reg['season_end_year'] = year
    data_adv['season_end_year'] = year
    response_reg.append(data_reg)
    response_adv.append(data_adv)

if not response_reg:
    raise ValueError(f'No season stats could be fetched for years={years}')

# ignore_index gives every row a unique label; without it the per-season
# indices repeat (0..n-1 for each year) and the column-wise concat below
# cannot align rows once more than one season is requested.
combined_reg = pd.concat(response_reg, ignore_index=True)
combined_adv = pd.concat(response_adv, ignore_index=True)
# Keep only the advanced columns that the regular table does not already have.
keep_columns = combined_adv.columns.difference(combined_reg.columns)
# NOTE(review): rows are paired by position, which presumes both endpoints
# return players in the same order -- confirm against the scraper output.
player_stats_combined = pd.concat([combined_reg, combined_adv[keep_columns]], axis=1)
#player_stats_combined.to_csv('player_stats_combined-2018-2020.csv', index=False)

In [None]:
# Show the column names of the combined regular + advanced stats table.
player_stats_combined.columns

In [None]:
# WIP
# Game boxscores for each player by season
# data = []
# player_boxscores = []
# players = []
# players = list(player_stats_combined['slug'])
# years =  [2021]
# response = []
# for year in years:
#     for player in players:
#         try:
#             data = pd.read_json(client.regular_season_player_box_scores(player_identifier=player, 
#                                                                     season_end_year=year, 
#                                                                     output_type=OutputType.JSON))
#             data['slug'] = player
#             data['season_end_year'] = year
#             response.append(data)
#         except:
#             pass
#     sleep(4)    
    
    
# player_boxscores = pd.concat(response)
#player_boxscores.to_csv('player_boxscores-2018-2020.csv', index=False)

# Data Analysis

In [None]:
# Fantasy categories
## FG%, FT%, 3PTM, PTS, Total REB, AST, ST, BLK, TO
#player_stats_combined = pd.read_csv('player_stats_combined-2018-2020.csv')

In [None]:
# List the available columns before deriving the fantasy metrics from them.
player_stats_combined.columns

In [None]:
## Relevant fantasy categories FG%, FT%, 3PTM, PTS, Total REB, AST, ST, BLK, TO
## Combine the non-percentage fantasy categories into a single score:
## every counting stat adds to the total, turnovers subtract from it.
fantasy_total = player_stats_combined['made_three_point_field_goals']
for stat_column in ('points',
                    'offensive_rebounds',
                    'defensive_rebounds',
                    'assists',
                    'steals',
                    'blocks'):
    fantasy_total = fantasy_total + player_stats_combined[stat_column]
fantasy_total = fantasy_total - player_stats_combined['turnovers']
player_stats_combined['fantasy_points'] = fantasy_total

# Rate versions of the same score: per game played and per 36 minutes.
player_stats_combined['fantasy_points_gm'] = (
    player_stats_combined['fantasy_points'].div(player_stats_combined['games_played'])
)
player_stats_combined['fantasy_points_per36min'] = (
    player_stats_combined['fantasy_points'].div(player_stats_combined['minutes_played']).mul(36)
)

# Store the season as text rather than as a number.
player_stats_combined['season_end_year'] = player_stats_combined['season_end_year'].astype(str)

In [None]:
## Filtering data
# Print the row count and the summary statistics behind the filter cut-offs:
# minutes use the 25% quartile of all data, true shooting percentage the median.
print(len(player_stats_combined))
for summary_column in ('minutes_played', 'true_shooting_percentage'):
    print(player_stats_combined[summary_column].describe())

In [None]:
# true shooting percentage
## Prioritize players who excel in this stat.
## A .500 TS% is about average, .550 is very good, and anything over .600 is exceptional.
# Hard-coded snapshot cut-offs (presumably read off the describe() output:
# 25% quartile of minutes played and median TS%); re-check them whenever the
# set of seasons changes.
MIN_MINUTES_PLAYED = 55.75
MIN_TRUE_SHOOTING_PCT = 0.474

# Keep only players who meet both thresholds.
enough_minutes = player_stats_combined['minutes_played'] >= MIN_MINUTES_PLAYED
good_shooter = player_stats_combined['true_shooting_percentage'] >= MIN_TRUE_SHOOTING_PCT
player_stats_combined_filter = player_stats_combined[enough_minutes & good_shooter]

# Render the filtered table as HTML, best fantasy totals first.
ranked_players = player_stats_combined_filter.sort_values(by=['fantasy_points'],
                                                          ascending=False,
                                                          ignore_index=True)
display(HTML(ranked_players.to_html()))

In [None]:
# random data visualization and exploration
# Fantasy points against age for the filtered players, coloured by the
# per-36-minute rate; hovering shows the player name and extra context.
fig_options = dict(
    x='age',
    y='fantasy_points',
    color='fantasy_points_per36min',
    color_discrete_sequence=px.colors.qualitative.G10,
    hover_name='name',
    hover_data=['season_end_year', 'true_shooting_percentage', 'win_shares'],
)
fig = px.scatter(player_stats_combined_filter, **fig_options)
fig.show()

In [None]:
# random data visualization and exploration
# Fantasy points against win shares, one facet column per season, coloured
# by player age.
fig2_options = dict(
    x='win_shares',
    y='fantasy_points',
    color='age',
    facet_col='season_end_year',
    color_discrete_sequence=px.colors.qualitative.G10,
    hover_name='name',
    hover_data=['season_end_year', 'true_shooting_percentage', 'games_played'],
)
fig2 = px.scatter(player_stats_combined_filter, **fig2_options)
fig2.show()

In [None]:
# Save the filtered table to CSV without the index column.
# Alternative export, sorted by fantasy points:
#player_stats_combined_filter.sort_values(by='fantasy_points', ascending=False).to_csv('player_stats_combined-2018-2020.csv', index=False)
filtered_csv_path = 'player_stats_combined_filter-2018-2020.csv'
player_stats_combined_filter.to_csv(filtered_csv_path, index=False)


In [None]:
# Who to keep?
## folks on my team from last year ... who to keep?
# Player slugs (Basketball Reference identifiers) of last year's roster.
player_list = ['gilgesh01','siakapa01','foxde01',
               'russeda01','holidjr01','brownja02',
               'loveke01','lopezbr01','bogdabo01',
               'schrode01','bridgmi02','beaslma01']

# Rows of the filtered table that belong to the roster. Players who did not
# pass the minutes / true-shooting filter simply do not appear.
on_roster = player_stats_combined_filter['slug'].isin(player_list)
player_stats_combined_filter_team = player_stats_combined_filter[on_roster]

# One small panel per player (four per row): fantasy points by season,
# coloured by age.
fig3 = px.scatter(player_stats_combined_filter_team,
                  x='season_end_year',
                  y='fantasy_points',
                  color='age',
                  facet_col='slug',
                  facet_col_wrap=4,
                  color_discrete_sequence=px.colors.qualitative.G10,
                  hover_name='name',
                  hover_data=['season_end_year','true_shooting_percentage','win_shares','games_played'])
fig3.show()


In [None]:
# Miscellaneous Player Lookup
## KAT, Steph Curry, Nikola Vucevic, Blake Griffin,
## AVOID LBJ, Kawhi, Joel, KD, Kyrie, Kemba, Russ, Chris Paul, Jimmy Butler, Kristaps, Jaren Jackson Jr.,
# Player slugs (Basketball Reference identifiers) to look up.
player_list = ['bogdabo01','grantje01','conlemi01','smartma01','horfoal01',
               'randlju01','allenja01']

# Rows of the filtered table for the selected players. Players who did not
# pass the minutes / true-shooting filter simply do not appear.
is_selected = player_stats_combined_filter['slug'].isin(player_list)
player_stats_combined_select = player_stats_combined_filter[is_selected]

# One small panel per player (four per row): fantasy points by season,
# coloured by age.
fig4 = px.scatter(player_stats_combined_select,
                  x='season_end_year',
                  y='fantasy_points',
                  color='age',
                  facet_col='slug',
                  facet_col_wrap=4,
                  color_discrete_sequence=px.colors.qualitative.G10,
                  hover_name='name',
                  hover_data=['season_end_year','true_shooting_percentage','win_shares','games_played'])
fig4.show()



In [None]:
# TO DO - combine dataframes and make use of game by game boxscores 
## - Add some functionality to make things more robust and add some features like
## - easily look up a player of interest, dynamic filtering, and some predictions 
## (game, season, who to pick during a draft).