In [1]:
# Import the necessary libraries
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

The purpose of this notebook is to explore the effects of taking more or fewer midrange shots. Traditional Moreyball reasoning says that 3 > 2: if your 3P FG% is greater than 2/3 of your 2P FG%, a three-point attempt has a higher expected value than a two-point attempt (3 × 3P% > 2 × 2P%). However, it may be the case that taking more midrange shots changes overall FG% or free-throw rate (FTr). To investigate this, I scrape the shooting-by-distance data from Basketball Reference. I tried to organize my code in an object-oriented and clean way.

In [2]:
#grab the html from bbref
def scrape_midrange(years):
    """Download each season's league shooting page and save its stats table as CSV.

    For every year the raw HTML is stored at ``html/midrange_<year>.html`` and the
    table with id ``shooting_stats`` is written to ``csv/midrange_<year>.csv``.

    Parameters
    ----------
    years : iterable of int
        Values substituted into the Basketball Reference shooting URL.

    Raises
    ------
    requests.HTTPError
        If the site answers with a 4xx/5xx status.
    ValueError
        If the downloaded page has no table with id ``shooting_stats``.
    """
    # Create the output folders up front so the function also works in a fresh checkout.
    os.makedirs("html", exist_ok=True)
    os.makedirs("csv", exist_ok=True)

    for year in years:
        url = "https://www.basketball-reference.com/leagues/NBA_{}_shooting.html".format(year)
        r = requests.get(url, timeout=30)
        # Fail loudly instead of saving and parsing an error page.
        r.raise_for_status()
        page = r.text

        # Keep a copy of the raw page. The encoding is explicit so non-ASCII
        # characters are written the same way on every platform.
        with open("html/midrange_{}.html".format(year), "w", encoding="utf-8") as file:
            file.write(page)

        soup = BeautifulSoup(page, "html.parser")
        table = soup.find("table", id="shooting_stats")
        if table is None:
            raise ValueError(
                "No table with id 'shooting_stats' found at {}".format(url)
            )

        # read_html wants a file-like object; passing literal HTML is deprecated
        # in recent pandas releases.
        df = pd.read_html(StringIO(str(table)), header=0)[0]

        # turn the df into a csv
        df.to_csv("csv/midrange_{}.csv".format(year), index=False)

#renaming and dropping the top row
def rename_midrange(years):
    """Trim each season's scraped CSV to the distance columns and rename them.

    For every year, ``csv/midrange_<year>.csv`` is read, reduced to the player
    column plus the twelve "by distance" columns, renamed to
    ``pct_<range>_<year>`` / ``att_<range>_<year>``, stripped of its first row,
    and written back to the same path.

    The function is safe to re-run: a file that already carries the renamed
    columns is left untouched instead of raising ``KeyError``.

    Parameters
    ----------
    years : iterable of int
        Years whose CSV files should be processed.
    """
    # Suffixes in the order pandas numbers the repeated headers
    # ("<header>", "<header>.1", ..., "<header>.5").
    distance_labels = ["2P", "0-3", "3-10", "10-16", "16-3P", "3P"]

    # NOTE(review): judging by their header, the "FG% by Distance" columns hold
    # shooting percentages, not attempts; the "att_" prefix is kept only so the
    # output schema stays the same for existing downstream code.
    header_prefixes = (("pct", "% of FGA by Distance"), ("att", "FG% by Distance"))

    for year in years:
        path = "csv/midrange_{}.csv".format(year)
        df = pd.read_csv(path)

        # Insertion order of this dict is also the column order of the output.
        rename_map = {"Unnamed: 1": "Player"}
        for prefix, raw_header in header_prefixes:
            for position, label in enumerate(distance_labels):
                if position == 0:
                    raw_name = raw_header
                else:
                    raw_name = "{}.{}".format(raw_header, position)
                rename_map[raw_name] = "{}_{}_{}".format(prefix, label, year)

        already_renamed = (
            "Unnamed: 1" not in df.columns
            and set(rename_map.values()) <= set(df.columns)
        )
        if already_renamed:
            # A previous run already processed this file; redoing the work
            # would drop another row.
            continue

        # Keep only the mapped columns, rename them, and drop the first row.
        df = df[list(rename_map)].rename(columns=rename_map).iloc[1:]

        df.to_csv(path, index=False)

# get rid of the index rows, convert the columns to float, and average the rows with the same player name
def clean_midrange(df):
    """Return one row per player with every stat column averaged as a float.

    Steps:
      1. Drop rows whose ``Player`` value is the literal string ``"Player"``.
      2. Cast every other column to float and replace missing values with 0.
      3. Average the rows that share a player name.

    The input frame is not modified. Rows with a missing ``Player`` value are
    left out of the result because they cannot be assigned to a group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Player`` column; all remaining columns must be
        convertible to float.

    Returns
    -------
    pandas.DataFrame
        ``Player`` followed by the averaged stat columns, sorted by player name.
    """
    # Boolean filtering yields a new frame, so the caller's data stays untouched.
    players = df[df["Player"] != "Player"]

    stat_cols = [col for col in players.columns if col != "Player"]

    # Cast before filling: writing 0 into a string-typed column is not always
    # allowed, while NaN survives the float cast and can be filled afterwards.
    stats = players[stat_cols].astype(float).fillna(0)

    # Group the numeric block by the aligned player names; the key's name
    # becomes the "Player" column again after reset_index().
    return stats.groupby(players["Player"]).mean().reset_index()

In [3]:
#combined scrape and rename, then merge the dataframes into one on player name
def scrape_and_rename(years):
    """Scrape and rename every requested year, then merge the yearly CSVs on player.

    Parameters
    ----------
    years : sequence of int
        Years to process; must contain at least one entry.

    Returns
    -------
    pandas.DataFrame
        The yearly frames merged on ``Player`` with ``pd.merge``'s default join,
        so only players found in every year's file are kept.
    """
    scrape_midrange(years)
    rename_midrange(years)

    # Load every processed season, then fold them together left to right.
    yearly_frames = [
        pd.read_csv("csv/midrange_{}.csv".format(season)) for season in years
    ]
    combined = yearly_frames[0]
    for season_frame in yearly_frames[1:]:
        combined = pd.merge(combined, season_frame, on = "Player")
    return combined

# The merged dataframe can now be cleaned in a single pass. There is probably a cleaner way to do this, but it works.

In [4]:
# range() excludes its end value, so this covers 2018 through 2021.
seasons = list(range(2018, 2022))
df = clean_midrange(scrape_and_rename(seasons))

In [9]:
# Show the cleaned, merged frame as this cell's output.
df


Unnamed: 0,Player,pct_2P_2018,pct_0-3_2018,pct_3-10_2018,pct_10-16_2018,pct_16-3P_2018,pct_3P_2018,att_2P_2018,att_0-3_2018,att_3-10_2018,...,pct_3-10_2021,pct_10-16_2021,pct_16-3P_2021,pct_3P_2021,att_2P_2021,att_0-3_2021,att_3-10_2021,att_10-16_2021,att_16-3P_2021,att_3P_2021
0,Aaron Gordon,0.605,0.275,0.109000,0.106000,0.114,0.395,0.497,0.718000,0.287000,...,0.167667,0.130333,0.063000,0.348667,0.538000,0.707667,0.403333,0.438667,0.299,0.325333
1,Abdel Nader,0.564,0.255,0.228000,0.040000,0.040,0.436,0.321,0.447000,0.235000,...,0.224000,0.043000,0.009000,0.371000,0.534000,0.683000,0.385000,0.200000,0.000,0.419000
2,Al Horford,0.700,0.198,0.255000,0.143000,0.104,0.300,0.514,0.738000,0.438000,...,0.225000,0.100000,0.083000,0.422000,0.510000,0.672000,0.444000,0.444000,0.433,0.368000
3,Al-Farouq Aminu,0.418,0.215,0.120000,0.026000,0.057,0.582,0.432,0.608000,0.314000,...,0.229667,0.021333,0.057000,0.440667,0.411333,0.359667,0.442000,0.000000,0.500,0.203000
4,Alec Burks,0.657,0.251,0.184000,0.130000,0.092,0.343,0.452,0.635000,0.303000,...,0.138000,0.128000,0.096000,0.492000,0.425000,0.507000,0.377000,0.422000,0.375,0.415000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,Will Barton,0.589,0.284,0.201000,0.057000,0.047,0.411,0.510,0.661000,0.377000,...,0.211000,0.072000,0.067000,0.423000,0.459000,0.630000,0.297000,0.455000,0.390,0.381000
273,Willie Cauley-Stein,0.984,0.459,0.301000,0.076000,0.146,0.016,0.506,0.676000,0.339000,...,0.082000,0.044000,0.022000,0.060000,0.667000,0.694000,0.533000,0.375000,0.750,0.091000
274,Willy Hernangómez,0.927,0.573,0.268333,0.042667,0.049,0.073,0.566,0.661667,0.454667,...,0.246000,0.063000,0.011000,0.037000,0.581000,0.634000,0.500000,0.412000,0.333,0.100000
275,Yogi Ferrell,0.508,0.151,0.084000,0.097000,0.178,0.492,0.477,0.582000,0.295000,...,0.095667,0.236000,0.082333,0.482667,0.383000,0.500000,0.411000,0.390333,0.150,0.323333
