# setup

In [16]:
# run all of this
import os
import pandas as pd
import re
from datetime import datetime
import requests
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import plotly.express as px
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import nbformat as nbf
import plotly.io as pio

# collect() scrapes a player's game logs from basketball-reference.com
def collect(player):
    """Scrape the game-log tables for ``player`` into a single DataFrame.

    The player page code is built from the name as: first five letters of the
    last word + first two letters of the first word + ``"01"``. The numeric
    suffix is hard-coded, so only codes ending in ``01`` are ever requested.
    One game-log page is requested per year, from ``start - 1`` (2020) up to
    and including next calendar year, and every ``pgl_basic`` table found is
    appended to the result.

    Args:
        player: Full player name, e.g. ``"Jayson Tatum"``. Only the first and
            the last whitespace-separated words are used.

    Returns:
        DataFrame holding the scraped cell text of every complete table row;
        empty when no year yielded a usable table. The same object is also
        stored in the module-level ``stats`` variable.

    Raises:
        IndexError: If ``player`` contains no words.
        requests.exceptions.RequestException: If a request cannot be
            completed (including a timeout).
    """
    global stats

    # Requests start one year before ``start`` (see the range below).
    start = 2021
    current = datetime.now().year

    # base URL for basketball-reference.com
    base_url = "https://www.basketball-reference.com/players"

    # Pieces of the name that make up the player page URL.
    name_parts = [part.lower() for part in player.split()]
    first_name = name_parts[0]
    last_name = name_parts[-1]
    last_name_first = last_name[0]
    player_code = f"{last_name[0:5]}{first_name[0:2]}01"

    stats = pd.DataFrame()

    for year in range(start - 1, current + 2):
        url = f"{base_url}/{last_name_first}/{player_code}/gamelog/{year}"

        # A timeout keeps a stalled connection from blocking the loop forever.
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            print(f"Skipping year {year}. Request failed with status code {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'pgl_basic'})
        if not table:
            continue

        table_rows = table.find_all('tr')
        if not table_rows:
            continue

        # The first <th> of the header row has no matching <td> in the data
        # rows, so it is dropped to keep headers and cells aligned.
        headers = [th.getText() for th in table_rows[0].find_all('th')][1:]

        # Rows without <td> cells (e.g. repeated header rows) become empty
        # lists here and are removed by the dropna() below.
        player_stats = [[td.getText() for td in row.find_all('td')]
                        for row in table_rows[1:]]

        # Skip tables with no data rows or whose first row does not line up
        # with the headers.
        if not player_stats or len(headers) != len(player_stats[0]):
            continue

        df = pd.DataFrame(player_stats, columns=headers)

        # Short rows are padded with missing values; drop them.
        df = df.dropna(axis=0, subset=headers)

        stats = pd.concat([stats, df], ignore_index=True)

    return stats

# random ETL functions
def to_numeric(df):
    """Convert every fully numeric column to a numeric dtype.

    Columns that ``pd.to_numeric`` cannot parse are returned unchanged. This
    is the outcome ``errors='ignore'`` used to give; that mode is deprecated
    in pandas, so the fallback is spelled out explicitly here.

    Args:
        df: DataFrame whose columns may hold numbers stored as text.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    def _convert(column):
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            # Not a numeric column: keep the original values as they are.
            return column

    return df.apply(_convert)

def missing_vals(df):
    """Return the rows of ``df`` that hold at least 10 non-missing values."""
    min_non_missing = 10
    kept = df.dropna(axis="index", thresh=min_non_missing)
    return kept

# lower-case every column label
def make_cols_lower(df):
    """Lower-case the column labels of ``df`` in place and return ``df``."""
    df.columns = [str.lower(label) for label in df.columns]
    return df

def naming(df):
    """Rename columns in place: the label at position 6 -> 'margin', 'trb' -> 'reb'.

    The rename goes by label, so every column sharing the label found at
    position 6 is renamed to 'margin', not just the seventh column.
    Returns the same ``df`` object.
    """
    seventh_label = df.columns[6]
    # Applied one after the other, in this order.
    for mapping in ({seventh_label: 'margin'}, {'trb': 'reb'}):
        df.rename(columns=mapping, inplace=True)
    return df

def convert_mp(df):
    """Turn the 'mp' column into whole minutes, in place.

    Text values keep only the part before the first ':'; other values are
    passed through as they are. The column is then cast to int.
    Returns the same ``df`` object.
    """
    def _minutes_part(value):
        if isinstance(value, str):
            return value.split(':')[0]
        return value

    df['mp'] = list(map(_minutes_part, df['mp']))
    df['mp'] = df['mp'].astype(int)
    return df

def deduplicate(df):
    """Return ``df`` without repeated rows; the first occurrence is kept."""
    return df.drop_duplicates(keep='first', inplace=False)

def mutate(df):
    """Convert 'age' to float and 'date' to datetime, in place.

    The '-' in each age string is replaced with '.', so the digits after the
    dash become the decimal part as written (e.g. '23-150' -> 23.15).
    Returns the same ``df`` object.
    """
    dotted_age = df['age'].str.replace('-', '.')
    df['age'] = dotted_age
    df['age'] = dotted_age.astype(float)

    df['date'] = pd.to_datetime(df['date'])
    return df

# pull the home/away marker and the game-result text out of the two 'margin' columns
def split(df):
    """Return the 'margin' values found before and from the 'opp' column on.

    Returns:
        ``(aways, margins)``: two lists. ``aways`` comes from the 'margin'
        column left of 'opp', ``margins`` from the 'margin' column at or
        right of 'opp'.
    """
    opp_position = df.columns.get_loc('opp')

    left_of_opp = df.iloc[:, 0:opp_position]
    aways = left_of_opp['margin'].values.tolist()

    from_opp_on = df.iloc[:, opp_position:]
    margins = from_opp_on['margin'].values.tolist()

    return aways, margins

# use the split() lists to build the home / win / by columns
def use_split(df):
    """Replace the 'margin' columns with 'home', 'win' and 'by'.

    * ``home`` is 'N' when the home/away marker contains '@', else 'Y'.
    * ``win`` is 'Y' when the result text contains 'W', else 'N'.
    * ``by`` is the signed number taken from the result text, e.g.
      'W (+12)' -> 12 and 'L (-7)' -> -7.

    Non-text markers or results (None, NaN) are handled the same way in all
    three columns: 'Y' for home, 'N' for win and 0 for the margin.

    Args:
        df: DataFrame with the two 'margin' columns expected by ``split``.
            The three new columns are added to this object.

    Returns:
        A new DataFrame without the 'margin' columns.
    """
    away_marks, results = split(df)

    def _margin(result):
        # Second word of the result text, stripped of "(", ")" and "+".
        if not isinstance(result, str):
            return 0
        return int(result.split()[1].strip("()+"))

    homes = ['N' if isinstance(mark, str) and '@' in mark else 'Y'
             for mark in away_marks]
    wins = ['Y' if isinstance(result, str) and 'W' in result else 'N'
            for result in results]
    mg = [_margin(result) for result in results]

    df['home'] = homes
    df['win'] = wins
    df['by'] = mg

    return df.drop(columns=['margin'])

# making summed cols (pts + reb, etc.)

# pts + reb + ast column
def points_reb_ast(df):
    """Add 'p_r_a' (points + rebounds + assists per row) to ``df`` in place.

    Uses column arithmetic, the same approach as ``p_r`` and ``r_a``.
    Returns the same ``df`` object.
    """
    df['p_r_a'] = df['pts'] + df['reb'] + df['ast']
    return df

# blk + stl column
def blk_stl(df):
    """Add 'b_s' (blocks + steals per row) to ``df`` in place.

    Uses column arithmetic, the same approach as ``p_r`` and ``r_a``.
    Returns the same ``df`` object.
    """
    df['b_s'] = df['blk'] + df['stl']
    return df

# pts + ast column
def pts_ast(df):
    """Add 'p_a' (points + assists per row) to ``df`` in place.

    Uses column arithmetic, the same approach as ``p_r`` and ``r_a``.
    Returns the same ``df`` object.
    """
    df['p_a'] = df['pts'] + df['ast']
    return df

def p_r(df):
    """Add 'p_r' (rebounds + points per row) to ``df`` in place and return it."""
    rebounds = df['reb']
    df['p_r'] = rebounds.add(df['pts'])
    return df

def r_a(df):
    """Add 'r_a' (rebounds + assists per row) to ``df`` in place and return it."""
    rebounds = df['reb']
    df['r_a'] = rebounds.add(df['ast'])
    return df

# turn the game date into calendar features
def encode_dates(df):
    """Add year / month / day / weekday / weekend columns derived from 'date'.

    'date' is converted to datetime first. 'weekday' is the ISO day number
    (1 = Monday ... 7 = Sunday); 'weekend' is 1 for Saturday and Sunday and
    0 otherwise. ``df`` is modified in place and returned.
    """
    df['date'] = pd.to_datetime(df['date'])
    stamps = df['date'].dt

    # plain calendar parts
    for part in ('year', 'month', 'day'):
        df[part] = getattr(stamps, part)

    df['weekday'] = stamps.isocalendar().day
    # .weekday counts from 0 = Monday, so 5 and 6 are the weekend
    df['weekend'] = (stamps.weekday >= 5).astype(int)

    return df

# encode rest of columns
# keep only the object-dtype columns
def get_obj_cols(df):
    """Return the object-dtype columns of ``df`` as a new DataFrame."""
    return df.select_dtypes(include='object')

# label-encode the object columns (the original 'opp' values are kept)
def encoder(df):
    """Add date features and replace each object column with integer codes.

    Each object column is encoded with ``pd.factorize(sort=True)``: the
    distinct values are sorted and every value is replaced by the position
    of its label in that order (missing values get -1). The file never
    imports ``LabelEncoder``, so the encoding is done with pandas instead.
    The 'opp' column is put back with its original values afterwards.

    Args:
        df: DataFrame with at least the 'date' and 'opp' columns.

    Returns:
        DataFrame with the encoded columns moved to the end. The original
        'opp' column is also stored in the module-level ``opps`` variable.
    """
    global opps
    opps = df['opp']
    df = encode_dates(df)

    # object columns only
    tempo = get_obj_cols(df)

    # Build a Series per column so the original row index is kept.
    tempo = tempo.apply(
        lambda column: pd.Series(pd.factorize(column, sort=True)[0], index=column.index)
    )

    # swap the text columns for their encoded versions
    df = df.drop(columns=tempo.columns)
    df = pd.concat([df, tempo], axis=1)

    # restore the readable opponent values
    df['opp'] = opps

    return df

# double-double flag
def dbl(df):
    """Add 'dbl' to ``df`` in place: 1 for a double-double row, else 0.

    A row counts when at least two of 'pts', 'reb' and 'ast' are 10 or more.
    Returns the same ``df`` object.
    """
    reached_ten = df[['pts', 'reb', 'ast']] >= 10
    categories_hit = reached_ten.sum(axis=1)
    df['dbl'] = (categories_hit >= 2).astype(int)
    return df

# full cleaning pipeline
def etl(df):
    """Run every cleaning / feature step on a copy of ``df`` and return it.

    The steps run in the order listed in ``pipeline``; each one receives the
    result of the previous step. ``df`` itself is left untouched.
    """
    frame = df.copy()

    pipeline = (
        to_numeric,
        missing_vals,
        make_cols_lower,
        naming,
        convert_mp,
        ## optional: encoder would go here
        use_split,
        deduplicate,
        mutate,
        points_reb_ast,
        blk_stl,
        pts_ast,
        p_r,
        r_a,
        dbl,
    )
    for step in pipeline:
        frame = step(frame)

    return frame

# scatter() plots one stat per game against the game date
def scatter(df, stat, line):
    """Show a dark-themed scatter of ``stat`` over 'date', coloured by 'opp'.

    A dashed horizontal line labelled "line" is drawn at ``y=line``.
    Returns whatever ``fig.show()`` returns.
    """
    figure = px.scatter(df, x='date', y=stat, color='opp', template="plotly_dark")
    figure = figure.add_hline(
        y=line,
        line_dash="dash",
        annotation_text="line",
        annotation_position="bottom right",
    )
    return figure.show()

# usage 

In [17]:
# Ask for the player's full name; collect() uses its first and last words.
p = input("enter player name: ")
# Scrape the raw game logs for that player.
og = collect(p)

# Clean the raw logs and add the derived columns; later cells read `df`.
df = etl(og)


In [18]:
# scatterplot usage
# Print the available column names, then ask which one to plot.
print(df.columns.tolist())
stat = input('pick a stat (see choices below): ')
# float() raises ValueError if the entered line is not a number.
line = float(input("Enter the line: "))

# Plot the chosen stat over time with a dashed reference line at `line`.
scatter(df, stat, line)

['g', 'date', 'age', 'tm', 'opp', 'gs', 'mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'reb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'gmsc', '+/-', 'home', 'win', 'by', 'p_r_a', 'b_s', 'p_a', 'p_r', 'r_a', 'dbl']
