In [1]:
import sys

# Install the parsing dependencies into the environment of the running kernel:
# --prefix {sys.prefix} targets this interpreter's prefix rather than whichever
# conda environment happens to be active in the shell.
# NOTE(review): `requests` is imported by a later cell but is not installed here;
# presumably it is already present in the environment -- confirm.
!conda install --yes --prefix {sys.prefix} pandas bs4

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import string

def get_tables(url, timeout=30):
    """Download a page and return its sortable wikitable elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error. Pass ``None`` to wait indefinitely.

    Returns:
        The result of ``find_all`` for ``<table>`` tags whose class is
        ``"wikitable sortable"``.
    """
    # Without an explicit timeout a stalled connection would block the whole
    # scraping run forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    return soup.find_all("table", class_="wikitable sortable", recursive=True)

def get_nba_tables(url):
    """Return the tables found at ``url`` that contain a "PPG" header cell.

    Args:
        url: Address of the page to scan; passed straight to ``get_tables``.

    Returns:
        A list with every table (in page order) that has at least one ``<th>``
        whose text is "PPG" once surrounding whitespace is removed.
    """
    nba_tables = []
    for table in get_tables(url):
        # Compare on stripped text so the match does not depend on the exact
        # trailing newline the markup happens to put inside the header cell.
        if any(th.get_text().strip() == "PPG" for th in table.find_all("th")):
            nba_tables.append(table)

    return nba_tables

def clean_str(s):
    """Normalise the text of one table cell.

    Steps, in order:
      1. drop trailing newlines, then leading whitespace, then leading hyphens;
      2. delete every "*" and "†" character;
      3. for each placeholder marker in turn, if the text starts with it,
         replace every occurrence of that marker with "NaN";
      4. turn an empty result into "NaN".

    Args:
        s: Raw cell text.

    Returns:
        The cleaned string.
    """
    cleaned = s.rstrip("\n").lstrip().lstrip("-")
    cleaned = cleaned.translate(str.maketrans("", "", "*†"))

    # Order matters: "..." becomes "NaN.", which the final "NaN." marker then
    # collapses to "NaN".
    for placeholder in ("–", "—", "...", "NaN."):
        if cleaned.startswith(placeholder):
            cleaned = cleaned.replace(placeholder, "NaN")

    return cleaned if cleaned else "NaN"

def create_df_from_table(table):
    """Build a DataFrame from one parsed HTML table.

    Column names are the text of every ``<th>`` in the table with trailing
    newlines removed. Data rows come from the ``<tr>`` elements returned by
    ``find_all("tr", class_="")`` on the table's ``<tbody>``; the first of
    those rows is discarded and the ``<td>`` text of the rest is passed
    through ``clean_str``.

    Args:
        table: A parsed ``<table>`` element that contains a ``<tbody>``.

    Returns:
        A pandas DataFrame of cleaned cell strings.
    """
    column_names = []
    for header_cell in table.find_all("th"):
        column_names.append(header_cell.get_text().rstrip('\n'))

    # NOTE(review): class_="" presumably keeps only rows without a class
    # attribute -- confirm against the BeautifulSoup documentation.
    body_rows = table.find("tbody").find_all("tr", class_="")

    # The first selected row is skipped (presumably the header row, which
    # holds no <td> cells -- confirm).
    records = []
    for row in body_rows[1:]:
        records.append([clean_str(cell.get_text()) for cell in row.find_all("td")])

    return pd.DataFrame(data=records, columns=column_names)

def get_dfs_from_tables(tables):
    """Convert each table into a DataFrame via ``create_df_from_table``.

    Args:
        tables: Iterable of parsed table elements.

    Returns:
        A list of DataFrames, one per table, in the same order.
    """
    return [create_df_from_table(html_table) for html_table in tables]

In [13]:
def get_best_players(path="./data/players.txt"):
    """Read the list of player names, one per line.

    Args:
        path: Location of the text file to read. Defaults to
            ``./data/players.txt``.

    Returns:
        A list with one string per line of the file, trailing newlines
        removed. Blank lines are kept as empty strings.
    """
    # The context manager closes the file handle even if reading fails.
    with open(path) as players_file:
        return [line.rstrip("\n") for line in players_file]

def get_player_url(player_name):
    """Return the English Wikipedia article URL for ``player_name``.

    Every space in the name is turned into an underscore; no other
    characters are altered.
    """
    article_title = "_".join(player_name.split(" "))
    return f"https://en.wikipedia.org/wiki/{article_title}"

def fix_dtypes(df):
    """Cast every column of ``df`` in place.

    Columns named "Year", "Team", "Season" or "League" are cast to the
    "string" dtype; every other column is cast to "float64".

    Args:
        df: DataFrame to modify. Its columns are reassigned on the object
            that was passed in.

    Returns:
        None.
    """
    text_columns = {"Year", "Team", "Season", "League"}

    for name in df.columns:
        target_dtype = "string" if name in text_columns else "float64"
        df[name] = df[name].astype(target_dtype)

def get_and_write_data(player_name):
    """Scrape one player's stat tables and save each as a CSV under ./data/.

    The tables come from ``get_nba_tables`` on the player's URL. Each
    resulting DataFrame has its dtypes fixed with ``fix_dtypes`` and is
    written to ``./data/<player>_<label>.csv``, with spaces replaced by
    underscores and apostrophes removed from the path. The label is
    "Regular season" for the first table, "Playoffs" for the second and
    "Unknown_<n>" for any further ones. Progress is printed before and after
    writing.

    Args:
        player_name: Name used both for the URL and for the file names.
    """
    labels_by_position = {0: "Regular season", 1: "Playoffs"}

    def get_name(i):
        # Tables past the second have no known meaning; number them from 1.
        return labels_by_position.get(i, "Unknown_" + str(i - 1))

    player_tables = get_nba_tables(get_player_url(player_name))
    dfs = get_dfs_from_tables(player_tables)

    print(f"Found {len(dfs)} dataset(s) about {player_name}!")

    for position, frame in enumerate(dfs):
        fix_dtypes(frame)
        raw_path = f"./data/{player_name}_{get_name(position)}.csv"
        csv_path = raw_path.replace(" ", "_").replace("'", "")
        frame.to_csv(csv_path)

    print(f"Wrote dataset(s) about {player_name}!")

# Driver: fetch and write the datasets for every name returned by
# get_best_players(), one player at a time, then report completion.
for player in get_best_players():
    get_and_write_data(player)

print("Done!")

Found 3 dataset(s) about Kevin Durant!
Wrote dataset(s) about Kevin Durant!
Found 2 dataset(s) about Lebron James!
Wrote dataset(s) about Lebron James!
Found 2 dataset(s) about Giannis Antetokounmpo!
Wrote dataset(s) about Giannis Antetokounmpo!
Found 2 dataset(s) about Kobe Bryant!
Wrote dataset(s) about Kobe Bryant!
Found 2 dataset(s) about Michael Jordan!
Wrote dataset(s) about Michael Jordan!
Found 2 dataset(s) about Shaquille ONeal!
Wrote dataset(s) about Shaquille ONeal!
Found 2 dataset(s) about Wilt Chamberlain!
Wrote dataset(s) about Wilt Chamberlain!
Found 2 dataset(s) about Bill Russel!
Wrote dataset(s) about Bill Russel!
Found 2 dataset(s) about Oscar Robertson!
Wrote dataset(s) about Oscar Robertson!
Found 2 dataset(s) about David Robinson!
Wrote dataset(s) about David Robinson!
Found 3 dataset(s) about Russel Westbrook!
Wrote dataset(s) about Russel Westbrook!
Found 3 dataset(s) about Damian Lillard!
Wrote dataset(s) about Damian Lillard!
Found 3 dataset(s) about Dominique