In [None]:
import csv
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
import time
import requests, re
import random
import string
import pickle

In [None]:
# Team abbreviations as they appear in basketball-reference.com team URLs

teams = [
    'ATL', 'BOS', 'NJN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
    'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOH', 'NYK',
    'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]

## Scraping basketball-reference.com

### Getting URLs of each NBA team first

In [None]:
# Function to scrape urls of NBA teams

# A random User-Agent string is picked once here and sent with every request.
us = UserAgent()
user_agent = {'User-Agent':us.random}

def get_team_info(team, timeout=30):
    """Download the basketball-reference.com page for one team.

    Parameters
    ----------
    team : str
        Team abbreviation placed in the URL path (e.g. 'ATL').
    timeout : float, optional
        Seconds to wait on the server before requests raises a timeout
        error. Without a timeout a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    str
        The response body as text. The HTTP status code is not checked,
        so an error page is returned unchanged.
    """
    url = 'https://www.basketball-reference.com/teams/'+team+'/'
    response = requests.get(url, headers=user_agent, timeout=timeout)
    page = response.text
    print (url)
    # Pause after each request so consecutive calls are spaced out.
    time.sleep(1)
    return page

In [None]:
# Download every team's page and keep the parsed soup, keyed by team abbreviation

team_by_year_info = {}

for abbreviation in teams:
    team_page_html = get_team_info(abbreviation)
    team_by_year_info[abbreviation] = BeautifulSoup(team_page_html, 'lxml')

# Number of team pages that were parsed
print (len(team_by_year_info))

### Getting the URLs of each NBA team's seasons

In [None]:
# From the soup objects of each NBA team, scraping the urls of each season for each NBA team.
# The result maps '<team>_<year>' to the parsed soup of that season's page.

real_team_player_info = {}

for team, soup in team_by_year_info.items():
    season_table = soup.table
    teams_info = season_table.tbody if season_table is not None else None
    if teams_info is None:
        # The downloaded page has no table body to read seasons from (e.g. an error page).
        print (team + ': no season table found')
        continue

    rows = teams_info.find_all('tr')

    # NOTE(review): the first body row is skipped, as in the original code — presumably the
    #    season still in progress; confirm this is intended.
    for row in rows[1:]:
        season_header = row.find('th')
        season_link = season_header.find('a') if season_header is not None else None
        if season_link is None:
            # Rows without a season link carry nothing to fetch; skip instead of crashing.
            continue

        # The first four characters of the link text are parsed as a year; the season
        #    page URL is built from that year plus one.
        year = int(season_link.text[0:4])+1
        final_year = str(year)

        url = 'https://www.basketball-reference.com/teams/'+team+'/'+final_year+'.html'
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, headers=user_agent, timeout=30)
        page = response.text

        real_team_player_info[team + '_' + final_year] = BeautifulSoup(page, 'lxml')

        print (url)
        # Random 1-3 second pause between requests.
        time.sleep(random.randint(1,3))

### Getting each player's info and stats from each soup object 

In [None]:
# Collects one dictionary of player stats per scraped team season
full_player_info = list()

In [None]:
# Scraping all the urls of each NBA team's season to get all the players' statistics
#   information

# Lots of the statistics information on each page are commented out so I have to use the
#    Comment from the package bs4 to be able to grab all the season, advanced, shooting,
#    and salary stats for each player. The patterns do not change between pages, so they
#    are compiled once, before the loop.

rx_stat = re.compile(r'<table.+?id="per_game".+?>[\s\S]+?</table>')
rx_adv = re.compile(r'<table.+?id="advanced".+?[\s\S]+?</table>')
rx_shot = re.compile(r'<table.+?id="shooting".+?>[\s\S]+?</table>')
rx_sal = re.compile(r'<table.+?id="salaries2".+?>[\s\S]+?</table>')


def _commented_table_rows(comment, table_rx):
    """Return the <tr> tags of the table that `table_rx` finds inside `comment`.

    Returns an empty list when the comment does not contain that table, which is the
    normal case for most comments on a page.
    """
    found = table_rx.search(comment.string)
    if found is None:
        return []
    return BeautifulSoup(found.group(0), 'lxml').find_all('tr')


def _extend_roster_from_rows(roster, rows):
    """Append each row's cells (all but the first, which holds the name) to matching players.

    A roster entry matches when the row's player name is a substring of the entry's key.
    Substring matching is kept on purpose: roster names may carry extra text that the
    statistics tables do not (NOTE(review): confirm against the scraped pages).
    """
    for row in rows:
        cells = row.find_all('td')
        if not cells:
            # Rows without <td> cells have no player data; skip them rather than
            #    abandoning the rest of the table.
            continue
        name = cells[0].text
        if not name:
            # An empty string is a substring of every key and would be appended to
            #    every player, so blank name cells are ignored.
            continue
        stat_values = [cell.text for cell in cells[1:]]
        for key, player_stats in roster.items():
            if name in key:
                player_stats.extend(stat_values)


def _append_salaries_from_rows(roster, rows):
    """Append the cell following each row's name cell (the salary) to matching players."""
    for row in rows:
        name_cell = row.find('td')
        if name_cell is None:
            continue
        name = name_cell.text
        # The salary is read from the node right after the name cell; skip the row when
        #    that node is missing or has no text.
        salary = getattr(name_cell.nextSibling, 'text', None)
        if not name or salary is None:
            continue
        for key, player_stats in roster.items():
            if name in key:
                player_stats.append(salary)


for team_year, info in real_team_player_info.items():

# Use this check since some of the websites I scraped don't have a page, so this is
#   a precautionary measure to only grab all the information when the page provides it

    content = info.find('div', {'id':'content'})
    if content is not None and len(content.find_all('div')) > 2:

        team_year_player_info = {}
        player_name = None

        # The first table on the page is read as the roster; its first row is skipped.
        players = info.table
        roster_rows = players.find_all('tr')[1:] if players is not None else []
        for row in roster_rows:
            info_on_player = row.find_all('td')
            if not info_on_player:
                continue
            player_name = info_on_player[0].text

            # Every cell except the one(s) equal to the player's name starts the stats list.
            team_year_player_info[team_year+'_'+player_name] = [cell.text for cell in info_on_player if cell.text != player_name]

        comments = info.find_all(text=lambda text:isinstance(text, Comment))

# Some pages don't have every statistic table I want; a missing table simply yields no rows.
#    The tables are handled in a fixed order for every comment because the order of the
#    appended values determines the column layout later on.

        for comment in comments:
            _extend_roster_from_rows(team_year_player_info, _commented_table_rows(comment, rx_stat)[1:])
            _extend_roster_from_rows(team_year_player_info, _commented_table_rows(comment, rx_adv)[1:])
            # The first three rows of the shooting table are skipped (presumably stacked
            #    header rows — confirm).
            _extend_roster_from_rows(team_year_player_info, _commented_table_rows(comment, rx_shot)[3:])
            _append_salaries_from_rows(team_year_player_info, _commented_table_rows(comment, rx_sal)[1:])

# Uploading each roster of players' statistics into a list

        full_player_info.append(team_year_player_info)

        # Progress output: the season plus the last roster player that was read, if any.
        if player_name is not None:
            print (team_year+'_'+player_name)
        else:
            print (team_year)
    else:
        print (team_year)

### Merging the full_player_info list into one dictionary keyed by team, season, and player, with each player's stats as the value

In [None]:
# Flatten the list of per-season roster dictionaries into one dictionary so every player's
#    stats can be looked up directly; a key seen again keeps its place and takes the later value.

final_player_info = {
    player_key: player_stats
    for roster in full_player_info
    for player_key, player_stats in roster.items()
}

In [None]:
# Some webpages lack one or more statistic tables, so a player's list can hold fewer than
#    84 values. Short lists are padded in place with 'NA' up to 84 entries, and lists that
#    are longer than 84 are cut back to their first 84 entries.

for stats in final_player_info.values():
    shortfall = 84 - len(stats)
    if shortfall > 0:
        stats.extend(['NA'] * shortfall)
    elif shortfall < 0:
        del stats[84:]

In [None]:
# I switch the salary in the list of each dictionary value to the last value on the list if
#    it isn't already. The salary is the first entry containing '$'. The search stops at that
#    entry: scanning on and resetting the index for every later non-salary entry meant the
#    swap could only ever trigger when the salary was already last.

for key, value in final_player_info.items():
    salary_index = next((position for position, stat in enumerate(value) if '$' in stat), None)
    if salary_index is not None:
        # Swapping an index with itself is harmless when the salary is already last.
        value[salary_index], value[-1] = value[-1], value[salary_index]

In [None]:
# Strip the '$' sign and the thousands separators from the last value (the salary slot) of
#    every player's stats; entries whose last value has no '$' are left untouched.

for stats in final_player_info.values():
    salary_text = stats[-1]
    if '$' in salary_text:
        stats[-1] = salary_text.replace('$', '').replace(',', '')

### Writing the final_player_info into a pickle file

In [None]:
# Save the full dictionary to disk as a pickle so basketball-reference.com does not have to
#    be scraped again on later runs.

pickle_path = 'saved_final_player_info.pickle'
with open(pickle_path, mode='wb') as pickle_file:
    pickle.dump(final_player_info, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Converting the final_player_info dictionary into a pandas DataFrame

In [None]:
# Names of each column: 84 labels, one per value in a player's stats list.
# NOTE(review): the grouping below follows the apparent source tables — confirm against the pages.

names = [
    # roster details
    'position', 'height', 'weight', 'D_of_B', 'Country', 'Exp', 'College',
    # per-game stats
    'Age', 'games_played', 'games_started', 'min_played/G',
    'fg_made/G', 'fg_att/G', 'fg%',
    '3PM/G', '3PA/G', '3P%',
    '2PM/G', '2PA/G', '2P%', 'eFG%',
    'FTM/G', 'FTA/G', 'FT%',
    'ORB/G', 'DRB/G', 'TRB/G', 'AST/G', 'STL/G', 'BLK/G', 'TOV/G', 'PF/G', 'PTS/G',
    # advanced stats
    'Age2', 'games_played2', 'min_played_total', 'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'random1',
    'OWS', 'DWS', 'WS', 'WS/48', 'random2',
    'OBPM', 'DBPM', 'BPM', 'VORP',
    # shooting stats
    'Age3', 'games_played3', 'min_played_total2', 'FG%2', 'Dist',
    '%_shots_2PA', '2PA%_0-3', '2PA%_3-10', '2PA%_10-16', '2PA%_16<3', '3PA%',
    '%_shots_2PM', '2PM%_0-3', '2PM%_3-10', '2PM%_10-16', '2PM%_16<3', '3PM%',
    '%Astd_forFGM', '%_FGA_dunks', 'dunks',
    '%Astd_for3FGM', '%_3PA_corner', '3PM%_corner',
    'heaves_att', 'heaves_made',
    # salary
    'salary',
]

In [None]:
# Load the dictionary into pandas: first one column per player with rows labelled by `names`,
#    then transposed so each player is a row. The last line displays the transposed frame.

player_df = pd.DataFrame(data=final_player_info, index=names)
player_df_original = player_df.transpose()
player_df_original