In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date
from datetime import timedelta
from time import sleep

In [2]:
box_score_url = "http://www.basketball-reference.com/boxscores/201610250CLE.html"
soup = BeautifulSoup(urlopen(box_score_url), 'lxml')

In [3]:
# Get all the data rows from the webpage in bs4.Tag form
data_rows = soup.findAll('tr')

In [140]:
# Structure of the data rows

for i in range(len(data_rows)):
    #print(data_rows[i].findAll('th')[0].getText())
    player_name = data_rows[i].findAll('th')[0].getText()
    
    print(i, player_name)

0 
1 Starters
2 Kristaps Porzingis
3 Carmelo Anthony
4 Derrick Rose
5 Courtney Lee
6 Joakim Noah
7 Reserves
8 Justin Holiday
9 Brandon Jennings
10 Lance Thomas
11 Kyle O'Quinn
12 Willy Hernangomez
13 Mindaugas Kuzminskas
14 Ron Baker
15 Sasha Vujacic
16 Team Totals
17 
18 Starters
19 Kristaps Porzingis
20 Carmelo Anthony
21 Derrick Rose
22 Courtney Lee
23 Joakim Noah
24 Reserves
25 Justin Holiday
26 Brandon Jennings
27 Lance Thomas
28 Kyle O'Quinn
29 Willy Hernangomez
30 Mindaugas Kuzminskas
31 Ron Baker
32 Sasha Vujacic
33 Team Totals
34 
35 Starters
36 LeBron James
37 Kyrie Irving
38 J.R. Smith
39 Kevin Love
40 Tristan Thompson
41 Reserves
42 Richard Jefferson
43 Mike Dunleavy
44 Channing Frye
45 Iman Shumpert
46 DeAndre Liggins
47 Jordan McRae
48 Chris Andersen
49 James Jones
50 Team Totals
51 
52 Starters
53 LeBron James
54 Kyrie Irving
55 J.R. Smith
56 Kevin Love
57 Tristan Thompson
58 Reserves
59 Richard Jefferson
60 Mike Dunleavy
61 Channing Frye
62 Iman Shumpert
63 DeAndre Ligg

In [4]:
# Get the main stat column headers
main_column_headers = [th.getText() for th in 
                  data_rows[1].findAll('th')]


# Get the advanced stat column headers
empty_row = None
for i in range(1,len(data_rows)): # Skip the first empty row because it preceeds main column headers
    #print(data_rows[i].findAll('th')[0].getText())
    player_name = data_rows[i].findAll('th')[0].getText()
    
    if player_name == "":
        empty_row = i
        break

advanced_column_row = empty_row + 1

other_column_headers = [th.getText() for th in data_rows[advanced_column_row].findAll('th')]

In [5]:
print(main_column_headers)
print(other_column_headers)

['Starters', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-']
['Starters', 'MP', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg']


In [6]:
# Iterate all the data rows and store the information in player_data
player_data = [[data_rows[i].findAll('th')[0].getText()] + [td.getText() for td in data_rows[i].findAll('td')] 
               for i in range(len(data_rows))]

# Ignore first empty row and second row of column values
player_data = player_data[2:]

In [7]:
player_data[1] # First two values are [''] and ['Starters']

['Carmelo Anthony',
 '30:05',
 '8',
 '18',
 '.444',
 '1',
 '4',
 '.250',
 '2',
 '2',
 '1.000',
 '1',
 '4',
 '5',
 '3',
 '1',
 '0',
 '4',
 '5',
 '19',
 '-19']

In [10]:
# Turning player data into a dataframe
df = pd.DataFrame(player_data, columns=main_column_headers)

In [11]:
df

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,Kristaps Porzingis,32:48,5,13,.385,3,5,.600,3,6,...,4,3,7,0,1,2,2,5,16,-21
1,Carmelo Anthony,30:05,8,18,.444,1,4,.250,2,2,...,1,4,5,3,1,0,4,5,19,-19
2,Derrick Rose,29:35,7,17,.412,1,2,.500,2,2,...,2,1,3,1,0,1,4,1,17,-19
3,Courtney Lee,19:37,0,5,.000,0,2,.000,0,0,...,1,2,3,0,0,1,2,1,0,-19
4,Joakim Noah,19:31,0,1,.000,0,0,,0,0,...,1,5,6,3,1,0,1,1,0,-1
5,Reserves,,,,,,,,,,...,,,,,,,,,,
6,Justin Holiday,26:27,2,7,.286,2,3,.667,2,2,...,1,5,6,2,1,0,0,0,8,-14
7,Brandon Jennings,21:14,1,7,.143,0,4,.000,5,6,...,1,2,3,5,0,0,1,2,7,-14
8,Lance Thomas,16:37,1,4,.250,0,0,,0,0,...,0,1,1,1,2,0,1,2,2,-11
9,Kyle O'Quinn,15:24,1,3,.333,0,2,.000,0,0,...,1,4,5,1,0,2,1,3,2,-10


In [12]:
# Getting the empty row indices that divide between basic and advanced stats

first_empty, second_empty, third_empty = df[df['Starters'] == ''].index.tolist()
print(first_empty, second_empty, third_empty)

15 32 49


In [13]:
# Getting regular stats for the first team

regular_stats_team_1 = df.iloc[:first_empty]
# Removing the useless reserves rows
regular_stats_team_1 = regular_stats_team_1.loc[df['Starters'] != 'Reserves']

# Add date column for indexing by player
regular_stats_team_1['Date'] = date(2016, 10, 25)

regular_stats_team_1

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date
0,Kristaps Porzingis,32:48,5,13,0.385,3,5,0.6,3,6,...,3,7,0,1,2,2,5,16,-21.0,2016-10-25
1,Carmelo Anthony,30:05,8,18,0.444,1,4,0.25,2,2,...,4,5,3,1,0,4,5,19,-19.0,2016-10-25
2,Derrick Rose,29:35,7,17,0.412,1,2,0.5,2,2,...,1,3,1,0,1,4,1,17,-19.0,2016-10-25
3,Courtney Lee,19:37,0,5,0.0,0,2,0.0,0,0,...,2,3,0,0,1,2,1,0,-19.0,2016-10-25
4,Joakim Noah,19:31,0,1,0.0,0,0,,0,0,...,5,6,3,1,0,1,1,0,-1.0,2016-10-25
6,Justin Holiday,26:27,2,7,0.286,2,3,0.667,2,2,...,5,6,2,1,0,0,0,8,-14.0,2016-10-25
7,Brandon Jennings,21:14,1,7,0.143,0,4,0.0,5,6,...,2,3,5,0,0,1,2,7,-14.0,2016-10-25
8,Lance Thomas,16:37,1,4,0.25,0,0,,0,0,...,1,1,1,2,0,1,2,2,-11.0,2016-10-25
9,Kyle O'Quinn,15:24,1,3,0.333,0,2,0.0,0,0,...,4,5,1,0,2,1,3,2,-10.0,2016-10-25
10,Willy Hernangomez,9:26,2,3,0.667,0,0,,0,0,...,0,1,0,0,0,1,1,4,-3.0,2016-10-25


In [14]:
# Getting advanced stats for the second team
advanced_stats_team_1 = df.iloc[first_empty+2:second_empty]

# Eliminating the reserves row
advanced_stats_team_1 = advanced_stats_team_1.loc[advanced_stats_team_1['Starters'] != 'Reserves']

# Drop the last 5 columns that are empty for these advanced stats
advanced_stats_team_1 = advanced_stats_team_1.dropna(axis=1)

# Change the column values
advanced_stats_team_1.columns = advanced_stat_column_headers

advanced_stats_team_1

NameError: name 'advanced_stat_column_headers' is not defined

In [15]:
# Getting regular stats for team 2
regular_stats_team_2 = df.iloc[second_empty+2:third_empty]

# Removing reserves row
regular_stats_team_2 = regular_stats_team_2.loc[regular_stats_team_2['Starters'] != 'Reserves']

# Add date column for indexing by player
regular_stats_team_2['Date'] = date(2016, 10, 25)

regular_stats_team_2

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,Date
34,LeBron James,32:23,9,14,0.643,0,3,0.0,1,2,...,8,11,14,0,1,4,3,19,26.0,2016-10-25
35,Kyrie Irving,29:48,12,22,0.545,4,7,0.571,1,1,...,1,2,4,2,0,0,1,29,13.0,2016-10-25
36,J.R. Smith,25:14,3,13,0.231,2,8,0.25,0,0,...,3,3,2,1,0,0,1,8,13.0,2016-10-25
37,Kevin Love,24:58,6,15,0.4,2,6,0.333,9,12,...,10,12,2,3,0,2,3,23,24.0,2016-10-25
38,Tristan Thompson,22:25,0,1,0.0,0,0,,0,0,...,4,6,0,0,0,2,4,0,17.0,2016-10-25
40,Richard Jefferson,25:47,5,7,0.714,2,3,0.667,1,2,...,4,4,1,2,0,2,1,13,17.0,2016-10-25
41,Mike Dunleavy,22:32,2,3,0.667,0,1,0.0,0,0,...,4,4,2,3,0,0,0,4,19.0,2016-10-25
42,Channing Frye,14:09,2,6,0.333,2,5,0.4,0,0,...,2,4,0,1,1,1,4,6,7.0,2016-10-25
43,Iman Shumpert,13:51,1,3,0.333,0,1,0.0,0,0,...,1,2,3,0,1,2,2,2,-5.0,2016-10-25
44,DeAndre Liggins,12:00,2,3,0.667,0,0,,0,0,...,3,3,3,0,1,1,1,4,11.0,2016-10-25


In [41]:
player_stats_team_2 = regular_stats_team_2['Starters'].tolist()[:-1]
player_stats_team_2

['LeBron James',
 'Kyrie Irving',
 'J.R. Smith',
 'Kevin Love',
 'Tristan Thompson',
 'Richard Jefferson',
 'Mike Dunleavy',
 'Channing Frye',
 'Iman Shumpert',
 'DeAndre Liggins',
 'Jordan McRae',
 'Chris Andersen',
 'James Jones']

In [16]:
# Getting advanced stats for team 2
advanced_stats_team_2 = df.iloc[third_empty+2:]

# Eliminating the reserves row
advanced_stats_team_2 = advanced_stats_team_2.loc[advanced_stats_team_2['Starters'] != 'Reserves']

# Drop the last 5 columns that are empty for these advanced stats
advanced_stats_team_2 = advanced_stats_team_2.dropna(axis=1)

# Change columns
advanced_stats_team_2.columns = advanced_stat_column_headers

print(advanced_stats_team_2.shape)
advanced_stats_team_2

NameError: name 'advanced_stat_column_headers' is not defined

In [18]:
# Horizontally concatenating the two stats dataframes for each team

all_stats_team_1 = regular_stats_team_1.merge(advanced_stats_team_1, left_on='Starters', right_on='Starters', how='inner')
all_stats_team_2 = regular_stats_team_2.merge(advanced_stats_team_2, left_on='Starters', right_on='Starters', how='inner')


print(all_stats_team_1)
print(all_stats_team_2)

#all_stats_team_1 = pd.concat([regular_stats_team_1, advanced_stats_team_1], axis=1, jo)
#all_stats_team_1

                Starters   MP_x FG_x FGA_x FG%_x 3P_x 3PA_x  3P%_x FT_x FTA_x  \
0     Kristaps Porzingis  32:48    5    13  .385    3     5   .600    3     6   
1        Carmelo Anthony  30:05    8    18  .444    1     4   .250    2     2   
2           Derrick Rose  29:35    7    17  .412    1     2   .500    2     2   
3           Courtney Lee  19:37    0     5  .000    0     2   .000    0     0   
4            Joakim Noah  19:31    0     1  .000    0     0           0     0   
5         Justin Holiday  26:27    2     7  .286    2     3   .667    2     2   
6       Brandon Jennings  21:14    1     7  .143    0     4   .000    5     6   
7           Lance Thomas  16:37    1     4  .250    0     0           0     0   
8           Kyle O'Quinn  15:24    1     3  .333    0     2   .000    0     0   
9      Willy Hernangomez   9:26    2     3  .667    0     0           0     0   
10  Mindaugas Kuzminskas   9:14    3     5  .600    1     3   .333    0     0   
11             Ron Baker   6

In [28]:
# Rearrange columns in the two dataframes
columns = all_stats_team_2.columns.tolist()
columns.remove('Date')
columns.remove('Starters')


all_stats_team_1 = all_stats_team_1[['Starters','Date'] + columns]
all_stats_team_2 = all_stats_team_2[['Starters','Date'] + columns]

team_1_players = all_stats_team_1['Starters'].tolist()[:-1]
team_2_players = all_stats_team_2['Starters'].tolist()[:-1]

In [20]:
# Getting total team data for the two teams that played

team_1_stats = all_stats_team_1.iloc[-1:]
team_2_stats = all_stats_team_2.iloc[-1:]

team_1_stats = team_1_stats.rename(columns={"Starters": "Team Name"})
team_2_stats = team_2_stats.rename(columns={"Starters": "Team Name"})

In [25]:
# Parse HTML to get the names of the teams
div = soup.findAll('div', {"class":"box"})
headline = x = div[0].contents[1].contents[0]
# Away team is always team 1, listed first
team_1_name = headline[:(headline.find(' at '))]
team_2_name = headline[(headline.find(' at ')+4):(headline.find(' Box '))]

In [26]:
team_1_name, team_2_name

('New York Knicks', 'Cleveland Cavaliers')

In [31]:
game_dict = {}
game_dict['Date'] = date(2016, 10, 25)
game_dict['Team_1'] = team_1_name
game_dict['Team_2'] = team_2_name
game_dict['Team_1_Players'] = team_1_players
game_dict['Team_2_Players'] = team_2_players
game_dict

{'Date': datetime.date(2016, 10, 25),
 'Team_1': 'New York Knicks',
 'Team_1_Players': ['Kristaps Porzingis',
  'Carmelo Anthony',
  'Derrick Rose',
  'Courtney Lee',
  'Joakim Noah',
  'Justin Holiday',
  'Brandon Jennings',
  'Lance Thomas',
  "Kyle O'Quinn",
  'Willy Hernangomez',
  'Mindaugas Kuzminskas',
  'Ron Baker',
  'Sasha Vujacic'],
 'Team_2': 'Cleveland Cavaliers',
 'Team_2_Players': ['LeBron James',
  'Kyrie Irving',
  'J.R. Smith',
  'Kevin Love',
  'Tristan Thompson',
  'Richard Jefferson',
  'Mike Dunleavy',
  'Channing Frye',
  'Iman Shumpert',
  'DeAndre Liggins',
  'Jordan McRae',
  'Chris Andersen',
  'James Jones']}

In [36]:
import _pickle as pickle
with open('game_test.p', 'wb') as fp:
    pickle.dump(game_dict, fp)

In [37]:
with open('game_test.p', 'rb') as fp:
    pickle_game = pickle.load(fp)

In [38]:
pickle_game

{'Date': datetime.date(2016, 10, 25),
 'Team_1': 'New York Knicks',
 'Team_1_Players': ['Kristaps Porzingis',
  'Carmelo Anthony',
  'Derrick Rose',
  'Courtney Lee',
  'Joakim Noah',
  'Justin Holiday',
  'Brandon Jennings',
  'Lance Thomas',
  "Kyle O'Quinn",
  'Willy Hernangomez',
  'Mindaugas Kuzminskas',
  'Ron Baker',
  'Sasha Vujacic'],
 'Team_2': 'Cleveland Cavaliers',
 'Team_2_Players': ['LeBron James',
  'Kyrie Irving',
  'J.R. Smith',
  'Kevin Love',
  'Tristan Thompson',
  'Richard Jefferson',
  'Mike Dunleavy',
  'Channing Frye',
  'Iman Shumpert',
  'DeAndre Liggins',
  'Jordan McRae',
  'Chris Andersen',
  'James Jones']}

In [32]:
# The ends of the boxscore URLS
team_abbreviations=['ATL', 
					'BRK', 
					'BOS',
					'CHO',
					'CHI',
					'CLE',
					'DAL',
					'DEN',
					'DET',
					'GSW',
					'HOU',
					'IND',
					'LAC',
					'LAL',
					'MEM',
					'MIA',
					'MIL',
					'MIN',
					'NOP',
					'NYK',
					'OKC',
					'ORL',
					'PHI',
					'PHO',
					'POR',
					'SAC',
					'SAS',
					'TOR',
					'UTA',
					'WAS']

In [172]:
# Start and end dates must be actual dates: Example: start_date = Date(2016, 10, 28)
def generate_date_strings(start_date, end_date):

	output_day_strings = []

	delta = end_date - start_date

	for i in range(delta.days+1):
		current_date = start_date + timedelta(days=i)
		current_date_string = str(current_date.year) + '{:02d}'.format(current_date.month) + '{:02d}'.format(current_date.day)
		output_day_strings.append(current_date_string)

	return output_day_strings

In [173]:
def scrape_training_data(start_date, end_date):

	base_url_string = 'http://www.basketball-reference.com/boxscores/'

	date_strings = generate_date_strings(start_date, end_date)

	# Generate each possible string with all 30 teams. Teams that don't have a game that day will just be pass over with try, except
	# TODO: Possibly find a cleaner way to do this

	for ds in date_strings:

		for abbr in team_abbreviations:

			url_to_try = base_url_string + ds + '0' + abbr + '.html'
			try:
				test_soup = BeautifulSoup(urlopen(url_to_try), "lxml")
				test_soup = BeautifulSoup(urlopen(url_to_try), "lxml")
				print("Found game with ", abbr)
			except:
				continue
			sleep(0.5)

In [174]:
d = date(2016,11,5)

In [175]:
scrape_training_data(d, d)

Found game with  ATL
Found game with  DET
Found game with  IND
Found game with  MIL
Found game with  OKC
Found game with  ORL
Found game with  PHI
Found game with  SAS
