In [247]:
import pandas as pd
import numpy as np
import math

pbp = pd.read_html('https://www.basketball-reference.com/boxscores/pbp/201711130LAC.html',header=1)[0]
print(pbp.shape)
pbp_raw = pbp.copy()
pbp_raw.head()

(506, 6)


Unnamed: 0,Time,Philadelphia,Unnamed: 2,Score,Unnamed: 4,LA Clippers
0,12:00.0,Start of 1st quarter,,,,
1,12:00.0,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,,,,
2,11:44.0,J. Redick misses 3-pt shot from 25 ft,,0-0,,
3,11:41.0,,,0-0,,Defensive rebound by D. Jordan
4,11:33.0,,,0-3,3.0,A. Rivers makes 3-pt shot from 27 ft (assist b...


In [248]:
awayteam = pbp.columns[1]
hometeam = pbp.columns[5]
# renaming the columns from above
pbp.columns = ['time', 'awayevents','awaypts','score','homepts','homeevents']

print(awayteam,hometeam)

# adding columns to specify away and home teams
pbp['awayteam'] = awayteam
pbp['hometeam'] = hometeam

# adding events columns -- combines awayevents and homevents to give full list of events
events = pbp['awayevents']
events = events.fillna(pbp['homeevents'])
pbp['event'] = events

# adding isawayevent column -- specifies if row refers to event of awayteam or not
pbp['isawayevent'] = 1-pd.isnull(pbp['awayevents'])

pbp.head()

Philadelphia LA Clippers


Unnamed: 0,time,awayevents,awaypts,score,homepts,homeevents,awayteam,hometeam,event,isawayevent
0,12:00.0,Start of 1st quarter,,,,,Philadelphia,LA Clippers,Start of 1st quarter,1
1,12:00.0,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,,,,,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1
2,11:44.0,J. Redick misses 3-pt shot from 25 ft,,0-0,,,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1
3,11:41.0,,,0-0,,Defensive rebound by D. Jordan,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0
4,11:33.0,,,0-3,3.0,A. Rivers makes 3-pt shot from 27 ft (assist b...,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0


In [249]:

#first need to replace scores at the beginning of games and the rows that just mark the end of quarters 
pbp['score'] = pbp['score'].replace(to_replace='Score',method='ffill')
pbp['score'] = pbp['score'].fillna(method='bfill')

#then we split these to the away team scores and home team scores
#awayscore,homescore = score
#print([len(x.split('-')) for x in pbp['score']])
score = [x.split('-') for x in pbp['score']]
awayscore,homescore = np.transpose(np.array(score))
awayscore = [int(x) for x in awayscore]
homescore = [int(x) for x in homescore]
pbp['awayscore'] = awayscore
pbp['homescore'] = homescore

#now drop the redundant variables
pbp2 = pbp.copy()
pbp3 = pbp2.drop(['awayevents','awaypts','score','homepts','homeevents'], axis=1)
pbp3

Unnamed: 0,time,awayteam,hometeam,event,isawayevent,awayscore,homescore
0,12:00.0,Philadelphia,LA Clippers,Start of 1st quarter,1,0,0
1,12:00.0,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1,0,0
2,11:44.0,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1,0,0
3,11:41.0,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0,0,0
4,11:33.0,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0,0,3
5,11:14.0,Philadelphia,LA Clippers,J. Embiid misses 2-pt shot from 14 ft,1,0,3
6,11:12.0,Philadelphia,LA Clippers,Defensive rebound by B. Griffin,0,0,3
7,11:03.0,Philadelphia,LA Clippers,A. Rivers misses 3-pt shot from 24 ft,0,0,3
8,11:00.0,Philadelphia,LA Clippers,Defensive rebound by J. Embiid,1,0,3
9,10:57.0,Philadelphia,LA Clippers,Personal foul by S. Thornwell (drawn by B. Sim...,0,0,3


In [250]:
# column for when awayteam calls timeout
away = pbp3.awayteam[1]
atyp = away + " full timeout"
pbp3["away_timeout"] = [1 if ele  == atyp else 0 for ele in pbp3["event"]]
#pbp3[pbp3.away_timeout==1]

# column for when hometeam calls timeout
home = pbp3.hometeam[1]
htyp = home + " full timeout"
pbp3["home_timeout"] = [1 if ele  == htyp else 0 for ele in pbp3["event"]]

# 2nd quarter starts on line 120
pbp3.iloc[118]

time                   2nd Q
awayteam        Philadelphia
hometeam         LA Clippers
event                    NaN
isawayevent                0
awayscore                 36
homescore                 30
away_timeout               0
home_timeout               0
Name: 118, dtype: object

In [251]:
# removing rows where events is NaN
pbp4 = pbp3[pd.notnull(pbp3['event'])]
pbp4.iloc[118]

time                    Time
awayteam        Philadelphia
hometeam         LA Clippers
event           Philadelphia
isawayevent                1
awayscore                 36
homescore                 30
away_timeout               0
home_timeout               0
Name: 119, dtype: object

In [252]:
# setting 1 if rebound included else 0
reb = []
for i in pbp4['event']:
    if 'rebound' in i:
        reb.append(1)
    else: 
        reb.append(0)
pbp4['is_rebound'] = reb
pbp4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,time,awayteam,hometeam,event,isawayevent,awayscore,homescore,away_timeout,home_timeout,is_rebound
0,12:00.0,Philadelphia,LA Clippers,Start of 1st quarter,1,0,0,0,0,0
1,12:00.0,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1,0,0,0,0,0
2,11:44.0,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1,0,0,0,0,0
3,11:41.0,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0,0,0,0,0,1
4,11:33.0,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0,0,3,0,0,0


In [253]:
# finding player that did the offensive rebound
vals = []
off_reb = []
for i in pbp4['event']:
    if 'Offensive rebound' in i:
        vals.append(i)
        uff = i.split('by', 1)[1]
        off_reb.append(uff)
    else: 
        off_reb.append("")
pbp4['offensive_rebound'] = off_reb
pbp4.head()

# finding if away team did the offensive rebound
pbp['awayevents']
pbbp = pbp[pd.notnull(pbp['awayevents'])]
away_vals = pbbp['awayevents'].values

away_match = []
for i in pbp4['event']:
    if 'Offensive rebound' in i:
        if i in away_vals: 
            away_match.append(1)
        else: 
            away_match.append(0)
    else: 
        away_match.append(0)

pbp4['away_offensive_rebound'] = away_match

# finding if home team did the offensive rebound
pbp['homeevents']
pbbbp = pbp[pd.notnull(pbp['homeevents'])]
home_vals = pbbbp['homeevents'].values

home_match = []
for i in pbp4['event']:
    if 'Offensive rebound' in i:
        if i in home_vals: 
            home_match.append(1)
        else: 
            home_match.append(0)
    else: 
        home_match.append(0)

pbp4['home_offensive_rebound'] = home_match
pbp4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,time,awayteam,hometeam,event,isawayevent,awayscore,homescore,away_timeout,home_timeout,is_rebound,offensive_rebound,away_offensive_rebound,home_offensive_rebound
0,12:00.0,Philadelphia,LA Clippers,Start of 1st quarter,1,0,0,0,0,0,,0,0
1,12:00.0,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1,0,0,0,0,0,,0,0
2,11:44.0,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1,0,0,0,0,0,,0,0
3,11:41.0,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0,0,0,0,0,1,,0,0
4,11:33.0,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0,0,3,0,0,0,,0,0


In [254]:
st = "J. Redick misses 3-pt shot from 25 ft"
ai = st.split('-pt', 1)[0]
points = ai.split('misses ',1)[1]
ai1 = st.split('from ', 1)[1]
feet = ai1.split(' ft', 1)[0]
player = st.split(' misses', 1)[0]

In [255]:
# miss shot events
miss_points = []
miss_feet = []
miss_player = []
for i in pbp4['event']:
    if 'misses' in i:
        st = "J. Redick misses 3-pt shot from 25 ft"
        ai = st.split('-pt', 1)[0]
        points = ai.split('misses ',1)[1]
        ai1 = st.split('from ', 1)[1]
        feet = ai1.split(' ft', 1)[0]
        player = st.split(' misses', 1)[0]
        miss_points.append(points)
        miss_feet.append(feet)
        miss_player.append(player)
    else: 
        miss_points.append(0)
        miss_feet.append(0)
        miss_player.append(0)

pbp4['miss_points'] = miss_points
pbp4['miss_feet'] = miss_feet
pbp4['miss_player'] = miss_player
pbp4.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,time,awayteam,hometeam,event,isawayevent,awayscore,homescore,away_timeout,home_timeout,is_rebound,offensive_rebound,away_offensive_rebound,home_offensive_rebound,miss_points,miss_feet,miss_player
0,12:00.0,Philadelphia,LA Clippers,Start of 1st quarter,1,0,0,0,0,0,,0,0,0,0,0
1,12:00.0,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1,0,0,0,0,0,,0,0,0,0,0
2,11:44.0,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1,0,0,0,0,0,,0,0,3,25,J. Redick
3,11:41.0,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0,0,0,0,0,1,,0,0,0,0,0
4,11:33.0,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0,0,3,0,0,0,,0,0,0,0,0


In [213]:
# finding player that did the defensive rebound
def_reb = []
for i in pbp4['event']:
    if 'Defensive rebound' in i:
        uf = i.split('by', 1)[1]
        def_reb.append(uf)
    else: 
        def_reb.append("")
pbp4['defensive_rebound'] = def_reb

# finding if away team did the defensive rebound
away_match1 = []
for i in pbp4['event']:
    if 'Defensive rebound' in i:
        if i in away_vals: 
            away_match1.append(1)
        else: 
            away_match1.append(0)
    else: 
        away_match1.append(0)

pbp4['away_defensive_rebound'] = away_match1

# finding if home team did the defensive rebound
home_match1 = []
for i in pbp4['event']:
    if 'Defensive rebound' in i:
        if i in home_vals: 
            home_match1.append(1)
        else: 
            home_match1.append(0)
    else: 
        home_match1.append(0)

pbp4['home_defensive_rebound'] = home_match1
pbp4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,time,awayteam,hometeam,event,isawayevent,awayscore,homescore,away_timeout,home_timeout,is_rebound,offensive_rebound,away_offensive_rebound,home_offensive_rebound,defensive_rebound,away_defensive_rebound,home_defensive_rebound,miss_3_25,away_miss_3_25,home_miss_3_25
0,12:00.0,Philadelphia,LA Clippers,Start of 1st quarter,1,0,0,0,0,0,,0,0,,0,0,,0,0
1,12:00.0,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1,0,0,0,0,0,,0,0,,0,0,,0,0
2,11:44.0,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1,0,0,0,0,0,,0,0,,0,0,J. Redick,1,0
3,11:41.0,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0,0,0,0,0,1,,0,0,D. Jordan,0,1,,0,0
4,11:33.0,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0,0,3,0,0,0,,0,0,,0,0,,0,0


In [215]:
# finding players that missed 3-pt shot from 25 ft
p325 = []
for i in pbp4['event']:
    if 'misses 3-pt shot from 25 ft' in i:
        pp = i.split(' misses', 1)[0]
        p325.append(pp)
    else: 
        p325.append("")
pbp4['miss_3_25'] = p325

# finding if away players missed 3-pt shot from 25 ft
away_match2 = []
for i in pbp4['event']:
    if 'misses 3-pt shot from 25 ft' in i:
        if i in away_vals: 
            away_match2.append(1)
        else: 
            away_match2.append(0)
    else: 
        away_match2.append(0)
pbp4['away_miss_3_25'] = away_match2

# finding if home players missed 3-pt shot from 25 ft
home_match2 = []
for i in pbp4['event']:
    if 'misses 3-pt shot from 25 ft' in i:
        if i in home_vals: 
            home_match2.append(1)
        else: 
            home_match2.append(0)
    else: 
        home
        home_match2.append(0)
pbp4['home_miss_3_25'] = home_match2        
pbp4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,time,awayteam,hometeam,event,isawayevent,awayscore,homescore,away_timeout,home_timeout,is_rebound,offensive_rebound,away_offensive_rebound,home_offensive_rebound,defensive_rebound,away_defensive_rebound,home_defensive_rebound,miss_3_25,away_miss_3_25,home_miss_3_25
0,12:00.0,Philadelphia,LA Clippers,Start of 1st quarter,1,0,0,0,0,0,,0,0,,0,0,,0,0
1,12:00.0,Philadelphia,LA Clippers,Jump ball: D. Jordan vs. J. Embiid (D. Saric g...,1,0,0,0,0,0,,0,0,,0,0,,0,0
2,11:44.0,Philadelphia,LA Clippers,J. Redick misses 3-pt shot from 25 ft,1,0,0,0,0,0,,0,0,,0,0,J. Redick,1,0
3,11:41.0,Philadelphia,LA Clippers,Defensive rebound by D. Jordan,0,0,0,0,0,1,,0,0,D. Jordan,0,1,,0,0
4,11:33.0,Philadelphia,LA Clippers,A. Rivers makes 3-pt shot from 27 ft (assist b...,0,0,3,0,0,0,,0,0,,0,0,,0,0


In [221]:
# finding players that missed 2-pt shot from 14 ft
p214 = []
for i in pbp4['event']:
    if 'misses 2-pt shot from 14 ft' in i:
        ppp = i.split(' misses', 1)[0]
        p214.append(ppp)
    else: 
        p214.append("")
pbp4['miss_2_14'] = p214

# finding if away players missed 2-pt shot from 14 ft
away_match3 = []
for i in pbp4['event']:
    if 'misses 2-pt shot from 14 ft' in i:
        if i in away_vals: 
            away_match3.append(1)
        else: 
            away_match3.append(0)
    else: 
        away_match3.append(0)
pbp4['away_miss_2_14'] = away_match3

# finding if home players missed 2-pt shot from 14 ft
home_match3 = []
for i in pbp4['event']:
    if 'misses 2-pt shot from 14 ft' in i:
        if i in home_vals: 
            home_match3.append(1)
        else: 
            home_match3.append(0)
    else: 
        home
        home_match3.append(0)
pbp4['home_miss_2_14'] = home_match3      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [190]:
#should create lots of variables, like:
#quarter (how to treat overtime?)
#isassist
#assistplayer
#shooter
#is3ptshot...

In [5]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.basketball-reference.com/leagues/NBA_2018_games-october.html'
response = requests.get(url)

soup = BeautifulSoup(response.text, 'lxml')    
links = []
for ref in soup.find_all('a'):
    link = ref.get('href')
    if link.startswith('/boxscores/2'):
        links.append(link)
print(len(links))
links[0:2]

urls = []
for link in links:
    urls.append('https://www.basketball-reference.com' + link)
print(urls[0:2])

104
['https://www.basketball-reference.com/boxscores/201710170CLE.html', 'https://www.basketball-reference.com/boxscores/201710170GSW.html']
