In [360]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup, Comment
import re
import time
import pandasql as psql

In [361]:
# functions
def get_drive_table(team, soup):
    """Extract one team's drive table from a parsed box-score page.

    The table is searched for inside an HTML comment nested in a wrapper
    ``div``, so the comment text is re-parsed before the table lookup.

    Parameters
    ----------
    team : str
        ``'home'`` or ``'vis'``; selects which wrapper/table ids are used.
    soup : bs4.BeautifulSoup
        Parsed box-score page.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the table body, with the table's header cells
        as column names, plus a ``team`` column holding the first word of the
        wrapper's ``<h2>`` heading. If the wrapper, the comment or the table
        cannot be found, an empty frame (still carrying a ``team`` column) is
        returned instead of raising.

    Raises
    ------
    ValueError
        If ``team`` is neither ``'home'`` nor ``'vis'``.
    """
    table_ids = {
        'home': ('all_home_drives', 'home_drives'),
        'vis': ('all_vis_drives', 'vis_drives'),
    }
    if team not in table_ids:
        raise ValueError(f"team must be 'home' or 'vis', got {team!r}")
    wrapper_id, table_id = table_ids[team]

    # Defaults so the final DataFrame construction works even when nothing is found.
    column_headers = []
    drive_data = []
    team_name = None

    parent = soup.find('div', {'id': wrapper_id})
    if parent:
        # The team label is the first whitespace-separated word of the heading.
        heading = parent.find('h2')
        words = heading.get_text(strip=False).split() if heading else []
        if words:
            team_name = words[0]
            print(team_name)

        # The table markup sits inside an HTML comment; parse that comment on its own.
        div = parent.find(string=lambda text: isinstance(text, Comment))
        if div:
            comment_soup = BeautifulSoup(str(div), 'html.parser')
            table = comment_soup.find('table', {'id': table_id})
            if table:
                cols = table.find('thead').find('tr')
                column_headers = [th.get_text(strip=True) for th in cols.find_all('th')]
                for drive in table.find('tbody').find_all('tr'):
                    cells = drive.find_all(['th', 'td'])
                    drive_data.append([cell.get_text(strip=False) for cell in cells])

    drives_df = pd.DataFrame(columns=column_headers, data=drive_data)
    drives_df['team'] = team_name
    return drives_df


def scrape_pbp(game_page_soup):
    """Extract the play-by-play table from a parsed box-score page.

    The table is searched for inside an HTML comment nested in the
    ``all_pbp`` wrapper ``div``; the comment text is re-parsed to reach it.

    Parameters
    ----------
    game_page_soup : bs4.BeautifulSoup
        Parsed box-score page.

    Returns
    -------
    pandas.DataFrame
        One row per body ``<tr>`` matched by the ``class == ''`` filter, with
        the table's header cells as column names. If the wrapper, the comment
        or the table cannot be found, an empty frame is returned instead of
        raising.
    """
    # Defaults so the final DataFrame construction works even when nothing is found.
    column_headers = []
    play_rows = []

    pbp_parent = game_page_soup.find('div', {'class': 'table_wrapper', 'id': 'all_pbp'})
    if pbp_parent:
        # The table markup sits inside an HTML comment; parse that comment on its own.
        pbp_div = pbp_parent.find(string=lambda text: isinstance(text, Comment))
        if pbp_div:
            comment_soup = BeautifulSoup(str(pbp_div), 'html.parser')
            pbp_table = comment_soup.find('table', {'id': 'pbp'})
            if pbp_table:
                cols = pbp_table.find('thead').find('tr')
                column_headers = [th.get_text(strip=True) for th in cols.find_all('th')]
                for play in pbp_table.find('tbody').find_all('tr', {'class': ''}):
                    cells = play.find_all(['th', 'td'])
                    play_rows.append([cell.get_text(strip=False) for cell in cells])

    return pd.DataFrame(columns=column_headers, data=play_rows)


def elapsed_time(time_list, quarter_length=15):
    """Compute the time consumed between consecutive clock readings.

    Parameters
    ----------
    time_list : sequence of numbers
        Clock readings in chronological order (list, pandas Series, ...).
        Values are read by position, so a Series' index labels are ignored.
    quarter_length : number, optional
        Value the clock restarts from; used when a reading is not greater
        than the one that follows it. Defaults to 15.

    Returns
    -------
    list
        Same length as ``time_list``. Entry ``i`` is ``t[i] - t[i+1]`` when
        ``t[i] > t[i+1]``; otherwise ``t[i] + (quarter_length - t[i+1])``
        (note that two equal readings therefore yield ``quarter_length``).
        The final entry is the last reading itself. An empty input yields
        an empty list.
    """
    clock_values = list(time_list)
    if not clock_values:
        return []

    elapsed_times = []
    for current, following in zip(clock_values, clock_values[1:]):
        if current > following:
            elapsed_times.append(current - following)
        else:
            # Clock went up (or stayed equal): treat it as a restart at quarter_length.
            elapsed_times.append(current + (quarter_length - following))
    # Nothing follows the last reading, so its own value is used as its duration.
    elapsed_times.append(clock_values[-1])
    return elapsed_times


def game_time(elapsed_time):
    """Return the running total of a sequence of durations.

    Parameters
    ----------
    elapsed_time : sequence of numbers
        Durations in order (list, pandas Series, ...). Values are read by
        position, so a Series' index labels are ignored.

    Returns
    -------
    list
        Same length as the input; entry ``i`` is the sum of the first
        ``i + 1`` durations. An empty input yields an empty list.
    """
    real_game_time = []
    running_total = 0
    for duration in list(elapsed_time):
        running_total = running_total + duration
        real_game_time.append(running_total)
    return real_game_time

In [362]:
# scraping 1 game
pbp_data = []

game_url = 'https://www.pro-football-reference.com/boxscores/202309070kan.htm'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
r = requests.get(game_url, timeout=30)
print(f'status: {r.status_code}')
# Stop on an HTTP error status instead of parsing an error page.
r.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
game_page_soup = BeautifulSoup(r.text, 'html.parser')

status: 200


In [363]:
# scraping drive data for home and away team
home_drives = get_drive_table('home', game_page_soup)
vis_drives = get_drive_table('vis', game_page_soup)

# getting home and away team variables
# (.iloc[0] reads the first row by position, independent of index labels)
home_team = home_drives['team'].iloc[0]
vis_team = vis_drives['team'].iloc[0]

# Combine both teams' drives, convert the "M:SS" clock text to decimal minutes,
# and order the drives by quarter with the clock counting down inside each quarter.
drives = (
    pd.concat([home_drives, vis_drives], axis=0)
    .assign(
        Quarter=lambda df: df['Quarter'].astype(int),
        Numeric_time=lambda df: (
            df['Time'].str.extract(r'([0-9]+):', expand=False).astype(int)
            + df['Time'].str.extract(r'[0-9]+:([0-9]+)', expand=False).astype(int) / 60
        ),
    )
    .sort_values(by=['Quarter', 'Numeric_time'], ascending=[True, False])
    # drop=True discards the old (duplicated) index instead of keeping it as a column.
    .reset_index(drop=True)
    .drop(columns=['#'])
)

Chiefs
Lions


In [364]:
# Display the combined drives table, sorted by quarter and clock
drives

Unnamed: 0,Quarter,Time,LOS,Plays,Length,Net Yds,Result,team,Numeric_time
0,1,15:00,DET 25,3,0:51,7,Punt,Lions,15.0
1,1,14:09,KAN 13,6,3:19,25,Punt,Chiefs,14.15
2,1,10:50,DET 9,14,8:04,91,Touchdown,Lions,10.833333
3,1,2:46,KAN 25,13,5:56,75,Touchdown,Chiefs,2.766667
4,2,11:50,DET 25,11,6:36,61,Fumble,Lions,11.833333
5,2,5:14,KAN 7,3,1:27,5,Punt,Chiefs,5.233333
6,2,3:47,DET 41,3,1:45,2,Punt,Lions,3.783333
7,2,2:02,KAN 18,6,1:28,82,Touchdown,Chiefs,2.033333
8,2,0:34,DET 25,7,0:34,17,Downs,Lions,0.566667
9,3,15:00,KAN 25,5,2:06,14,Punt,Chiefs,15.0


In [365]:
# scraping pbp data
pbp_data = scrape_pbp(game_page_soup)

# Setting up receiving and kicking teams
# The first scraped row is used as the coin-toss row and removed from the plays.
coin_toss = pbp_data.iloc[0]
pbp_data = pbp_data.drop(0)
# Read the 8th cell by position; a plain integer key on a label-indexed Series
# is deprecated as a positional lookup, so go through .iloc.
coin_toss = coin_toss.iloc[7]
teams = re.findall(r'\b[A-Z][a-zA-Z]*\b', coin_toss)
match = re.search(r"(\w+)\s+to\s+receive\s+the\s+opening\s+kickoff", coin_toss)
# Always bind receiving_team so a non-matching row leaves None rather than an
# undefined (or stale) name.
receiving_team = match.group(1) if match else None

# dropping timeouts
# (rows whose Location is blank or missing)
pbp_data = pbp_data[pbp_data['Location'].str.strip() != '']
pbp_data = pbp_data.dropna(subset=['Location'])

print(f'{home_team} vs. {vis_team} \n total plays: {len(pbp_data)}')

Chiefs vs. Lions 
 total plays: 132


In [366]:
# General Cleaning
# Cast the quarter to int, split Location into its letter and digit parts, and
# turn the "M:SS" clock text into decimal minutes, all in a single pass so no
# temporary columns have to be created and dropped afterwards.
pbp_data = pbp_data.assign(
    Quarter=lambda df: df['Quarter'].astype(int),
    field_side=lambda df: df['Location'].str.extract(r'([A-Z]+)', expand=False),
    yardline=lambda df: df['Location'].str.extract(r'([0-9]+)', expand=False).astype(int),
    Numeric_time=lambda df: (
        df['Time'].str.extract(r'([0-9]+):', expand=False).astype(int)
        + (df['Time'].str.extract(r'[0-9]+:([0-9]+)', expand=False).astype(int) / 60).astype(float)
    ),
)


In [367]:
# Duration of each drive, derived from consecutive drive start clocks
drives['drive_time'] = elapsed_time(drives['Numeric_time'])
# Running total of the drive durations
drives['cumul_time'] = game_time(drives['drive_time'])
# Display the drives table with the two new columns
drives


Unnamed: 0,Quarter,Time,LOS,Plays,Length,Net Yds,Result,team,Numeric_time,drive_time,cumul_time
0,1,15:00,DET 25,3,0:51,7,Punt,Lions,15.0,0.85,0.85
1,1,14:09,KAN 13,6,3:19,25,Punt,Chiefs,14.15,3.316667,4.166667
2,1,10:50,DET 9,14,8:04,91,Touchdown,Lions,10.833333,8.066667,12.233333
3,1,2:46,KAN 25,13,5:56,75,Touchdown,Chiefs,2.766667,5.933333,18.166667
4,2,11:50,DET 25,11,6:36,61,Fumble,Lions,11.833333,6.6,24.766667
5,2,5:14,KAN 7,3,1:27,5,Punt,Chiefs,5.233333,1.45,26.216667
6,2,3:47,DET 41,3,1:45,2,Punt,Lions,3.783333,1.75,27.966667
7,2,2:02,KAN 18,6,1:28,82,Touchdown,Chiefs,2.033333,1.466667,29.433333
8,2,0:34,DET 25,7,0:34,17,Downs,Lions,0.566667,0.566667,30.0
9,3,15:00,KAN 25,5,2:06,14,Punt,Chiefs,15.0,2.1,32.1


In [368]:
# Ad-hoc check: recompute the running totals directly from drive_time
game_time(drives['drive_time'])

[0.8499999999999996,
 4.166666666666666,
 12.233333333333333,
 18.166666666666664,
 24.766666666666666,
 26.216666666666665,
 27.966666666666665,
 29.43333333333333,
 29.999999999999996,
 32.099999999999994,
 33.266666666666666,
 34.099999999999994,
 36.39999999999999,
 39.26666666666666,
 42.83333333333333,
 43.89999999999999,
 47.816666666666656,
 52.89999999999999,
 54.883333333333326,
 57.49999999999999,
 57.99999999999999,
 59.99999999999999]