In [1]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup, Comment
import re
import time
import pandasql as psql

In [2]:
# functions
def get_drive_table(team, soup):
    """Extract one side's drive table from a parsed box-score page.

    The table is looked up inside the first HTML comment found within the
    wrapper ``<div>`` whose id is ``all_home_drives`` / ``all_vis_drives``;
    that comment is re-parsed and the ``<table>`` with id ``home_drives`` /
    ``vis_drives`` is read from it.

    Parameters
    ----------
    team : str
        Either ``'home'`` or ``'vis'``; selects which pair of element ids
        is searched.
    soup : bs4.BeautifulSoup
        Parsed page to search (only ``soup.find`` is called on it).

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` in the table body, with columns named after the
        ``<th>`` cells of the first header row, plus a ``team`` column
        holding the first whitespace-separated word of the wrapper's
        ``<h2>`` text. If the wrapper, the comment, or the table cannot be
        found, an empty frame containing only the ``team`` column is
        returned instead of raising.

    Raises
    ------
    ValueError
        If ``team`` is neither ``'home'`` nor ``'vis'``.
    """
    table_ids = {
        'home': ('all_home_drives', 'home_drives'),
        'vis': ('all_vis_drives', 'vis_drives'),
    }
    if team not in table_ids:
        raise ValueError(f"team must be 'home' or 'vis', got {team!r}")
    wrapper_id, table_id = table_ids[team]

    # Defaults so that every "not found" path still yields a valid frame.
    team_name = None
    column_headers = []
    drive_data = []

    parent = soup.find('div', {'id': wrapper_id})
    if parent:
        # Team label: first word of the wrapper heading, when there is one.
        heading = parent.find('h2')
        words = heading.get_text(strip=False).split() if heading else []
        if words:
            team_name = words[0]
            print(team_name)

        # The table markup is stored inside an HTML comment, so it has to be
        # parsed as a separate document.
        div = parent.find(string=lambda text: isinstance(text, Comment))
        if div:
            comment_soup = BeautifulSoup(str(div), 'html.parser')
            table = comment_soup.find('table', {'id': table_id})
            if table:
                cols = table.find('thead').find('tr')
                column_headers = [th.get_text(strip=True) for th in cols.find_all('th')]
                drive_data = [
                    [cell.get_text(strip=False) for cell in row.find_all(['th', 'td'])]
                    for row in table.find('tbody').find_all('tr')
                ]

    drives_df = pd.DataFrame(columns=column_headers, data=drive_data)
    drives_df['team'] = team_name
    return drives_df


def scrape_pbp(game_page_soup):
    """Extract the play-by-play table from a parsed box-score page.

    The table is looked up inside the first HTML comment found within the
    ``<div class="table_wrapper" id="all_pbp">`` element; that comment is
    re-parsed and the ``<table id="pbp">`` is read from it.

    Parameters
    ----------
    game_page_soup : bs4.BeautifulSoup
        Parsed page to search (only ``.find`` is called on it).

    Returns
    -------
    pandas.DataFrame
        One row per body ``<tr>`` selected with the ``{'class': ''}``
        filter, with columns named after the ``<th>`` cells of the first
        header row. If the wrapper, the comment, or the table is missing,
        an empty frame with no columns is returned instead of raising.
    """
    # Defaults so that every "not found" path still yields a valid frame.
    column_headers = []
    pbp_rows = []

    pbp_parent = game_page_soup.find('div', {'class': 'table_wrapper', 'id': 'all_pbp'})
    if pbp_parent:
        # The table markup is stored inside an HTML comment, so it has to be
        # parsed as a separate document.
        pbp_div = pbp_parent.find(string=lambda text: isinstance(text, Comment))
        if pbp_div:
            comment_soup = BeautifulSoup(str(pbp_div), 'html.parser')
            pbp_table = comment_soup.find('table', {'id': 'pbp'})
            if pbp_table:
                cols = pbp_table.find('thead').find('tr')
                column_headers = [th.get_text(strip=True) for th in cols.find_all('th')]
                plays = pbp_table.find('tbody').find_all('tr', {'class': ''})
                pbp_rows = [
                    [cell.get_text(strip=False) for cell in play.find_all(['th', 'td'])]
                    for play in plays
                ]

    return pd.DataFrame(columns=column_headers, data=pbp_rows)


def elapsed_time(time_list, quarter_length=15):
    """Compute the gap between each clock reading and the next one.

    Readings are presumably minutes remaining on the period clock (callers
    in this notebook build them as ``minute + seconds / 60``) -- confirm
    before reusing with other units.

    For each consecutive pair ``(current, following)``:

    * ``current > following``  -> ``current - following``
    * ``current == following`` -> ``0``
    * otherwise the clock is treated as having reset between the two
      readings: ``current + (quarter_length - following)``

    The final reading has no successor, so its own value is appended as the
    last entry.

    Parameters
    ----------
    time_list : pandas.Series, numpy.ndarray or sequence of numbers
        Clock readings in the order they should be compared.
    quarter_length : number, default 15
        Value the clock restarts from when it resets.

    Returns
    -------
    list
        Same length as ``time_list``; empty when ``time_list`` is empty.
    """
    times = time_list.tolist() if hasattr(time_list, 'tolist') else list(time_list)
    if not times:
        # Nothing to compare; avoids indexing the last element of an empty list.
        return []

    elapsed_times = []
    for current, following in zip(times, times[1:]):
        if current > following:
            elapsed_times.append(current - following)
        elif current == following:
            elapsed_times.append(0)
        else:
            # Clock went up: finish the current period, then count down from
            # the top of the next one.
            elapsed_times.append(current + (quarter_length - following))

    elapsed_times.append(times[-1])
    return elapsed_times


def game_time(elapsed_time):
    """Turn per-entry durations into cumulative start offsets.

    The first entry starts at 0 and every later entry starts at the previous
    start plus the previous duration. The last duration is therefore never
    added to anything.

    Parameters
    ----------
    elapsed_time : pandas.Series, numpy.ndarray or sequence of numbers
        Duration of each entry, in order.

    Returns
    -------
    list
        Start offsets, same length as ``elapsed_time``; empty when
        ``elapsed_time`` is empty so the result can always be assigned back
        as a column of matching length.
    """
    durations = elapsed_time.tolist() if hasattr(elapsed_time, 'tolist') else list(elapsed_time)
    if not durations:
        return []

    real_game_time = [0]
    # `duration` rather than `time`, so the imported `time` module is not shadowed.
    for duration in durations[:-1]:
        real_game_time.append(real_game_time[-1] + duration)
    return real_game_time

In [3]:
# scraping 1 game
# Placeholder; rebound to the scraped play-by-play frame in a later cell.
pbp_data = []

game_url = 'https://www.pro-football-reference.com/boxscores/202309070kan.htm'
# A timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(game_url, timeout=30)
print(f'status: {r.status_code}')
# Stop here on an HTTP error instead of parsing an error page as a box score.
r.raise_for_status()

# Name the parser explicitly (the same one the helper functions use) so the
# parse does not depend on which optional parsers happen to be installed.
game_page_soup = BeautifulSoup(r.text, 'html.parser')

status: 200


In [4]:
# Drive tables for both sides of the matchup
home_drives = get_drive_table('home', game_page_soup)
vis_drives = get_drive_table('vis', game_page_soup)

# Team labels are read from the first row of each table's 'team' column
home_team = home_drives['team'][0]
vis_team = vis_drives['team'][0]

# Stack both tables, derive a decimal clock value from the 'Time' text,
# order the drives by quarter (ascending) and clock (descending), then turn
# the clock readings into per-drive durations and cumulative start offsets.
# The helper columns used along the way are dropped at the end.
drives = (
    pd.concat([home_drives, vis_drives], axis=0)
    .assign(
        Quarter=lambda d: d['Quarter'].astype(int),
        minute=lambda d: d['Time'].str.extract(r'([0-9]+):').astype(int),
        seconds=lambda d: d['Time'].str.extract(r'[0-9]+:([0-9]+)').astype(int),
        seconds_ratio=lambda d: (d['seconds'] / 60).astype(float),
        Numeric_time=lambda d: d['minute'] + d['seconds_ratio'],
    )
    .sort_values(by=['Quarter', 'Numeric_time'], ascending=[True, False])
    .reset_index()
    .assign(
        drive_time=lambda d: elapsed_time(d['Numeric_time']),
        drive_start_time=lambda d: game_time(d['drive_time']),
    )
    .drop(columns=['minute', 'seconds', 'seconds_ratio', 'index', '#', 'Numeric_time', 'drive_time'])
)

Chiefs
Lions
[0]


In [5]:
# scraping pbp data
pbp_data = scrape_pbp(game_page_soup)

# Setting up receiving and kicking teams
# Row 0 is treated as the coin-toss line and removed from the play rows.
coin_toss = pbp_data.iloc[0]
pbp_data = pbp_data.drop(0)
# The row is labelled by column name, so the eighth cell must be taken by
# position with .iloc; plain [7] relies on deprecated positional fallback.
coin_toss = coin_toss.iloc[7]
teams = re.findall(r'\b[A-Z][a-zA-Z]*\b', coin_toss)
match = re.search(r"(\w+)\s+to\s+receive\s+the\s+opening\s+kickoff", coin_toss)
# Always define receiving_team; None means the phrase was not found.
receiving_team = match.group(1) if match else None

# dropping timeouts
# Rows whose 'Location' is blank or missing are removed.
pbp_data = pbp_data[pbp_data['Location'].str.strip() != '']
pbp_data = pbp_data.dropna(subset=['Location'])

print(f'{home_team} vs. {vis_team} \n total plays: {len(pbp_data)}')

Chiefs vs. Lions 
 total plays: 132


In [6]:
# General Cleaning
# Cast the quarter, split 'Location' into its letter and number parts,
# derive a decimal clock value from the 'Time' text, and turn the clock
# readings into per-play durations and cumulative start offsets. The helper
# columns used along the way are dropped at the end.
pbp_data = (
    pbp_data
    .assign(
        Quarter=lambda d: d['Quarter'].astype(int),
        field_side=lambda d: d['Location'].str.extract(r'([A-Z]+)'),
        yardline=lambda d: d['Location'].str.extract(r'([0-9]+)').astype(int),
        minute=lambda d: d['Time'].str.extract(r'([0-9]+):').astype(int),
        seconds=lambda d: d['Time'].str.extract(r'[0-9]+:([0-9]+)').astype(int),
        seconds_ratio=lambda d: (d['seconds'] / 60).astype(float),
        Numeric_time=lambda d: d['minute'] + d['seconds_ratio'],
        play_time=lambda d: elapsed_time(d['Numeric_time']),
        play_start_time=lambda d: game_time(d['play_time']),
    )
    .drop(columns=['minute', 'seconds', 'seconds_ratio', 'Numeric_time', 'play_time'])
)

[0]
