In [151]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup, Comment
import re
import time
import pandasql as psql

In [152]:
# functions
def get_drive_table(team, soup):
    """Extract one team's drive table from a parsed box-score page.

    The table is looked up inside an HTML comment nested in the team's
    ``all_<team>_drives`` wrapper div, so the comment text is re-parsed with
    BeautifulSoup before the ``<table>`` is searched for.

    Parameters
    ----------
    team : str
        ``'home'`` or ``'vis'``; selects which wrapper/table ids are used.
    soup : BeautifulSoup
        Parsed box-score page.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the table body, with columns taken from the
        table header, plus a ``team`` column holding the first word of the
        wrapper's ``<h2>`` heading. When the wrapper, the comment or the table
        cannot be found the frame has no rows (and ``team`` may be ``None``).

    Raises
    ------
    ValueError
        If ``team`` is neither ``'home'`` nor ``'vis'``.
    """
    table_ids = {
        'home': ('all_home_drives', 'home_drives'),
        'vis': ('all_vis_drives', 'vis_drives'),
    }
    if team not in table_ids:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError(f"team must be 'home' or 'vis', got {team!r}")
    wrapper_id, table_id = table_ids[team]

    # Defaults so that a page without the expected markup yields an empty
    # frame instead of an UnboundLocalError / AttributeError.
    column_headers = []
    drive_data = []
    team_name = None

    parent = soup.find('div', {'id': wrapper_id})
    if parent is not None:
        heading = parent.find('h2')
        words = heading.get_text(strip=False).split() if heading is not None else []
        if words:
            team_name = words[0]
            print(team_name)

        # The table markup is stored as a comment node inside the wrapper.
        div = parent.find(string=lambda text: isinstance(text, Comment))
        if div:
            comment_soup = BeautifulSoup(str(div), 'html.parser')
            table = comment_soup.find('table', {'id': table_id})
            if table is not None:
                header_row = table.find('thead').find('tr')
                column_headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
                for drive in table.find('tbody').find_all('tr'):
                    cells = drive.find_all(['th', 'td'])
                    drive_data.append([cell.get_text(strip=False) for cell in cells])

    drives_df = pd.DataFrame(columns=column_headers, data=drive_data)
    drives_df['team'] = team_name
    return drives_df


def scrape_pbp(game_page_soup):
    """Extract the play-by-play table from a parsed box-score page.

    The ``pbp`` table is looked up inside an HTML comment nested in the
    ``all_pbp`` wrapper div, so the comment text is re-parsed before the
    ``<table>`` is searched for.

    Parameters
    ----------
    game_page_soup : BeautifulSoup
        Parsed box-score page.

    Returns
    -------
    pandas.DataFrame
        One row per body ``<tr>`` matched by the ``class == ''`` filter, with
        columns taken from the table header. The frame is empty when the
        wrapper, the comment or the table cannot be found.
    """
    # Defaults so that missing markup yields an empty frame rather than an
    # UnboundLocalError on ``column_headers``.
    column_headers = []
    play_rows = []

    pbp_parent = game_page_soup.find('div', {'class': 'table_wrapper', 'id': 'all_pbp'})
    if pbp_parent is not None:
        pbp_div = pbp_parent.find(string=lambda text: isinstance(text, Comment))
        if pbp_div:
            comment_soup = BeautifulSoup(str(pbp_div), 'html.parser')
            pbp_table = comment_soup.find('table', {'id': 'pbp'})
            if pbp_table is not None:
                header_row = pbp_table.find('thead').find('tr')
                column_headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
                # Only rows selected by the empty-class filter are collected.
                for play in pbp_table.find('tbody').find_all('tr', {'class': ''}):
                    cells = play.find_all(['th', 'td'])
                    play_rows.append([cell.get_text(strip=False) for cell in cells])

    return pd.DataFrame(columns=column_headers, data=play_rows)


def elapsed_time(time_list, quarter_length=15):
    """Convert a sequence of clock readings into per-item durations.

    Each reading is compared with the one that follows it:

    * if the clock went down (or stayed the same), the duration is the
      difference between the two readings;
    * if the clock went *up*, the reading is assumed to belong to the next
      period, so the duration is what was left on the clock plus what has
      already run off the next period (``quarter_length - next``).

    The last item has no successor; its duration is the reading itself.

    Parameters
    ----------
    time_list : pandas.Series, numpy.ndarray or iterable of numbers
        Clock readings in minutes, in chronological order.
    quarter_length : number, default 15
        Length of one period in the same unit as the readings.

    Returns
    -------
    list
        One duration per input reading; ``[]`` for empty input.
    """
    # Accept Series/ndarray (via tolist) as well as plain Python iterables.
    clock = time_list.tolist() if hasattr(time_list, 'tolist') else list(time_list)
    if not clock:
        return []

    elapsed_times = []
    for current, following in zip(clock, clock[1:]):
        if current >= following:
            elapsed_times.append(current - following)
        else:
            # Clock reset: the next reading is in the following period.
            elapsed_times.append(current + (quarter_length - following))

    elapsed_times.append(clock[-1])
    return elapsed_times


def game_time(elapsed_time):
    """Turn per-item durations into cumulative start times.

    The first item starts at 0 and every later item starts when the previous
    one ended, i.e. ``start[i + 1] = start[i] + elapsed_time[i]``. The last
    duration is therefore not part of any returned value.

    Parameters
    ----------
    elapsed_time : pandas.Series, numpy.ndarray or iterable of numbers
        Durations in chronological order.

    Returns
    -------
    list
        One start time per input duration; ``[]`` for empty input, so the
        result can always be assigned back as a column of the same length.
    """
    # Accept Series/ndarray (via tolist) as well as plain Python iterables.
    durations = elapsed_time.tolist() if hasattr(elapsed_time, 'tolist') else list(elapsed_time)

    real_game_time = []
    running_total = 0
    for duration in durations:
        real_game_time.append(running_total)
        running_total = running_total + duration
    return real_game_time



def play_type(play_description):
    """Classify a play from keywords in its free-text description.

    Rules are checked in this order (case-insensitive substring match):

    1. ``'pass'`` or ``'scrambles'``               -> ``'Pass'``
    2. ``'kicks'``, ``'punts'`` or ``'field goal'`` -> ``'Special Teams'``
    3. anything else                               -> ``'Run'``

    Parameters
    ----------
    play_description : str
        Text describing the play.

    Returns
    -------
    str or None
        The play category, or ``None`` when the description is not a string
        (e.g. a missing value), which previously raised ``AttributeError``.
    """
    if not isinstance(play_description, str):
        return None

    text = play_description.lower()
    if 'pass' in text or 'scrambles' in text:
        return 'Pass'
    # 'field goal' is included so that field-goal attempts, whose text does
    # not necessarily contain 'kicks', are not counted as runs.
    if 'kicks' in text or 'punts' in text or 'field goal' in text:
        return "Special Teams"
    return "Run"
        


def determine_possession(play_start, drives):
    """Return the team of the latest drive that began at or before a play.

    ``drives['drive_start_time']`` and ``drives['team']`` are walked in
    parallel; scanning stops at the first drive whose start is not
    ``<= play_start``, so the drives are expected to be in ascending start
    order. ``None`` is returned when no drive qualifies.

    A three-argument function with the same name is defined later in this
    notebook and replaces this one once that cell has run.
    """
    possessing_team = None
    start_and_team = zip(drives['drive_start_time'], drives['team'])

    for start, team_name in start_and_team:
        if not (start <= play_start):
            break
        possessing_team = team_name

    return possessing_team



def yardage_by_play(play_detail, play_type):
    """Read the yards gained on a run or pass from its description text.

    Only the first ``"for <N> yard(s)"`` phrase is used. The earlier version
    summed *every* number in the text, so unrelated figures (for example a
    field position written like ``KAN-25``, which parses as -25) were added
    to the result.

    Parameters
    ----------
    play_detail : str
        Free-text description of the play.
    play_type : str
        Play category; anything other than ``'Run'`` or ``'Pass'`` yields 0.

    Returns
    -------
    int or float
        The yardage as a float when a ``"for <N> yard(s)"`` phrase is found,
        otherwise 0 (also for non-string descriptions).
    """
    yardage = 0

    if play_type in ['Run', 'Pass'] and isinstance(play_detail, str):
        # Signed integer or decimal immediately between "for" and "yard(s)".
        gained = re.search(r'\bfor (-?\d+(?:\.\d+)?) yards?\b', play_detail)
        if gained:
            yardage = float(gained.group(1))

    return yardage



In [153]:
# scraping 1 game
# Placeholder; replaced by the scraped play-by-play frame in a later cell.
pbp_data = []

game_url = 'https://www.pro-football-reference.com/boxscores/202309070kan.htm'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(game_url, timeout=30)
print(f'status: {r.status_code}')
# Fail here on an HTTP error instead of parsing an error page further down.
r.raise_for_status()

# Name the parser explicitly, matching the 'html.parser' used by the helper
# functions, so the result does not depend on which parsers are installed.
game_page_soup = BeautifulSoup(r.text, 'html.parser')

status: 200


In [154]:
# Drive tables for the home and visiting team
home_drives = get_drive_table('home', game_page_soup)
vis_drives = get_drive_table('vis', game_page_soup)

# get_drive_table writes the same team label on every row; read it from row 0
home_team = home_drives['team'][0]
vis_team = vis_drives['team'][0]

drives = pd.concat([home_drives, vis_drives], axis=0)
drives['Quarter'] = drives['Quarter'].astype(int)

# "MM:SS" clock text -> fractional minutes
clock_text = drives['Time']
clock_minutes = clock_text.str.extract(r'([0-9]+):', expand=False).astype(int)
clock_seconds = clock_text.str.extract(r'[0-9]+:([0-9]+)', expand=False).astype(int)
drives['Numeric_time'] = clock_minutes + (clock_seconds / 60).astype(float)

# Order by quarter, then by clock reading from high to low within a quarter
drives = drives.sort_values(
    by=['Quarter', 'Numeric_time'],
    ascending=[True, False],
).reset_index(drop=True)

# Duration of each drive, then the cumulative time at which each drive starts
drives['drive_time'] = elapsed_time(drives['Numeric_time'])
drives['drive_start_time'] = game_time(drives['drive_time'])

# Keep only the columns needed downstream
drives = drives.drop(columns=['#', 'Numeric_time', 'drive_time'])

Chiefs
Lions
[0]


In [155]:
# scraping pbp data
pbp_data = scrape_pbp(game_page_soup)

# Setting up receiving and kicking teams
# The first scraped row is treated as the coin-toss entry and removed from
# the plays (it carries index label 0).
coin_toss = pbp_data.iloc[0]
pbp_data = pbp_data.drop(0)
# Positional access must go through .iloc: a bare integer key on a Series
# with string labels is deprecated and emits a FutureWarning.
coin_toss = coin_toss.iloc[7]
teams = re.findall(r'\b[A-Z][a-zA-Z]*\b', coin_toss)
# Default so the name exists even when the phrase is not found.
receiving_team = None
match = re.search(r"(\w+)\s+to\s+receive\s+the\s+opening\s+kickoff", coin_toss)
if match:
    receiving_team = match.group(1)

# dropping timeouts
# Rows with a blank Location are removed first, then rows with a missing one.
pbp_data = pbp_data[pbp_data['Location'].str.strip() != '']
pbp_data = pbp_data.dropna(subset=['Location'])

print(f'{home_team} vs. {vis_team} \n total plays: {len(pbp_data)}')

Chiefs vs. Lions 
 total plays: 132


  coin_toss = coin_toss[7]


In [156]:
# General cleaning of the play-by-play frame
pbp_data['Quarter'] = pbp_data['Quarter'].astype(int)

# Location text -> first run of capital letters and first run of digits
location_text = pbp_data['Location']
pbp_data['field_side'] = location_text.str.extract(r'([A-Z]+)', expand=False)
pbp_data['yardline'] = location_text.str.extract(r'([0-9]+)', expand=False).astype(int)

# "MM:SS" clock text -> fractional minutes
clock_text = pbp_data['Time']
clock_minutes = clock_text.str.extract(r'([0-9]+):', expand=False).astype(int)
clock_seconds = clock_text.str.extract(r'[0-9]+:([0-9]+)', expand=False).astype(int)
numeric_time = clock_minutes + (clock_seconds / 60).astype(float)

# Duration of each play, then the cumulative time at which each play starts.
# The intermediate values are kept out of the frame instead of being added
# as columns and dropped again.
play_durations = pd.Series(elapsed_time(numeric_time), index=pbp_data.index)
pbp_data['play_start_time'] = game_time(play_durations)

[0]


In [157]:
# Display the combined drive table for both teams.
drives

Unnamed: 0,Quarter,Time,LOS,Plays,Length,Net Yds,Result,team,drive_start_time
0,1,15:00,DET 25,3,0:51,7,Punt,Lions,0.0
1,1,14:09,KAN 13,6,3:19,25,Punt,Chiefs,0.85
2,1,10:50,DET 9,14,8:04,91,Touchdown,Lions,4.166667
3,1,2:46,KAN 25,13,5:56,75,Touchdown,Chiefs,12.233333
4,2,11:50,DET 25,11,6:36,61,Fumble,Lions,18.166667
5,2,5:14,KAN 7,3,1:27,5,Punt,Chiefs,24.766667
6,2,3:47,DET 41,3,1:45,2,Punt,Lions,26.216667
7,2,2:02,KAN 18,6,1:28,82,Touchdown,Chiefs,27.966667
8,2,0:34,DET 25,7,0:34,17,Downs,Lions,29.433333
9,3,15:00,KAN 25,5,2:06,14,Punt,Chiefs,30.0


In [162]:
# Inspect the column dtypes of the play-by-play frame.
pbp_data.dtypes

Quarter              int64
Time                object
Down                object
ToGo                object
Location            object
DET                 object
KAN                 object
Detail              object
EPB                 object
EPA                 object
field_side          object
yardline             int64
play_start_time    float64
Play_Type           object
dtype: object

In [159]:
# Preview the free-text play descriptions.
pbp_data['Detail']

1         Harrison Butker kicks off 65 yards, touchback.
2      David Montgomery up the middle for 7 yards (ta...
3                 Jared Goff pass incomplete short right
4      Jared Goff pass incomplete deep right intended...
5      Jack Fox punts 61 yards, returned by Richie Ja...
                             ...                        
135    David Montgomery right guard for 3 yards (tack...
137    David Montgomery up the middle for 2 yards (ta...
139                       Jared Goff kneels for -1 yards
140                       Jared Goff kneels for -1 yards
141                       Jared Goff kneels for -1 yards
Name: Detail, Length: 132, dtype: object

In [160]:
# Label every play by running play_type over its description text
pbp_data['Play_Type'] = pbp_data['Detail'].map(play_type)

pbp_data['Play_Type']

1      Special Teams
2                Run
3               Pass
4               Pass
5      Special Teams
           ...      
135              Run
137              Run
139              Run
140              Run
141              Run
Name: Play_Type, Length: 132, dtype: object

In [206]:
# Team abbreviations; position i here pairs with position i of mascot_data.
team_data = [
    'MIA', 'BUF', 'NYJ', 'NWE',
    'PHI', 'DAL', 'NYG', 'WAS',
    'BAL', 'PIT', 'CLE', 'CIN',
    'DET', 'MIN', 'GNB', 'CHI',
    'JAX', 'IND', 'HOU', 'TEN',
    'ATL', 'NOR', 'TAM', 'CAR',
    'KAN', 'DEN', 'LVR', 'LAC',
    'SFO', 'SEA', 'LAR', 'ARI',
]

# Team nicknames, in the same order as team_data.
mascot_data = [
    'Dolphins', 'Bills', 'Jets', 'Patriots',
    'Eagles', 'Cowboys', 'Giants', 'Commanders',
    'Ravens', 'Steelers', 'Browns', 'Bengals',
    'Lions', 'Vikings', 'Packers', 'Bears',
    'Jaguars', 'Colts', 'Texans', 'Titans',
    'Falcons', 'Saints', 'Buccaneers', 'Panthers',
    'Chiefs', 'Broncos', 'Raiders', 'Chargers',
    '49ers', 'Seahawks', 'Rams', 'Cardinals',
]

# Two-column lookup table: abbreviation <-> nickname.
teams = pd.DataFrame(list(zip(team_data, mascot_data)), columns=['Team', 'Mascot'])

In [208]:
def determine_possession(play_start, drives, teams):
    """Return the abbreviation of the team in possession at a given time.

    The latest drive that started at or before ``play_start`` is located
    first; its ``team`` value (a mascot name) is then translated through the
    ``teams`` lookup table.

    Parameters
    ----------
    play_start : number
        Start time of the play, on the same scale as
        ``drives['drive_start_time']``.
    drives : mapping of columns (e.g. pandas.DataFrame)
        Must provide ``'drive_start_time'`` and ``'team'``, in ascending
        start order; the scan stops at the first drive that starts later
        than ``play_start``.
    teams : mapping of columns (e.g. pandas.DataFrame)
        Must provide parallel ``'Mascot'`` and ``'Team'`` columns.

    Returns
    -------
    object or None
        The ``'Team'`` entry paired with the first matching ``'Mascot'``, or
        ``None`` when no drive qualifies or the mascot is not in ``teams``.
    """
    closest_drive_team = None
    for drive_start, drive_team in zip(drives['drive_start_time'], drives['team']):
        if drive_start <= play_start:
            closest_drive_team = drive_team
        else:
            break

    # Pair the two columns by position rather than indexing teams['Team']
    # with a loop counter: that was a label lookup and failed whenever the
    # frame did not carry a default 0..n-1 index.
    mascot_to_team = zip(teams['Mascot'], teams['Team'])
    return next(
        (abbreviation for mascot, abbreviation in mascot_to_team if mascot == closest_drive_team),
        None,
    )


In [207]:
# Spot-check possession on the last plays. The call previously targeted
# `determine_possession1`, a name that is not defined anywhere in this
# notebook; the three-argument `determine_possession` is the one meant here.
test_column3 = pbp_data['play_start_time'].apply(lambda play_start: determine_possession(play_start, drives, teams))
test_column3.tail(15)

124    DET
125    DET
126    DET
127    DET
128    DET
129    KAN
130    KAN
131    KAN
132    KAN
133    KAN
135    DET
137    DET
139    DET
140    DET
141    DET
Name: play_start_time, dtype: object

In [209]:
# Possession: look up the team with the ball at each play's start time
pbp_data['possession'] = [
    determine_possession(play_start, drives, teams)
    for play_start in pbp_data['play_start_time']
]



In [147]:
# Yardage for every play, computed from its description and play type
pbp_data['Yardage'] = [
    yardage_by_play(detail, kind)
    for detail, kind in zip(pbp_data['Detail'], pbp_data['Play_Type'])
]

pbp_data['Yardage']
    



1      0.0
2      7.0
3      0.0
4      0.0
5      0.0
      ... 
135    3.0
137    2.0
139   -1.0
140   -1.0
141   -1.0
Name: Yardage, Length: 132, dtype: float64

In [148]:
def yardage_by_play(play_detail, play_type):
    """Read the yards gained on a run or pass from its description text.

    Only the first ``"for <N> yard(s)"`` phrase is used. The earlier version
    summed *every* number in the text, so unrelated figures (for example a
    field position written like ``KAN-25``, which parses as -25) were added
    to the result.

    NOTE(review): a function with this name is also defined near the top of
    the notebook; this later definition is the one in effect after this cell
    runs. Consider keeping a single definition.

    Parameters
    ----------
    play_detail : str
        Free-text description of the play.
    play_type : str
        Play category; anything other than ``'Run'`` or ``'Pass'`` yields 0.

    Returns
    -------
    int or float
        The yardage as a float when a ``"for <N> yard(s)"`` phrase is found,
        otherwise 0 (also for non-string descriptions).
    """
    yardage = 0

    if play_type in ['Run', 'Pass'] and isinstance(play_detail, str):
        # Signed integer or decimal immediately between "for" and "yard(s)".
        gained = re.search(r'\bfor (-?\d+(?:\.\d+)?) yards?\b', play_detail)
        if gained:
            yardage = float(gained.group(1))

    return yardage

In [149]:
def new_yardage(plays):
    """Experimental yardage accumulator based on yardline differences.

    NOTE(review): the body never reads its ``plays`` parameter. Every access
    goes through a name ``row`` that is not defined inside this function, so
    the call only works if a global ``row`` happens to exist and raises
    ``NameError`` otherwise. No call to this function is visible in the
    notebook. The intended input (a single play or a frame of plays) is
    unclear -- TODO confirm before reusing.

    Returns the accumulated ``yardage`` (0 when the play type is neither
    ``'Run'`` nor ``'Pass'``).
    """
    yardage = 0

    play_type = str(row['Play_Type']).strip()  # Convert to string and strip leading/trailing spaces

    if play_type in ['Run', 'Pass']:
        # Walk consecutive (current, next) pairs taken positionally from `row`.
        for i in range(1, len(row)):
            current_play = row.iloc[i - 1]
            next_play = row.iloc[i]

            if current_play['possession'] != current_play['field_side']:
                # Current play's field side differs from the possessing team:
                # first add the raw yardline difference (next - current) ...
                yardage += int(next_play['yardline']) - int(current_play['yardline'])
                if next_play['field_side'] != current_play['field_side']:
                    # ... then, if the field side changes on the next play,
                    # add the distance measured across the 50
                    yardage += (50 - int(next_play['yardline'])) + 50 - int(current_play['yardline'])
                else:
                    # ... otherwise add (current - next).
                    # NOTE(review): this cancels the (next - current) added above.
                    yardage += int(current_play['yardline']) - int(next_play['yardline'])
            else:
                # Current play's field side equals the possessing team:
                # add the raw yardline difference (next - current).
                yardage += int(next_play['yardline']) - int(current_play['yardline'])

            # Extra across-the-50 term when the next play is on the possessing
            # team's side and the field side changed between the two plays.
            if next_play['field_side'] == current_play['possession'] and current_play['field_side'] != next_play['field_side']:
                yardage += (50 - int(current_play['yardline'])) + 50 - int(next_play['yardline'])

    return yardage

           

In [263]:
# Work on a copy so pbp_data keeps its original yardline values.
play_B_play = pbp_data.copy()

# Where the possessing team differs from the field side, replace the
# yardline with 100 - yardline.
play_B_play['yardline'] = np.where(play_B_play['possession'] != play_B_play['field_side'], 100 - play_B_play['yardline'], play_B_play['yardline'])

# Yardage = next row's yardline minus this row's (NaN on the last row).
# Computed inline: calculate_yardage() is only defined in a later cell, so
# calling it here fails on a fresh top-to-bottom run.
play_B_play['yardage'] = play_B_play['yardline'].shift(-1) - play_B_play['yardline']

mask_run_pass = play_B_play['Play_Type'].isin(['Run', 'Pass'])

# Apply the condition only to rows where the mask is True
# NOTE(review): 'yardline' was already converted for every row above, so this
# second pass converts Run/Pass rows back to their original value while the
# other rows stay converted -- confirm that mix is intended.
play_B_play.loc[mask_run_pass, 'yardline'] = np.where(
    play_B_play.loc[mask_run_pass, 'possession'] != play_B_play.loc[mask_run_pass, 'field_side'],
    100 - play_B_play.loc[mask_run_pass, 'yardline'],
    play_B_play.loc[mask_run_pass, 'yardline'])
play_B_play

Unnamed: 0,Quarter,Time,Down,ToGo,Location,DET,KAN,Detail,EPB,EPA,field_side,yardline,play_start_time,Play_Type,possession,yardage
1,1,15:00,,,KAN 35,0,0,"Harrison Butker kicks off 65 yards, touchback.",0.000,0.610,KAN,65,0.000000,Special Teams,DET,-40.0
2,1,15:00,1,10,DET 25,0,0,David Montgomery up the middle for 7 yards (ta...,0.610,1.010,DET,25,0.000000,Run,DET,7.0
3,1,14:29,2,3,DET 32,0,0,Jared Goff pass incomplete short right,1.010,0.300,DET,32,0.516667,Pass,DET,0.0
4,1,14:25,3,3,DET 32,0,0,Jared Goff pass incomplete deep right intended...,0.300,-1.240,DET,32,0.583333,Pass,DET,0.0
5,1,14:19,4,3,DET 32,0,0,"Jack Fox punts 61 yards, returned by Richie Ja...",-1.240,0.320,DET,32,0.683333,Special Teams,DET,-14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,4,1:53,2,5,KAN 25,21,20,David Montgomery right guard for 3 yards (tack...,3.710,3.400,KAN,25,58.116667,Run,DET,3.0
137,4,1:47,3,2,KAN 22,21,20,David Montgomery up the middle for 2 yards (ta...,3.400,4.240,KAN,22,58.216667,Run,DET,2.0
139,4,1:42,1,10,KAN 20,21,20,Jared Goff kneels for -1 yards,4.240,3.560,KAN,20,58.300000,Run,DET,-1.0
140,4,1:05,2,11,KAN 21,21,20,Jared Goff kneels for -1 yards,3.560,2.740,KAN,21,58.916667,Run,DET,-1.0


In [285]:
def calculate_yardage(current_yardline, next_yardline):
    """Return how far the yardline moved from the current play to the next."""
    yardline_change = next_yardline - current_yardline
    return yardline_change

def yards_gained1(plays):
    """Estimate yards gained per play from consecutive yardlines.

    Steps:

    1. Work on a re-indexed copy and replace ``yardline`` with
       ``100 - yardline`` wherever ``possession`` differs from ``field_side``.
    2. For each ``'Run'``/``'Pass'`` play, subtract the play's yardline from
       the next play's yardline. If possession changes on the next play while
       the field side stays the same, the current yardline is first replaced
       by ``100 - yardline``.
    3. Every other play type gets ``0.0``.

    The look-ahead columns are built once up front; they were previously
    recomputed with ``.shift(-1)`` for every row.

    NOTE(review): the handling of possession changes is kept exactly as it
    was; whether it yields the intended yardage on turnovers is unverified.

    Parameters
    ----------
    plays : pandas.DataFrame
        Needs ``possession``, ``field_side``, ``yardline`` and ``Play_Type``.
        The caller's frame is not modified.

    Returns
    -------
    list of float
        One value per play, in row order. The last entry is NaN when the last
        play is a run or pass, because it has no following play.
    """
    plays = pd.DataFrame(plays).reset_index(drop=True)

    # If the team with the ball is on the opposite side, subtract the yardline from 100
    plays['yardline'] = np.where(plays['possession'] != plays['field_side'], 100 - plays['yardline'], plays['yardline'])

    # Next-play values for every row, built once (NaN on the last row).
    next_possession = plays['possession'].shift(-1)
    next_field_side = plays['field_side'].shift(-1)
    next_yardline = plays['yardline'].shift(-1)

    yardage_gained_list = []
    rows = zip(
        plays['Play_Type'], plays['possession'], plays['field_side'], plays['yardline'],
        next_possession, next_field_side, next_yardline,
    )
    for kind, possession, field_side, yardline, following_possession, following_side, following_yardline in rows:
        if kind not in ['Run', 'Pass']:
            yardage_gained_list.append(0.0)
            continue

        # Possession changes but the field side does not: flip the yardline.
        if possession != following_possession and field_side == following_side:
            yardline = 100 - yardline

        yardage_gained_list.append(following_yardline - yardline)
    return yardage_gained_list

# Run the yardage experiment on the full play-by-play frame; the result is a
# plain Python list with one entry per play.
test_column2 = yards_gained1(pbp_data)

In [288]:
# Print every computed yardage value for a manual check.
print(test_column2)

[0.0, 7.0, 0.0, 0.0, 0.0, 3.0, 10.0, 5.0, 0.0, 2.0, 0.0, 0.0, 5.0, 3.0, 1.0, 13.0, 21.0, 4.0, 17.0, 5.0, 6.0, 4.0, 16.0, 2.0, 0.0, 19.0, 4.0, 0.0, 6.0, 5.0, 14.0, 0.0, 16.0, 7.0, 23.0, 9.0, 2.0, 3.0, 21.0, -1.0, 14.0, 2.0, 8.0, -10.0, 10.0, -8.0, 3.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 3.0, -10.0, 34.0, -5.0, 5.0, 4.0, 9.0, -1.0, 0.0, 0.0, -17.0, 2.0, 10.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, -17.0, 7.0, 3.0, 0.0, 6.0, 0.0, 0.0, 18.0, 12.0, 2.0, 5.0, 0.0, 0.0, 9.0, 9.0, 3.0, 41.0, 8.0, 0.0, 8.0, 0.0, -12.0, 20.0, 0.0, 0.0, 24.0, 7.0, 4.0, 9.0, -1.0, 4.0, 5.0, -2.0, -5.0, 18.0, 8.0, 1.0, 3.0, 33.0, 11.0, 6.0, 3.0, -3.0, 0.0, 18.0, 2.0, 3.0, 3.0, 0.0, -10.0, 0.0, 0.0, -5.0, 5.0, 3.0, 2.0, -1.0, -1.0, nan]


In [281]:
# Preview the first 24 plays of the play-by-play frame.
pbp_data.head(24)

Unnamed: 0,Quarter,Time,Down,ToGo,Location,DET,KAN,Detail,EPB,EPA,field_side,yardline,play_start_time,Play_Type,possession
1,1,15:00,,,KAN 35,0,0,"Harrison Butker kicks off 65 yards, touchback.",0.0,0.61,KAN,35,0.0,Special Teams,DET
2,1,15:00,1.0,10.0,DET 25,0,0,David Montgomery up the middle for 7 yards (ta...,0.61,1.01,DET,25,0.0,Run,DET
3,1,14:29,2.0,3.0,DET 32,0,0,Jared Goff pass incomplete short right,1.01,0.3,DET,32,0.516667,Pass,DET
4,1,14:25,3.0,3.0,DET 32,0,0,Jared Goff pass incomplete deep right intended...,0.3,-1.24,DET,32,0.583333,Pass,DET
5,1,14:19,4.0,3.0,DET 32,0,0,"Jack Fox punts 61 yards, returned by Richie Ja...",-1.24,0.32,DET,32,0.683333,Special Teams,DET
6,1,13:35,2.0,5.0,KAN 18,0,0,Clyde Edwards-Helaire right end for 3 yards (t...,-0.14,-0.47,KAN,18,1.416667,Run,KAN
7,1,12:57,3.0,2.0,KAN 21,0,0,Patrick Mahomes pass complete short left to Je...,-0.47,1.0,KAN,21,2.05,Pass,KAN
8,1,12:21,1.0,10.0,KAN 31,0,0,Patrick Mahomes pass complete short left to No...,1.0,1.14,KAN,31,2.65,Pass,KAN
9,1,11:44,2.0,5.0,KAN 36,0,0,Patrick Mahomes pass incomplete short right,1.14,0.43,KAN,36,3.266667,Pass,KAN
10,1,11:35,3.0,5.0,KAN 36,0,0,Patrick Mahomes scrambles up the middle for 2 ...,0.43,-0.85,KAN,36,3.416667,Pass,KAN


1       0.0
2     -10.0
3       7.0
4       0.0
5       0.0
       ... 
135    -5.0
137    -3.0
139    -2.0
140     1.0
141     1.0
Name: yardline, Length: 132, dtype: float64

In [203]:
def new_yardage1(play):
    """Experimental yardage accumulator (second attempt).

    NOTE(review): this function is used with ``pbp_data.apply(..., axis=1)``
    in this notebook, i.e. ``play`` is a single row. In that case
    ``play['yardline']`` is a scalar, the ``isinstance`` check below is false
    and the function returns 0 -- which matches the all-zero output recorded
    for that call. The loop also reads a ``'Possession'`` key, while the
    column created elsewhere in the notebook is spelled ``'possession'``.
    The intended input shape is unclear -- TODO confirm before reusing.

    Returns the accumulated ``yardage`` (0 when the play type is neither
    ``'Run'`` nor ``'Pass'`` or when ``yardline`` is not a list/Series).
    """
    yardage = 0

    if play['Play_Type'] in ['Run', 'Pass']:
        yardline_values = play['yardline']

        # Only sequence-like yardline values are processed.
        if isinstance(yardline_values, (list, pd.Series)):
            # Walk consecutive (previous, current) positions.
            for i in range(1, len(yardline_values)):
                current_yardline = yardline_values.iloc[i]
                prev_yardline = yardline_values.iloc[i - 1]

                if play['Possession'].iloc[i] != play['field_side'].iloc[i]:
                    # Field side at position i differs from the possessing team
                    if play['field_side'].iloc[i] != play['field_side'].iloc[i - 1]:
                        # Field side changed between the two positions:
                        # add the distance measured across the 50
                        yardage += (50 - int(current_yardline)) + 50 - int(prev_yardline)
                    else:
                        # Same field side for both positions: raw difference
                        yardage += current_yardline - prev_yardline
                else:
                    # Field side at position i equals the possessing team: raw difference
                    yardage += current_yardline - prev_yardline

                # Extra across-the-50 term when the field side at i equals the
                # previous possession and the field side changed.
                if play['field_side'].iloc[i] == play['Possession'].iloc[i - 1] and play['field_side'].iloc[i] != play['field_side'].iloc[i - 1]:
                    yardage += (50 - int(prev_yardline)) + 50 - int(current_yardline)

    return yardage


In [204]:
# Apply new_yardage1 to each row (axis=1) and display the resulting Series.
test_column = pbp_data.apply(new_yardage1, axis=1)

test_column

1      0
2      0
3      0
4      0
5      0
      ..
135    0
137    0
139    0
140    0
141    0
Length: 132, dtype: int64