In [195]:
import os
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup

In [196]:
# Directory that is scanned for saved game pages (*.html files).
PREDICTIONS_DIR = "data/predictions"
# Presumably a team-abbreviation lookup (schema not visible here -- TODO confirm).
# NOTE(review): no other cell shown in this notebook references `team_abbrev`.
team_abbrev = pd.read_csv("teams.csv")

In [197]:
# Collect the path of every saved game page in PREDICTIONS_DIR.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# keep positional lookups (e.g. games[26]) and the row order of anything built
# from this list reproducible across machines and runs.
games = sorted(
    os.path.join(PREDICTIONS_DIR, name)
    for name in os.listdir(PREDICTIONS_DIR)
    if name.endswith(".html")
)

In [198]:
def parse_html(game, parser="html.parser", encoding="utf-8"):
    """Load a saved game page and remove sections that interfere with pandas.

    Parameters
    ----------
    game : str or path-like
        Path of the HTML file to read.
    parser : str, optional
        Parser name handed to BeautifulSoup. It is pinned explicitly so the
        parse tree does not depend on which optional parser libraries happen
        to be installed (and so bs4 does not have to guess).
    encoding : str, optional
        Text encoding used to read the file. It is passed explicitly so the
        result does not depend on the platform's locale default.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document with every element matching one of the selectors
        below removed from the tree.
    """
    # Elements dropped before the page is handed to pandas.
    unwanted_selectors = (
        "#all_team_leaders",
        "#all_last_matchups",
        "#bottom_nav",
        '[id^="all_"][id$="_injury_report"]',
        "div.media-item.logo",
    )

    with open(game, encoding=encoding) as f:
        html = f.read()

    soup = BeautifulSoup(html, parser)

    # decompose() detaches and destroys the node in place; plain loops are
    # used because the calls are made purely for that side effect.
    for selector in unwanted_selectors:
        for node in soup.select(selector):
            node.decompose()

    return soup

In [199]:
def read_team_ranks(soup):
    """Extract the per-team stat comparison from a parsed game page.

    The table with ``id="teams_ranks"`` is read with pandas and transposed so
    that every team becomes one row. Team labels come from the ``aria-label``
    attribute of ``<th>`` cells whose ``data-stat`` is not ``"name"``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page, e.g. the value returned by ``parse_html``.

    Returns
    -------
    pandas.DataFrame
        One row per team: ``Tm`` first, followed by the stat columns that are
        left after the three record columns are dropped. Parenthesised text
        is stripped from the values; no numeric conversion is performed here.

    Raises
    ------
    ValueError
        If no matching table is found (raised by ``pandas.read_html``), or if
        the number of team labels or stat columns differs from what the
        column layout below expects.
    """
    cols = ["Home Record", "Away Record", "Record vs. Spread", "Points Scored/Game", "Points Allowed/Game", "Takeaway Diff.", "Pass Yds/Game", 
            "Rush Yds/Game", "Def. Pass Yds/Game", "Def. Rush Yds/Game", "Tm"]

    def team_labels(node):
        # aria-label of every header cell that is not the stat-name header.
        return [
            th["aria-label"]
            for th in node.find_all("th", {"aria-label": True})
            if th.get("data-stat") != "name"
        ]

    # read_html is given a file-like object wrapping the serialized document.
    raw = pd.read_html(StringIO(str(soup)), attrs={"id": "teams_ranks"})[0]

    # After transposing, the first row is dropped (presumably the stat-name
    # column of the original table) so that each remaining row is one team.
    df = raw.T.iloc[1:].reset_index(drop=True)

    # Prefer header cells inside the table that was just parsed, so unrelated
    # tables elsewhere on the page cannot contribute labels. Fall back to the
    # whole document if the table itself carries none.
    table = soup.find("table", id="teams_ranks")
    teams = team_labels(table) if table is not None else []
    if not teams:
        teams = team_labels(soup)

    if len(teams) != len(df):
        raise ValueError(
            f"Found {len(teams)} team labels for {len(df)} team rows in teams_ranks"
        )
    if df.shape[1] != len(cols) - 1:
        raise ValueError(
            f"Expected {len(cols) - 1} stat columns in teams_ranks, found {df.shape[1]}"
        )

    # Column names are assigned by position; "Tm" is the column appended here.
    df["Tm"] = teams
    df.columns = cols

    # Removing record columns
    df = df.drop(columns=["Home Record", "Away Record", "Record vs. Spread"])

    # Remove rankings of teams within league
    df = df.replace(r'\(.*?\)', '', regex=True)

    # Put the team label first, keeping the other columns in their order.
    df = df[['Tm'] + [col for col in df.columns if col != 'Tm']]

    return df

In [228]:
# Spot-check the two helpers on a single saved page. The index 26 is
# hard-coded; which file it selects depends on the order of `games`.
soup = parse_html(games[26])
read_team_ranks(soup)

Unnamed: 0,Tm,Points Scored/Game,Points Allowed/Game,Takeaway Diff.,Pass Yds/Game,Rush Yds/Game,Def. Pass Yds/Game,Def. Rush Yds/Game
0,LVR,17.5,26.9,-19,221,78,208,121
1,NOR,22.1,22.3,1,210,124,242,134


In [261]:
# Build one two-row frame per saved game page: each row is a team, followed by
# the same stats for its opponent (suffixed "_opp"), plus season and date.
future_games = []

for game in games:
    soup = parse_html(game)
    team_ranks = read_team_ranks(soup)

    # The home flags and the row reversal below only make sense for two teams.
    if len(team_ranks) != 2:
        raise ValueError(
            f"Expected 2 teams in {game}, found {len(team_ranks)}"
        )

    # The first row is flagged home=0 and the second home=1.
    # NOTE(review): this assumes the page lists the away team first -- confirm.
    # assign() returns a new frame, so `team_ranks` itself is not modified.
    future_game = team_ranks.assign(home=[0, 1])

    # Reverse the rows so each team lines up with its opponent's stats.
    # NOTE(review): reset_index() without drop=True turns the old positional
    # index into an "index_opp" column. It is kept so the exported column set
    # stays the same for any downstream consumer; drop it there if unused.
    game_opp = future_game.iloc[::-1].reset_index().add_suffix("_opp")

    full_game = pd.concat([future_game, game_opp], axis=1)

    full_game["season"] = "2024"  # If available
    # The first 8 characters of the file name are parsed as a YYYYMMDD date.
    full_game["date"] = os.path.basename(game)[:8]
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

    future_games.append(full_game)

    # Print progress every 10 games
    if len(future_games) % 10 == 0:
        print(f"{len(future_games)} / {len(games)}")

10 / 48
20 / 48
30 / 48
40 / 48


In [262]:
# Display the list of per-game DataFrames collected by the loop.
future_games

[    Tm Points Scored/Game Points Allowed/Game Takeaway Diff. Pass Yds/Game  \
 0  DEN              24.0                17.6             +5           202    
 1  LAC              21.0                17.6            +11           196    
 
   Rush Yds/Game Def. Pass Yds/Game Def. Rush Yds/Game  home  index_opp  ...  \
 0          108                216                 98      0          1  ...   
 1          105                211                125      1          0  ...   
 
   Points Scored/Game_opp Points Allowed/Game_opp Takeaway Diff._opp  \
 0                  21.0                    17.6                +11    
 1                  24.0                    17.6                 +5    
 
   Pass Yds/Game_opp Rush Yds/Game_opp Def. Pass Yds/Game_opp  \
 0              196               105                    211    
 1              202               108                    216    
 
   Def. Rush Yds/Game_opp home_opp  season       date  
 0                   125         1    2024 2024-

In [263]:
# Stack the per-game frames into one table; ignore_index=True replaces the
# repeated 0/1 row labels with a fresh 0..n-1 index.
future_games_df = pd.concat(future_games, ignore_index=True)

In [264]:
# Display the combined table (two rows per game).
future_games_df

Unnamed: 0,Tm,Points Scored/Game,Points Allowed/Game,Takeaway Diff.,Pass Yds/Game,Rush Yds/Game,Def. Pass Yds/Game,Def. Rush Yds/Game,home,index_opp,...,Points Scored/Game_opp,Points Allowed/Game_opp,Takeaway Diff._opp,Pass Yds/Game_opp,Rush Yds/Game_opp,Def. Pass Yds/Game_opp,Def. Rush Yds/Game_opp,home_opp,season,date
0,DEN,24.0,17.6,+5,202,108,216,98,0,1,...,21.0,17.6,+11,196,105,211,125,1,2024,2024-12-19
1,LAC,21.0,17.6,+11,196,105,211,125,1,0,...,24.0,17.6,+5,202,108,216,98,0,2024,2024-12-19
2,HOU,23.4,21.4,+13,212,111,196,105,0,1,...,23.5,18.5,+2,224,112,215,91,1,2024,2024-12-21
3,KAN,23.5,18.5,+2,224,112,215,91,1,0,...,23.4,21.4,+13,212,111,196,105,0,2024,2024-12-21
4,PIT,24.0,18.9,+18,198,126,222,94,0,1,...,29.9,23.7,+2,245,178,258,80,1,2024,2024-12-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,LAR,22.1,24.1,+4,232,105,218,135,1,0,...,22.5,22.4,-5,240,94,213,127,0,2024,2025-01-05
92,CLE,17.1,25.4,-16,217,92,212,128,0,1,...,29.9,23.7,+2,245,178,258,80,1,2024,2025-01-05
93,BAL,29.9,23.7,+2,245,178,258,80,1,0,...,17.1,25.4,-16,217,92,212,128,0,2024,2025-01-05
94,NOR,22.1,22.3,+1,210,124,242,134,0,1,...,28.8,23.3,-2,244,144,247,109,1,2024,2025-01-05


In [266]:
# Write the combined table to disk.
# NOTE(review): the row index is written as an unnamed first column because
# index=False is not passed -- confirm downstream readers expect that.
future_games_df.to_csv("2024_nfl_prediction_games.csv")