# Spread Web Scraper
* Scrapes spreads and game results from https://www.sportsoddshistory.com/
* Brock Ricker
* https://github.com/brock-ricker
* Created 07/01/2022

In [None]:
#import necessary modules here
import logging
import os
import time

import bs4
import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine

In [None]:
#Configuration: database name (used to build the "<db_name>.db" SQLite file) and the seasons to scrape
db_name = "NFL_spreads"
years = list(range(2017, 2022))

In [None]:
#connect to db
# Builds a SQLAlchemy engine from a SQLite URL that points at "<db_name>.db";
# echo=False is passed explicitly to create_engine.
engine = create_engine(f"sqlite:///{db_name}.db", echo=False)
#create connection to the engine
# NOTE(review): this connection is opened at module level and is not closed
# anywhere in the visible code — confirm whether a later cell closes it.
conn = engine.connect()

# Data Cleaning Functions

In [None]:
#constructs dataframe from scraped data
def df_construct(row):
    """Build a two-row DataFrame (favorite and underdog) from one scraped game row.

    Parameters
    ----------
    row : sequence
        Cells of a single game row; every cell that is read must expose a
        ``.string`` attribute. The cells used are:

        * 0, 1, 2 -- day, date and time, copied through unchanged
        * 4 -- favorite team name
        * 5 -- favorite's result followed by ``<fav score>-<dog score>``,
          optionally with an ``(OT)`` marker (illustrative: ``"W 23-20 (OT)"``)
        * 6 -- favorite's result against the spread followed by the
          favorite's line, where ``PK`` stands for a line of 0
          (illustrative: ``"L -3.5"``)
        * 8 -- underdog team name
        * 9 -- text whose last token is the over/under total

    Returns
    -------
    pandas.DataFrame
        Two rows indexed 0 and 1. Row 0 is the game from the favorite's
        perspective and row 1 from the underdog's perspective (team/opp and
        score/opp_score swapped, spread negated, ``W``/``L`` flipped in
        ``result`` and ``result_spread``). Any other result code is copied
        to both rows unchanged.

    Raises
    ------
    ValueError
        If the spread, over/under or score tokens are not numeric.
    IndexError
        If ``row`` has fewer cells than expected or a cell is empty.
    """
    # Spread cell: first token is the result against the spread, last token
    # is the favorite's line ("PK" is stored as 0).
    spread_tokens = row[6].string.split()
    fav_spread = float(spread_tokens[-1].replace("PK", "0"))
    dog_spread = fav_spread * -1
    fav_result_spread = spread_tokens[0]

    over_under = float(row[9].string.split()[-1])

    # Score cell: drop the overtime marker, then the first token is the
    # favorite's result and the last token is "<fav>-<dog>".
    score_tokens = row[5].string.replace("(OT)", "").split()
    points = score_tokens[-1].split("-")
    fav_score = int(points[0])
    dog_score = int(points[-1])
    fav_result = score_tokens[0]

    # Flip W/L for the underdog on plain strings before the frame is built.
    # The previous ``dog["result"].replace(..., inplace=True)`` was an
    # in-place call on a column selection (chained assignment), which pandas
    # warns about and which does not update the frame under Copy-on-Write.
    flip = {"W": "L", "L": "W"}
    dog_result = flip.get(fav_result, fav_result)
    dog_result_spread = flip.get(fav_result_spread, fav_result_spread)

    day = row[0].string
    game_date = row[1].string
    kickoff = row[2].string
    fav_team = row[4].string
    dog_team = row[8].string

    # Row 0 = favorite, row 1 = underdog; column order matches the table
    # layout the rest of the notebook writes to the database.
    return pd.DataFrame({
        "day": [day, day],
        "date": [game_date, game_date],
        "time": [kickoff, kickoff],
        "team": [fav_team, dog_team],
        "opp": [dog_team, fav_team],
        "score": [fav_score, dog_score],
        "opp_score": [dog_score, fav_score],
        "result": [fav_result, dog_result],
        "spread": [fav_spread, dog_spread],
        "result_spread": [fav_result_spread, dog_result_spread],
        "over_under": [over_under, over_under],
    })

In [None]:
#creates score prediction from scraped spreads
def score_preds(spreads):
    """Add predicted scores derived from the over/under and the spread.

    Writes two columns onto ``spreads`` in place and returns the same frame:
    ``predicted_score`` is half the over/under minus half the spread, and
    ``opp_predicted_score`` is half the over/under plus half the spread.
    """
    half_total = spreads["over_under"] / 2
    half_line = spreads["spread"] / 2

    spreads["predicted_score"] = half_total - half_line
    spreads["opp_predicted_score"] = half_total + half_line
    return spreads

In [None]:
#fixes names in scraped data to match data from APIs
def team_name_fixer(df):
    """Replace full team names with their short codes, in place.

    Every value in ``df`` that exactly equals one of the full team names
    below is replaced by its code; all other values are left as they are.
    The same frame is returned.
    """
    # Full team name -> short code used by the rest of the project.
    # Both Washington names map to "WAS"; Oakland and Las Vegas keep
    # separate codes.
    abbreviations = {
        "Arizona Cardinals": "ARI",
        "Atlanta Falcons": "ATL",
        "Baltimore Ravens": "BAL",
        "Buffalo Bills": "BUF",
        "Carolina Panthers": "CAR",
        "Chicago Bears": "CHI",
        "Cincinnati Bengals": "CIN",
        "Cleveland Browns": "CLE",
        "Dallas Cowboys": "DAL",
        "Denver Broncos": "DEN",
        "Detroit Lions": "DET",
        "Green Bay Packers": "GBP",
        "Houston Texans": "HOU",
        "Indianapolis Colts": "IND",
        "Jacksonville Jaguars": "JAC",
        "Kansas City Chiefs": "KCC",
        "Las Vegas Raiders": "LVR",
        "Los Angeles Chargers": "LAC",
        "Los Angeles Rams": "LAR",
        "Miami Dolphins": "MIA",
        "Minnesota Vikings": "MIN",
        "New England Patriots": "NEP",
        "New Orleans Saints": "NOS",
        "New York Giants": "NYG",
        "New York Jets": "NYJ",
        "Oakland Raiders": "OAK",
        "Philadelphia Eagles": "PHI",
        "Pittsburgh Steelers": "PIT",
        "San Francisco 49ers": "SFO",
        "Seattle Seahawks": "SEA",
        "Tampa Bay Buccaneers": "TBB",
        "Tennessee Titans": "TEN",
        "Washington Football Team": "WAS",
        "Washington Redskins": "WAS",
    }

    df.replace(to_replace=abbreviations, inplace=True)
    return df

# Scraping Function

In [None]:
#scrapes website for spreads
#creates the following table: "spreads"
def spread_scraper(years,conn):
    """Scrape regular-season spreads for each year and append them to the "spreads" table.

    For every year the season page is downloaded, each game row of every
    week table is converted with ``df_construct``, the combined frame is
    passed through ``score_preds`` and ``team_name_fixer``, and the result
    is appended to the "spreads" table via ``conn``.

    Parameters
    ----------
    years : iterable of int
        Seasons to scrape; each value is substituted into the page URL and
        into the "Week 1" heading that is searched for.
    conn
        Database connection handed straight to ``DataFrame.to_sql``.

    Raises
    ------
    requests.HTTPError
        If the season page responds with an error status.
    requests.Timeout
        If the site does not respond within the timeout.

    Notes
    -----
    A year whose page lacks the expected Week 1 heading, or yields no game
    rows, is skipped with a logged warning instead of failing part-way.
    """
    logging.info("----------------------------------------spread_scraper----------------------------------------")
    for year in years:
        logging.info(f"scraping year: {year}")
        #connect to year page
        BS_link = f"https://www.sportsoddshistory.com/nfl-game-season/?y={year}"
        # Without a timeout, requests can wait indefinitely on a stalled server.
        sauce = requests.get(BS_link, timeout=30)
        # Fail here on an error status rather than parsing an error page.
        sauce.raise_for_status()
        soup = bs4.BeautifulSoup(sauce.text, 'html.parser')

        target = soup.find("h3", string=f"{year} Regular Season - Week 1")
        if target is None:
            # find() returns None when the heading is absent.
            logging.warning("no 'Regular Season - Week 1' heading found for year %s; skipping", year)
            continue

        # All tables following the Week 1 heading; the last one is dropped
        # (treated as the playoffs table).
        week_tables = target.find_next_siblings('table')[:-1]

        # Collect the per-game frames and concatenate once, rather than
        # re-copying the accumulated frame for every game.
        games = []
        for week, week_table in enumerate(week_tables, start=1):
            # first <tr> of each week table is the header row
            for entry_row in week_table.find_all("tr")[1:]:
                game = df_construct(entry_row.find_all("td"))
                game["week"] = week
                game["year"] = year
                games.append(game)

        if not games:
            logging.warning("no game rows found for year %s; skipping", year)
            continue

        spreads = pd.concat(games, ignore_index=True)
        spreads = score_preds(spreads)
        spreads = team_name_fixer(spreads)
        spreads.to_sql("spreads", conn, if_exists="append")

In [None]:
#Call function here
# Runs the scrape for every season listed in `years` and appends the results
# to the "spreads" table through `conn` (performs network requests and
# database writes when this cell executes).
spread_scraper(years,conn)