In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# matchup.txt is read as a headerless CSV; the top-left cell is kept as
# plain text (the recorded run printed "BOS vs. LAC").
matchup_file = pd.read_csv("matchup.txt", header=None)
matchup = str(matchup_file.iloc[0, 0])
print(matchup)


BOS vs. LAC


In [3]:
# The first and last space-separated tokens of the matchup text are used
# as the two team codes.
matchup_tokens = matchup.split(" ")
team1 = matchup_tokens[0]
team2 = matchup_tokens[-1]

# scrape from basketball reference; read_html returns every table on the
# page and only the first one is kept for each team
team1_stats = pd.read_html(f"https://www.basketball-reference.com/teams/{team1}/2023_games.html")[0]
team2_stats = pd.read_html(f"https://www.basketball-reference.com/teams/{team2}/2023_games.html")[0]

In [4]:
def _clean_schedule(raw_schedule):
    """Return a cleaned copy of one scraped schedule table.

    The same four steps are applied to either team's table:

    1. drop rows whose "Tm" score is null (null scores haven't happened yet),
    2. drop the columns we don't need,
    3. remove repeated header rows, which show up as games whose "Date"
       cell is the literal text "Date",
    4. rename "Unnamed: 5" -> "Home" and "Unnamed: 7" -> "Win".

    Nothing is mutated in place: every step returns a new frame, so the
    result is an independent DataFrame and later column assignments on it
    do not act on a filtered slice of another frame.
    """
    return (
        raw_schedule
        .dropna(subset=["Tm"])
        .drop(columns=["Unnamed: 3", "Unnamed: 4", "Unnamed: 8", "Notes"])
        .loc[lambda frame: frame["Date"] != "Date"]
        .rename(columns={"Unnamed: 5": "Home", "Unnamed: 7": "Win"})
    )


team1_stats = _clean_schedule(team1_stats)
team2_stats = _clean_schedule(team2_stats)

In [5]:
def home_to_binary(home):
    """Encode the Home marker as an integer flag.

    Returns 0 when ``home`` is exactly "@", and 1 for every other value.
    """
    return 0 if home == "@" else 1

def win_to_binary(win):
    """Encode the Win marker as an integer flag.

    Returns 1 when ``win`` is exactly "W", and 0 for every other value.
    """
    return 1 if win == "W" else 0

def get_day_of_week(date):
    """Return the first three characters of ``date`` (the day-of-week prefix)."""
    weekday_prefix = date[0:3]
    return weekday_prefix

# Encode the marker columns and fix the numeric dtypes, handling team1's
# frame first and team2's frame second.
for schedule in (team1_stats, team2_stats):
    schedule["Home"] = schedule["Home"].map(home_to_binary)
    schedule["Win"] = schedule["Win"].map(win_to_binary)
    schedule["Day"] = schedule["Date"].map(get_day_of_week)

    # set data types
    for count_column in ("Tm", "Opp", "W", "L"):
        schedule[count_column] = schedule[count_column].astype(int)

In [6]:
# Cut the first four characters off every Date string (the day-of-week
# prefix) and parse what remains into datetimes.
for schedule in (team1_stats, team2_stats):
    date_text = schedule["Date"].map(lambda raw_date: raw_date[4:])
    schedule["Date"] = pd.to_datetime(date_text)

In [7]:
def day_to_numeric(day):
    """Translate a three-letter weekday abbreviation into a number.

    "Mon" maps to 1 and the numbering continues through "Sun" at 7.
    Any value that is not one of the seven abbreviations yields None.
    """
    weekday_numbers = {
        "Mon": 1,
        "Tue": 2,
        "Wed": 3,
        "Thu": 4,
        "Fri": 5,
        "Sat": 6,
        "Sun": 7,
    }
    return weekday_numbers.get(day)

# Swap each weekday abbreviation in "Day" for its number, one frame at a time.
for schedule in (team1_stats, team2_stats):
    schedule["Day"] = schedule.apply(lambda game: day_to_numeric(game["Day"]), axis=1)

In [8]:
# Scrape the CBS Sports power rankings page and keep the first table on it.
ranks = pd.read_html("https://www.cbssports.com/nba/powerrankings/")[0]

# Replace the "--" placeholder in the rank-change column with 0.
ranks["Chg"] = ranks["Chg"].apply(lambda x: 0 if x == "--" else x)
ranks = ranks.rename(columns={"Unnamed: 2": "Note"})  # may incorporate sentiment analysis into model

# Columns retained in the exported per-team tables, in output order.
MODEL_COLUMNS = ["G", "Home", "W", "L", "Day", "Rk", "Chg", "Tm"]


def _add_opponent_rank(schedule):
    """Join each game to its opponent's power ranking and keep MODEL_COLUMNS.

    The opponent key is the last space-separated word of "Opponent",
    left-joined against ranks["Teams"]. Rows left with any missing value
    afterwards are dropped (they are NOT imputed).

    NOTE(review): a multi-word nickname only contributes its last word to
    the key -- confirm ranks["Teams"] is spelled the same way, otherwise
    those games are silently dropped by the dropna below.
    """
    opponent_key = schedule["Opponent"].apply(lambda x: x.split(" ")[-1])
    merged = schedule.assign(teamname=opponent_key).merge(
        ranks, how="left", left_on="teamname", right_on="Teams"
    )
    # drop rows with any null value; returning a new frame instead of using
    # inplace=True avoids mutating a column-subset slice of `merged`
    return merged[MODEL_COLUMNS].dropna()


team1_stats = _add_opponent_rank(team1_stats)
team2_stats = _add_opponent_rank(team2_stats)

# export both to csv
team1_stats.to_csv("data/team1_stats.csv", index=False)
team2_stats.to_csv("data/team2_stats.csv", index=False)

# export ranks
ranks.to_csv("data/ranks.csv", index=False)

In [9]:
# Display the scraped power-rankings table.
ranks

Unnamed: 0,Rk,Teams,Note,Chg,Rcrd
0,1,Celtics,"Oh hey, the Celtics lost a game -- in overtime...",0,21-7
1,2,Pelicans,"Even with Brandon Ingram out, the Pelican trai...",2,18-8
2,3,Bucks,The Bucks started the week off with a somewhat...,0,19-7
3,4,Grizzlies,"The Grizzlies won all four games this week, ma...",3,18-9
4,5,Cavaliers,The Cavs went 2-1 this week on the strength of...,1,17-11
5,6,Mavericks,"Don't look now, but here come the Mavs. Blowou...",13,14-13
6,7,Kings,The Kings lit the beam after wins over the Cli...,2,14-11
7,8,Suns,It was a dim week for the normally bright Suns...,6,16-11
8,9,Nuggets,"The Nuggets are grateful for Jamal Murray, who...",4,16-10
9,10,Nets,The Nets beat the Raptors and Hornets this wee...,1,17-12


In [10]:
from datetime import datetime

In [11]:
def get_date(date):
    """Strip the day-of-week prefix from a raw date and return it as text.

    Parameters
    ----------
    date : str
        Raw date text whose first four characters are a day-of-week
        prefix, e.g. ``"Tue, Oct 18, 2022"``.

    Returns
    -------
    str
        The date formatted as ``"YYYY-MM-DD"``. Note this is a string, not
        a datetime object. An empty string is returned when ``date`` is not
        a string (e.g. a NaN cell) or when the remainder cannot be parsed
        (e.g. a repeated ``"Date"`` header row).
    """
    if not isinstance(date, str):
        return ""

    # errors="coerce" turns unparseable text into NaT instead of raising.
    parsed = pd.to_datetime(date[4:], errors="coerce")
    if pd.isna(parsed):
        return ""

    # Format explicitly rather than slicing characters off str(Timestamp).
    return parsed.strftime("%Y-%m-%d")
 
    


In [66]:
# NOTE(review): hard-coded override of the team2 code that an earlier cell
# parsed from matchup.txt (the recorded matchup was "BOS vs. LAC") --
# confirm this is intentional; it makes the result depend on cell order.
team2 = "LAL"

In [67]:
# Download both schedule pages again, keeping the first table from each.
team1_stats = pd.read_html(f"https://www.basketball-reference.com/teams/{team1}/2023_games.html")[0]
team2_stats = pd.read_html(f"https://www.basketball-reference.com/teams/{team2}/2023_games.html")[0]

# Run every raw Date cell through get_date.
for schedule in (team1_stats, team2_stats):
    schedule["Date"] = schedule["Date"].map(get_date)


In [68]:
# Parse the Date text into datetimes, then discard the rows whose Date is
# null after parsing.
team1_stats = team1_stats.assign(Date=pd.to_datetime(team1_stats["Date"])).dropna(subset=["Date"])
team2_stats = team2_stats.assign(Date=pd.to_datetime(team2_stats["Date"])).dropna(subset=["Date"])

In [69]:
from datetime import datetime, timezone
import pytz

def utc_to_local(utc_dt):
    """Label ``utc_dt`` as UTC, then call ``astimezone(tz=None)`` on it.

    Any tzinfo already on ``utc_dt`` is overwritten with UTC; the clock
    fields themselves are not shifted by that step. For a standard-library
    ``datetime`` the following ``astimezone(tz=None)`` converts to the
    system's local time zone.

    NOTE(review): this notebook passes pandas Timestamps. pandas documents
    ``Timestamp.astimezone(None)`` as removing the time zone while keeping
    the UTC time, which would make this call return the input unchanged
    rather than a local time -- confirm which behaviour is intended.
    """
    labelled_utc = utc_dt.replace(tzinfo=timezone.utc)
    return labelled_utc.astimezone(tz=None)

# Pass every Date value of both frames through utc_to_local.
for schedule in (team1_stats, team2_stats):
    schedule["Date"] = schedule.apply(lambda game: utc_to_local(game["Date"]), axis=1)

In [70]:
est = pytz.timezone('US/Eastern')

# Attach the US/Eastern zone to each Date value, frame by frame.
for schedule in (team1_stats, team2_stats):
    schedule["Date"] = schedule.apply(lambda game: est.localize(game["Date"]), axis=1)

In [71]:
from datetime import date
# Keep the zone under its own name: rebinding `timezone` here would shadow
# `datetime.timezone`, which utc_to_local() looks up every time it is
# called, so any later call to it would fail.
eastern = pytz.timezone('US/Eastern')

# Midnight today in US/Eastern as a tz-aware Timestamp, so the filter
# compares datetimes with datetimes instead of relying on pandas to parse
# a "YYYY MM DD" string during the comparison.
today = pd.Timestamp.now(tz=eastern).normalize()
now = today.strftime("%Y %m %d")  # same text this cell has always left in `now`

# Keep only the games dated today.
team1_stats = team1_stats[team1_stats["Date"] == today]
team2_stats = team2_stats[team2_stats["Date"] == today]

In [72]:
# Display what is left of team1's schedule after the date filter.
team1_stats

Unnamed: 0,G,Date,Start (ET),Unnamed: 3,Unnamed: 4,Unnamed: 5,Opponent,Unnamed: 7,Unnamed: 8,Tm,Opp,W,L,Streak,Notes
29,29,2022-12-13 00:00:00-05:00,10:00p,,,@,Los Angeles Lakers,,,,,,,,


In [73]:
# merge ranks1 to team1 stats and team2 stats
# NOTE(review): `ranks1` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel unless it is
# defined elsewhere. `team1_stats1` / `team2_stats1` are also not used by
# the cells that follow, which merge against `ranks` instead -- confirm
# whether this cell can be removed.
team1_stats1 = team1_stats.merge(ranks1, how="left", left_on="Opponent", right_on="teamname")
team2_stats1 = team2_stats.merge(ranks1, how="left", left_on="Opponent", right_on="teamname")

In [74]:
# The opponent key is whatever follows the last space in "Opponent"
# (the whole string when there is no space).
for schedule in (team1_stats, team2_stats):
    schedule["opp_teamname"] = schedule["Opponent"].map(lambda full_name: full_name.rsplit(" ", 1)[-1])

In [75]:
# Left-join the rankings onto each schedule, matching opp_teamname to Teams.
rank_join = {"how": "left", "left_on": "opp_teamname", "right_on": "Teams"}
team1_stats = pd.merge(team1_stats, ranks, **rank_join)
team2_stats = pd.merge(team2_stats, ranks, **rank_join)

In [76]:
# Remove the cap on how many columns pandas shows when displaying a frame.
pd.options.display.max_columns = None

In [77]:
# Derive the Home flag from the raw "Unnamed: 5" marker column.
for schedule in (team1_stats, team2_stats):
    schedule["Home"] = schedule["Unnamed: 5"].map(home_to_binary)

In [78]:
# Display the merged rank, rank change and home flag columns for team2.
team2_stats[["Rk", "Chg", "Home"]]

Unnamed: 0,Rk,Chg,Home
0,1,0,1


In [35]:
# NOTE(review): the star import hides which names this cell takes from
# utils; get_matchup_to_predict is presumably one of them -- confirm.
# NOTE(review): this cell's execution count (35) is lower than the cells
# above it, so it was run out of order.
from utils import *

# The recorded output of this call is "KeyError: 'Date'". The function
# body is not visible here, so the cause cannot be determined from this
# notebook alone.
get_matchup_to_predict("LAL", "BOS")

KeyError: 'Date'