In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
import statsmodels.api as sm
from scipy import stats
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
from datetime import datetime, timedelta
%matplotlib inline

In [2]:
# Column labels for a box-score row.  The first entry labels the row-header
# column; the scraping functions below use headings[1:] as DataFrame columns.
headings = [
    'Reserves', 'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    'PTS', '+/-',
]

In [3]:
#Function to gather the basic data for any team.
#Returns data in a dataframe object


def gatherTeamData(team_abr, season=2020):
    """Scrape one team's per-game box-score totals from basketball-reference.com.

    The team's schedule page is read to collect its box-score links, then each
    linked page is downloaded and the footer row of the table
    ``box-<team_abr>-game-basic`` is extracted.

    Parameters
    ----------
    team_abr : str
        Team abbreviation as used in the site's URLs and table ids (e.g. "DET").
    season : int, optional
        Season number used in the schedule URL.  Defaults to 2020, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game, indexed by the game date as an "MM-DD-YYYY" string,
        with ``headings[1:]`` as columns.  Cell values are the scraped text
        (strings), not numbers.

    Raises
    ------
    requests.HTTPError
        If the schedule page or a box-score page returns an error status.
    ValueError
        If a box-score page does not contain the team's basic table.
    """
    base_url = "https://www.basketball-reference.com"
    # Only games dated more than three days before "now" are used.
    # NOTE(review): the lag is presumably there so box scores are fully
    # posted before they are scraped - confirm.
    cutoff = datetime.now() - timedelta(days=3)

    # First, the schedule page.
    response = requests.get(base_url + "/teams/" + team_abr + "/" + str(season) + "_games.html")
    response.raise_for_status()
    schedule_body = BeautifulSoup(response.text, 'lxml').find("table").find("tbody")

    # Collect (date label, box-score URL) for every game before the cutoff.
    # Date and link are filtered together so they can never drift apart.
    games = []
    for row in schedule_body.find_all("tr"):
        cell = row.find("td", attrs={"data-stat": "box_score_text"})
        # Rows without a box-score cell, or with an empty one, have no link.
        if cell is None or cell.a is None or not cell.a.get("href"):
            continue
        href = cell.a.get("href")
        # The last path component of the link starts with the date as YYYYMMDD.
        game_date = datetime.strptime(href.split("/")[-1][:8], "%Y%m%d")
        if game_date < cutoff:
            games.append((game_date.strftime("%m-%d-%Y"), base_url + href))

    # Gather the team's totals row from each box-score page.
    table_id = "box-" + team_abr + "-game-basic"
    team_stats = []
    for _, link in games:
        page_response = requests.get(link)
        page_response.raise_for_status()
        page = BeautifulSoup(page_response.text, 'lxml')
        box_table = page.find("table", id=table_id)
        if box_table is None or box_table.find("tfoot") is None:
            raise ValueError("No table '" + table_id + "' with a footer found at " + link)
        totals_row = box_table.find("tfoot").find("tr")
        team_stats.append([cell.text for cell in totals_row.find_all("td")])

    # One row per game, labelled by date.
    team_stats_df = pd.DataFrame(team_stats,
                                 index=[date_label for date_label, _ in games],
                                 columns=headings[1:])
    return team_stats_df



In [4]:
#Gathering the box-score totals of the team's opponents

def gatherOpponentData(team_abr, season=2020):
    """Scrape the per-game box-score totals of a team's opponents.

    For every game on ``team_abr``'s schedule page that is dated before the
    cutoff, the box-score page is downloaded and the footer of the *other*
    team's full-game basic table (id ``box-<XXX>-game-basic`` with
    ``XXX != team_abr``) is extracted.

    Parameters
    ----------
    team_abr : str
        Team abbreviation as used in the site's URLs and table ids (e.g. "DET").
    season : int, optional
        Season number used in the schedule URL.  Defaults to 2020, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per game, indexed by the game date as an "MM-DD-YYYY" string,
        with ``headings[1:]`` as columns.  Cell values are the scraped text
        (strings), not numbers.

    Raises
    ------
    requests.HTTPError
        If the schedule page or a box-score page returns an error status.
    ValueError
        If a box-score page has no opponent full-game basic table.
    """
    base_url = "https://www.basketball-reference.com"
    # Only games dated more than three days before "now" are used.
    # NOTE(review): the lag is presumably there so box scores are fully
    # posted before they are scraped - confirm.
    cutoff = datetime.now() - timedelta(days=3)

    # First, the schedule page.
    response = requests.get(base_url + "/teams/" + team_abr + "/" + str(season) + "_games.html")
    response.raise_for_status()
    schedule_body = BeautifulSoup(response.text, 'lxml').find("table").find("tbody")

    # Collect (date label, box-score URL) for every game before the cutoff.
    games = []
    for row in schedule_body.find_all("tr"):
        cell = row.find("td", attrs={"data-stat": "box_score_text"})
        # Rows without a box-score cell, or with an empty one, have no link.
        if cell is None or cell.a is None or not cell.a.get("href"):
            continue
        href = cell.a.get("href")
        # The last path component of the link starts with the date as YYYYMMDD.
        game_date = datetime.strptime(href.split("/")[-1][:8], "%Y%m%d")
        if game_date < cutoff:
            games.append((game_date.strftime("%m-%d-%Y"), base_url + href))

    own_table_id = "box-" + team_abr + "-game-basic"
    opponent_stats = []
    for _, link in games:
        page_response = requests.get(link)
        page_response.raise_for_status()
        page = BeautifulSoup(page_response.text, 'lxml')

        # Pick the opponent's full-game basic table by its id.  Selecting it
        # positively (instead of excluding a fixed list of quarter/half tags)
        # guarantees exactly one row per game, so rows stay aligned with dates.
        opponent_table = None
        for table in page.find_all("table"):
            candidate_id = table.get("id") or ""
            if (candidate_id.startswith("box-")
                    and candidate_id.endswith("-game-basic")
                    and candidate_id != own_table_id):
                opponent_table = table
                break
        if opponent_table is None or opponent_table.find("tfoot") is None:
            raise ValueError("No opponent full-game basic table found at " + link)

        footer = opponent_table.find("tfoot")
        opponent_stats.append([cell.text for cell in footer.find_all("td")])

    opponent_stats_df = pd.DataFrame(opponent_stats,
                                     index=[date_label for date_label, _ in games],
                                     columns=headings[1:])
    return opponent_stats_df

In [5]:
#Analyze the data from constructed dataframes

def analyzeData(t1_df,t1_opp_df,t2_df,t2_opp_df):
    """Predict both teams' points and the resulting spread.

    For each team a linear regression of points ("PTS") on seven box-score
    stats is fitted on that team's own games.  The model is then evaluated at
    a single "expected" stat line: the average of the team's own mean stats
    and the mean stats the *other* team's opponents recorded.

    Parameters
    ----------
    t1_df, t2_df : pandas.DataFrame
        Per-game stats of team 1 / team 2.  Must contain the seven core stat
        columns and "PTS"; values may be numeric or numeric strings.
    t1_opp_df, t2_opp_df : pandas.DataFrame
        Per-game stats recorded by the opponents of team 1 / team 2.  Must
        contain the seven core stat columns.

    Returns
    -------
    tuple
        ``(t1_pred, t2_pred, spread)`` where ``spread = t1_pred - t2_pred``.
    """
    core_stats = ['FGA','FG%','3PA','3P%','FT%','STL','PF']

    def numeric_stats(frame):
        # Convert before any arithmetic: the scraped cells are text, and
        # taking the mean of string columns raises TypeError on pandas >= 2.0.
        return frame[core_stats].apply(pd.to_numeric)

    t1_x = numeric_stats(t1_df)
    t2_x = numeric_stats(t2_df)
    t1_opp_x = numeric_stats(t1_opp_df)
    t2_opp_x = numeric_stats(t2_opp_df)
    t1_y = pd.to_numeric(t1_df["PTS"]).to_numpy()
    t2_y = pd.to_numeric(t2_df["PTS"]).to_numpy()

    # Core inputs: each team's own averages blended 50/50 with what the other
    # team's opponents averaged against it.
    t1_core_input = ((t1_x.mean() + t2_opp_x.mean()) / 2).to_numpy()
    t2_core_input = ((t2_x.mean() + t1_opp_x.mean()) / 2).to_numpy()

    # Creating the models.  Only the 80% training split is used for fitting;
    # the held-out 20% is not evaluated here.
    t1_x_train, t1_x_test, t1_y_train, t1_y_test = train_test_split(
        t1_x.to_numpy(), t1_y, test_size = 0.2, random_state = 0)
    t1_reg = LinearRegression()
    t1_reg.fit(t1_x_train, t1_y_train)
    t2_x_train, t2_x_test, t2_y_train, t2_y_test = train_test_split(
        t2_x.to_numpy(), t2_y, test_size = 0.2, random_state = 0)
    t2_reg = LinearRegression()
    t2_reg.fit(t2_x_train, t2_y_train)

    # Creating predictions: predict() wants a 2-D array, hence the
    # (1, n_features) reshape.
    t1_pred = t1_reg.predict(t1_core_input.reshape(1,-1))[0]
    t2_pred = t2_reg.predict(t2_core_input.reshape(1,-1))[0]

    # Calculating spread
    spread = t1_pred - t2_pred
    return t1_pred, t2_pred, spread

In [6]:
#A function to call our previously defined functions 

def gatherAndAnalyze(team_1_abr,team_2_abr):
    """Scrape both teams' data, run the analysis and print the prediction.

    Fetches each team's own stats and then each team's opponents' stats,
    passes the four frames to ``analyzeData`` and prints the predicted score
    of each team followed by the spread.  Nothing is returned.
    """
    matchup = (team_1_abr, team_2_abr)
    # Own stats for both teams first, then opponent stats for both teams.
    own_frames = [gatherTeamData(abr) for abr in matchup]
    opp_frames = [gatherOpponentData(abr) for abr in matchup]

    t1_pred, t2_pred, spread = analyzeData(
        own_frames[0], opp_frames[0], own_frames[1], opp_frames[1])

    for abr, predicted_points in zip(matchup, (t1_pred, t2_pred)):
        print(abr, " will score ", round(predicted_points, 2))
    print("Resulting in a ", round(spread, 2), "spread \n")

In [7]:
# A function to automate the calling of 'gatherAndAnalyze'
# This function predicts every NBA game scheduled for the current day

def predictGamesToday(today=None):
    """Predict every game listed for one day on the league schedule page.

    Downloads the basketball-reference.com schedule page for the month of
    ``today``, finds the rows whose date text matches that day, reads the two
    team abbreviations from each row's "/teams/<ABR>/..." links and calls
    ``gatherAndAnalyze`` once per game.

    Parameters
    ----------
    today : datetime.datetime, optional
        The day to predict.  Defaults to ``datetime.now()``.

    Returns
    -------
    None
        Results are printed by ``gatherAndAnalyze``.
    """
    now = datetime.now() if today is None else today

    # Full lower-case month names, as used in the schedule URL.
    month_names = ("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november",
                   "december")
    month_name = month_names[now.month - 1]

    # The URL carries the season number, i.e. the calendar year in which the
    # season ends, so October-December games belong to the following year.
    # NOTE(review): assumes the usual October season start - confirm for
    # seasons with a shifted calendar.
    season = now.year + 1 if now.month >= 10 else now.year
    linkToThisMonthsGames = ("https://www.basketball-reference.com/leagues/NBA_"
                             + str(season) + "_games-" + month_name + ".html")

    # Getting the text from the link
    source = requests.get(linkToThisMonthsGames).text
    soup = BeautifulSoup(source,'lxml')
    schedule_body = soup.tbody
    if schedule_body is None:
        # No schedule table on the page (e.g. no games listed for this month).
        return

    # Date text to look for in each row, e.g. "Mar 1, 2020".  Built by hand so
    # it does not depend on platform-specific strftime flags.
    todayStr = month_name[:3].capitalize() + " " + str(now.day) + ", " + str(now.year)

    # Getting the list of games played today
    matchups = []
    for row in schedule_body.find_all("tr"):
        date_link = row.a  # first link in the row holds the date text
        if date_link is None or todayStr not in date_link.text:
            continue
        # Team links look like "/teams/<ABR>/<season>.html"; any other link in
        # the row (date, box score) is ignored.
        teams = [anchor["href"].split("/")[2]
                 for anchor in row.find_all("a", href=True)
                 if anchor["href"].startswith("/teams/")]
        if len(teams) == 2:
            matchups.append((teams[0], teams[1]))

    # Calling gatherAndAnalyze for all the games played today
    for team_1_abr, team_2_abr in matchups:
        gatherAndAnalyze(team_1_abr, team_2_abr)
        

In [8]:
# Run the whole pipeline for today's schedule; gatherAndAnalyze prints one
# prediction block (two scores and the spread) per game.
predictGamesToday()

DET  will score  106.08
PHI  will score  110.73
Resulting in a  -4.65 spread 

NYK  will score  111.64
ATL  will score  111.56
Resulting in a  0.07 spread 

CHO  will score  104.49
MIA  will score  114.08
Resulting in a  -9.58 spread 

DEN  will score  110.93
DAL  will score  113.7
Resulting in a  -2.78 spread 

UTA  will score  112.18
OKC  will score  110.66
Resulting in a  1.52 spread 

NOP  will score  114.82
SAC  will score  110.71
Resulting in a  4.1 spread 

