# Part 1 - Libraries

In [1]:
#pip install yfinance
#pip install matplotlib pendulum
# pip.exe install selenium in anaconda terminal

import time
start_time = time.time() # Wall-clock timestamp (seconds since the epoch) captured when this cell starts running

# Pulling Data
import yfinance as yf
import pandas as pd
import pendulum
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import datetime
from urllib.request import urlopen
import re
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.select import Select
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from time import sleep
import string
import unidecode

# Begin with requests' default headers and swap in a desktop-browser User-Agent.
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'

# Chrome launch switches collected on a Selenium Options object.
chrome_options = Options()
for _chrome_flag in (
    "--headless",               # run Chrome without opening a visible window
    "--no-sandbox",             # turn off Chrome's process sandbox
    "--disable-dev-shm-usage",  # avoid relying on /dev/shm for shared memory
):
    chrome_options.add_argument(_chrome_flag)


# NOTE(review): presumably the ChromeDriver binary handed to webdriver.Chrome;
# it is not used in the cells visible here - confirm before removing.
path = "chromedriver.exe"

# Math
import math
import numpy as np
import warnings

# Data Management
import gc

# Wrapping
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split

# Modeling
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

# For business days
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
# Date offset that steps over weekends and the dates in pandas' US federal
# holiday calendar.
_federal_holidays = USFederalHolidayCalendar()
US_BUSINESS_DAY = CustomBusinessDay(calendar=_federal_holidays)


# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings(action="ignore")

# Part 2 - Get Data (ESPN)

## Part 2.1 - Standings

In [2]:
# Scrape league-wide standings from ESPN for each season and collect one row
# per team: (Season, Player_Team, Team_Record).
abbreviations = []
records = []
years = []

# Translates the abbreviation text found on the ESPN page into the team code
# stored in `Player_Team`. Several keys map to the same value (for example
# both 'SEA' and 'OKC' become 'OKC'). An abbreviation missing from this
# mapping raises KeyError inside the loop below.
abbreviation_dict = {
 'SA': 'SAS',
 'DAL': 'DAL',
 'SAC': 'SAC',
 'MIN': 'MIN',
 'DET': 'DET',
 'LAL': 'LAL',
 'POR': 'POR',
 'NJ': 'BKN',
 'IND': 'IND',
 'PHI': 'PHI',
 'NO': 'NOP',
 'UTAH': 'UTA',
 'BOS': 'BOS',
 'PHX': 'PHX',
 'HOU': 'HOU',
 'MIL': 'MIL',
 'ORL': 'ORL',
 'SEA': 'OKC',
 'GS': 'GSW',
 'WSH': 'WAS',
 'NY': 'NYK',
 'ATL': 'ATL',
 'CHI': 'CHI',
 'MEM': 'MEM',
 'LAC': 'LAC',
 'MIA': 'MIA',
 'TOR': 'TOR',
 'DEN': 'DEN',
 'CLE': 'CLE',
 'CHA': 'CHA',
 'OKC': 'OKC',
 'BKN': 'BKN'
}

# Without a timeout, requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

for year in range(2003, 2023):  # seasons 2003 through 2022 inclusive
    url = f'https://www.espn.com/nba/standings/_/season/{year}/group/league'
    page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail on HTTP errors here rather than while parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "lxml")

    # The page is read as two parallel tables: the first <tbody> holds the
    # team names, the second holds the matching statistics rows.
    tables = soup.find_all('tbody')
    if len(tables) < 2:
        raise ValueError(
            f'Expected at least 2 <tbody> tables for season {year}, found {len(tables)}'
        )
    team_rows = tables[0].find_all('tr')
    record_rows = tables[1].find_all('tr')

    # The two tables are matched by position, so a row-count mismatch would
    # silently attach records to the wrong teams.
    if len(team_rows) != len(record_rows):
        raise ValueError(
            f'Season {year}: {len(team_rows)} team rows but {len(record_rows)} record rows'
        )

    for team_row, record_row in zip(team_rows, record_rows):
        team = team_row.find('abbr').get_text()
        abbreviations.append(abbreviation_dict[team])
        # Third cell of the statistics row; presumably the win-percentage
        # column - verify against the page layout if ESPN changes it.
        record = float(record_row.find_all('td')[2].get_text())
        records.append(record)
        years.append(year)

record_df = pd.DataFrame({'Season': years, 'Player_Team': abbreviations, 'Team_Record': records})

## Part 2.2 - This Year's Estimates

In [3]:
# Season label and schedule length shared by both conference tables.
PREDICTION_SEASON = 2023
GAMES_PER_SEASON = 82


def _build_conference_predictions(win_guesses, season=PREDICTION_SEASON, games=GAMES_PER_SEASON):
    """Turn a ``{team: predicted wins}`` mapping into a predictions frame.

    Parameters
    ----------
    win_guesses : dict
        Team code -> predicted number of wins. Row order follows the
        mapping's insertion order.
    season : int
        Value written to every row of the ``Season`` column.
    games : int
        Games per team; used to derive losses and the win fraction.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Player_Team``, ``Team_Wins``, ``Season``,
        ``Team_Losses`` (``games - wins``) and ``Team_Record``
        (``wins / games`` rounded to 3 decimals).
    """
    base = pd.DataFrame(list(win_guesses.items()), columns=['Player_Team', 'Team_Wins'])
    # assign() adds the columns in keyword order, which keeps the original layout.
    return base.assign(
        Season=season,
        Team_Losses=lambda df: games - df['Team_Wins'],
        Team_Record=lambda df: (df['Team_Wins'] / games).round(3),
    )


# Hand-entered win totals, listed from best to worst; the blank lines split
# the list into groups of 8, 2 and 5 teams.
personal_record_guess_west = {
 'PHX': 59,
 'GSW': 54,
 'MEM': 53,
 'DAL': 51,
 'DEN': 49,
 'LAC': 48,
 'MIN': 43,
 'NOP': 42,

 'LAL': 41,
 'SAC': 37,

 'POR': 34,
 'OKC': 31,
 'HOU': 28,
 'SAS': 24,
 'UTA': 21,
}

west_df = _build_conference_predictions(personal_record_guess_west)

personal_record_guess_east = {
 'BOS': 54,
 'PHI': 52,
 'MIA': 51,
 'MIL': 51,
 'ATL': 48,
 'BKN': 48,
 'CLE': 45,
 'CHI': 43,

 'TOR': 43,
 'NYK': 38,

 'CHA': 37,
 'WAS': 32,
 'IND': 26,
 'ORL': 24,
 'DET': 23
}

east_df = _build_conference_predictions(personal_record_guess_east)

# Part 3 - Export Both Data Frames

In [20]:
# 1 - All Records for Award Predictions
# Stack both conferences' 2023 guesses, renumber the rows from 0, and keep
# only the three columns that record_df also has.
all_team_records_2023 = pd.concat([west_df, east_df], ignore_index=True)[
    ['Season', 'Player_Team', 'Team_Record']
]

# Scraped seasons first, then the 2023 guesses. Each frame keeps its own row
# labels here, and to_csv writes those labels as the first CSV column.
all_team_records_alltime = pd.concat([record_df, all_team_records_2023]).rename(
    columns={'Player_Team': 'Team', 'Team_Record': 'Record'}
)
all_team_records_alltime.to_csv('v1_alltime_team_predictions.csv')

In [17]:
# 2 - West Predictions
# Seeds follow the row order of west_df (first row = seed 1). The range is
# derived from the frame length so adding or removing a team from the guess
# dictionary does not raise a length-mismatch ValueError.
west_df['Seed'] = range(1, len(west_df) + 1)

# Build the export frame with .rename so the renamed columns live on a new
# object rather than on a column slice of west_df.
west_df_ordered = west_df[['Seed', 'Player_Team', 'Team_Wins', 'Team_Losses']].rename(
    columns={'Player_Team': 'Team', 'Team_Wins': 'Wins', 'Team_Losses': 'Losses'}
)
west_df_ordered.to_csv('v1_2023_west_predictions.csv')

In [18]:
# 3 - East Predictions
# Seeds follow the row order of east_df (first row = seed 1). The range is
# derived from the frame length so adding or removing a team from the guess
# dictionary does not raise a length-mismatch ValueError.
east_df['Seed'] = range(1, len(east_df) + 1)

# Build the export frame with .rename so the renamed columns live on a new
# object rather than on a column slice of east_df.
east_df_ordered = east_df[['Seed', 'Player_Team', 'Team_Wins', 'Team_Losses']].rename(
    columns={'Player_Team': 'Team', 'Team_Wins': 'Wins', 'Team_Losses': 'Losses'}
)
east_df_ordered.to_csv('v1_2023_east_predictions.csv')