# Gathering Data

In [20]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

# NBA season we will be analyzing
year = 2020

# URL page we will scraping
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)

# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html, features="lxml")

# use findALL() to get the column headers
soup.findAll('tr', limit=2)

# use getText()to extract the text we need into a list
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

# exclude the first column as we will not need the
# ranking order from Basketball Reference for the analysis
headers = headers[1:]
#print(headers)

##Header Meaning
#Player = Name of Player
#Pos = Position
#Age = Age
#Tm = Team
#G = Games Played
#GS = Games Started
#MP = Minutes Played Per Game
#FG = Field Goals Per Game
#FGA = Field Goal Attempts Per Game
#FG% = Field Goal Percentage
#3P = 3-Point Field Goals Per Game
#3PA = 3-Point Field Goal Attempts Per Game
#3P% = 3-Point Field Goal Percentage
#2P = 2-Point Field Goals Per Game
#2PA = 2-Point Field Goal Attempts Per Game
#2P% = 2-Point Field Goal Percentage
#eFG% = Effective Field Goal Percentage
#FT = Free Throws Per Game
#FTA = Free Throw Attempts Per Game
#FT% = Free Throw Percentage
#ORB = Offensive Rebounds Per Game
#DRB = Defensive Rebounds Per Game
#TRB = Total Rebounds Per Game
#AST = Assists Per Game
#BLK = Blocks Per Game
#TOV = Turnover Per Game
#PF = Personal Fouls Per Game
#PTS = Points Per Game


# avoid the first header row
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

stats = pd.DataFrame(player_stats, columns = headers)

stats.to_csv("stats.csv")



['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']


# Cleaning Data

In [105]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns

data = pd.read_csv('stats.csv')

del data['Unnamed: 0']

# Players that have gotten traded are placed in a different row with stats from their time on that specific team

TradedPlayers = []

# Placing players that were traded in a list called TradedPlayers
for i in range(len(data) - 1):
    temp = data.loc[i+1].Player
    if (temp == data.loc[i].Player) & (temp not in TradedPlayers):
        TradedPlayers.append(temp)

Nontraded_DF = data[data["Player"].isin(TradedPlayers) == False]

Traded_DF = data[data["Player"].isin(TradedPlayers)]

TradedTot_DF = Traded_DF[Traded_DF['Tm'] == 'TOT']

Combined_DF = Nontraded_DF.append(TradedTot_DF).reset_index()

del Combined_DF['index']

Combined_DF = Combined_DF.fillna(0)

# Column for Average Fantasy Points calculated based on columns based on different weights placed on different statline
# The weights I have put on each category is based on my own personal standing of each category

# Each Fantasy League can have their own weights on each category, I tried to make the weights as balanced as possible
# For example, power forwards and centers are able to get more blocks than the other positions while guards get more than steals
# than other positions

Combined_DF['AFP'] = 3*Combined_DF['3P'] + 2*Combined_DF['2P'] - Combined_DF['3PA'] - 0.5*Combined_DF['2PA'] 
+ Combined_DF['FT'] - 0.25*Combined_DF['FTA'] +  2*Combined_DF['ORB'] + Combined_DF['DRB'] + 2*Combined_DF['AST'] 
+ 4*Combined_DF['STL'] + 4*Combined_DF['BLK'] - 2.5*Combined_DF['TOV'] - Combined_DF['PF']

X = Combined_DF[['MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 
                 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]

y = Combined_DF['AFP']

reg = LinearRegression().fit(X,y)

coef = reg.coef_

# https://aegis4048.github.io/mutiple_linear_regression_and_visualization_in_python

[ 5.03470153e-16 -6.21827646e-15 -2.27438139e-15  1.63408148e-14
  3.00000000e+00 -1.00000000e+00  2.21332783e-15  2.00000000e+00
 -5.00000000e-01  9.81702919e-16 -1.26395256e-14 -8.78456225e-16
  1.07578478e-15 -3.31002773e-16 -1.49759406e-15 -1.45846938e-15
 -1.96723498e-16 -1.41836246e-15  7.42948822e-16  4.13983415e-16
 -2.41672643e-16  1.72748263e-16  1.86878808e-15]


# Data Visualization

In [None]:
# https://aegis4048.github.io/mutiple_linear_regression_and_visualization_in_python

