# Initial Data Gathering and Cleaning

#### Web Scraper Documentation: https://medium.com/hardwood-convergence/intro-to-virtual-environments-and-scraping-nba-data-with-beautifulsoup-6ce745f8c26e

In [17]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

#### Initialize first year of draft data to be captured

In [18]:
# First draft year to scrape. Kept as a string because it is concatenated
# directly into the page URL and also stored as the 'Draft' column value.
year_one = '1980'

#### Web Scraper using BeautifulSoup for first year

In [19]:
# Download and parse the draft page for the first year (`year_one`).
url = 'https://www.basketball-reference.com/draft/NBA_' + year_one + '.html'
r = requests.get(url, timeout=30)
# Fail right away on an HTTP error status instead of parsing an error page and
# hitting an opaque IndexError in the header lookup below.
r.raise_for_status()
html_doc = r.text
soup = BeautifulSoup(html_doc, features='html.parser')

# Column names come from the <th> cells of the second <tr>. The first <th> is
# dropped because the data rows below are built from <td> cells only.
headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
headers = headers[1:]
# Relabel positions 13-16 as per-game stats.
# NOTE(review): presumably the page repeats the totals' names (MP/PTS/TRB/AST)
# for these columns - confirm against the live table.
headers[13:17] = ['MPG', 'PPG', 'RPG', 'APG']

# Every <tr> after the two header rows becomes one list of cell texts.
rows = soup.findAll('tr')[2:]
player_stats = [[td.getText() for td in row.findAll('td')] for row in rows]
df = pd.DataFrame(player_stats, columns=headers)
# Rows that produced no <td> cells are all-None after construction; drop them.
df = df.dropna(subset=['Pk', 'Tm'])

# Tag every remaining row with its draft year (scalar broadcast replaces the
# hand-built list of repeated values).
df['Draft'] = year_one

#### Web Scraper to iterate through all subsequent draft years

In [20]:
# Scrape draft years 1981-2010 and combine them with the first-year frame `df`.
# Frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
# NOTE: `df` is used as the starting frame, so re-running this cell without
# re-running the first-year cell adds these years a second time.
yearly_frames = [df]
for year in range(1981, 2011):
    url1 = 'https://www.basketball-reference.com/draft/NBA_' + str(year) + '.html'
    r1 = requests.get(url1, timeout=30)
    # Surface HTTP errors here rather than as an IndexError while parsing.
    r1.raise_for_status()
    html_doc1 = r1.text
    soup1 = BeautifulSoup(html_doc1, features='html.parser')

    # Column names: <th> cells of the second <tr>, minus the first one, with
    # positions 13-16 relabelled as per-game stats (same handling as year one).
    headers1 = [th.getText() for th in soup1.findAll('tr', limit=2)[1].findAll('th')]
    headers1 = headers1[1:]
    headers1[13:17] = ['MPG', 'PPG', 'RPG', 'APG']

    # One list of <td> texts per table row after the two header rows.
    rows1 = soup1.findAll('tr')[2:]
    player_stats1 = [[td.getText() for td in row.findAll('td')] for row in rows1]
    temp = pd.DataFrame(player_stats1, columns=headers1)
    # Rows without <td> cells come out as all-None; drop them.
    temp = temp.dropna(subset=['Pk', 'Tm'])

    # Draft year is stored as a string, matching the first-year frame.
    temp['Draft'] = str(year)
    yearly_frames.append(temp)

df = pd.concat(yearly_frames, ignore_index=True)

#### Dataframe Info

In [21]:
# Print a summary of the combined frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Pk       2938 non-null   object
 1   Tm       2938 non-null   object
 2   Player   2938 non-null   object
 3   College  2938 non-null   object
 4   Yrs      2938 non-null   object
 5   G        2938 non-null   object
 6   MP       2938 non-null   object
 7   PTS      2938 non-null   object
 8   TRB      2938 non-null   object
 9   AST      2938 non-null   object
 10  FG%      2938 non-null   object
 11  3P%      2938 non-null   object
 12  FT%      2938 non-null   object
 13  MPG      2938 non-null   object
 14  PPG      2938 non-null   object
 15  RPG      2938 non-null   object
 16  APG      2938 non-null   object
 17  WS       2938 non-null   object
 18  WS/48    2938 non-null   object
 19  BPM      2938 non-null   object
 20  VORP     2938 non-null   object
 21  Draft    2938 non-null   object
dtype

#### Handling null values and converting data types

In [22]:
# Every scraped column except the text ones is parsed as a number; values that
# cannot be parsed become NaN (errors='coerce') and are then replaced with 0.
text_columns = ['Tm', 'Player', 'College']
for column in (name for name in headers if name not in text_columns):
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0)

#### Explore and adjust team names to account for location changes

In [23]:
# Count how many rows each team abbreviation has.
df['Tm'].value_counts()

CHI    134
POR    134
DAL    129
PHO    127
GSW    126
PHI    125
ATL    122
SEA    116
SAS    116
IND    115
MIL    114
UTA    114
CLE    114
BOS    113
DEN    112
DET    111
NJN    108
NYK    103
HOU     99
WSB     99
LAL     92
LAC     77
SAC     71
MIN     54
KCK     48
MIA     48
ORL     43
SDC     30
TOR     26
WAS     26
CHH     24
MEM     21
CHA     14
VAN     13
NOH      9
OKC      6
NOK      5
Name: Tm, dtype: int64

In [24]:
# Mapping of team abbreviation found in the scraped data -> abbreviation to
# store instead. Abbreviations not listed here are left unchanged by replace().
alt_teams = {
    'SEA': 'OKC',
    'VAN': 'MEM',
    'WSB': 'WAS',
    'CHH': 'CHA',
    'NOK': 'NOH',
    'NJN': 'BKN',
    'SDC': 'LAC',
    'KCK': 'SAC',
}
normalized_teams = df['Tm'].replace(alt_teams)
df['Tm'] = normalized_teams

In [25]:
# Write the cleaned frame to 'draftdata.csv' in the working directory as UTF-8,
# omitting the row index.
df.to_csv('draftdata.csv', index=False, encoding= 'utf-8')