In [49]:
# Imports for scraping and storing as a data frame
import urllib2
import requests
import pandas as pd
import numpy as np

In [50]:
"""
A function that creates a list of all the years to scrape 
from dougstats.com.
"""
def get_years(start=99, stop=17):
    years = []
    end = (start + 1) % 100
    
    print "Filling years:"
    while start != stop:
        interval = time_interval(start, end)
        print interval
        years.append(interval)
        start = end
        end = (end + 1) % 100

    print ""
    return years

"""
Helper function to output the correct time interval
given a start time and and end time.
"""
def time_interval(start, end):
    if start == 99:
        return "99-00"
    elif start == 0:
        return "00-01"
    else:
        s, e = str(start), str(end)
        if len(s) == 1:
            s = "0" + s
        if len(e) == 1:
            e = "0" + e
        return s + "-" + e

"""
A function that takes the dougstats data and builds a pandas data frame
from it.
"""
def build_df(data):
    header = None
    lines = []
    for line in data.readlines():
        if not header:
            header = line.split() + ["YEAR"]
        else:
            stats = line.split()
            name = stats[0].split(",")
            try:
                stats[0] = str.upper(name[0] + ", " + name[1][0:3])
            except:
                stats[0] = str.upper(name[0])
            lines.append(stats + [year])
    return pd.DataFrame(lines, columns = header) 

In [51]:
years = get_years()
season_dfs = [] # A list of data frames for each year

for year in years:
    url = "http://www.dougstats.com/{}RD.txt".format(year)
    data = urllib2.urlopen(url)
    season_dfs.append(build_df(data))

df = pd.concat(season_dfs) # Join all the data frames together

Filling years:
99-00
00-01
01-02
02-03
03-04
04-05
05-06
06-07
07-08
08-09
09-10
10-11
11-12
12-13
13-14
14-15
15-16
16-17



In [55]:
# Now we can take a look at our data and see what needs to be cleaned.
df

Unnamed: 0,+/-,3A,3M,AS,BK,DQ,EJ,FF,FGA,FGM,...,PS,PTS,Player,ST,Sta,TC,TO,TR,Team,YEAR
0,,23,3,98,28,1,0,0,646,274,...,SG,697,"ABDUL-WAHAD, TAR",59,56,0,106,291,den,99-00
1,,96,29,271,87,3,0,0,1277,594,...,SF,1663,"ABDUR-RAHIM, SHA",89,82,10,249,825,van,99-00
2,,35,9,58,2,0,0,0,98,28,...,PG,82,"ALEXANDER, COR",24,2,0,28,42,den,99-00
3,,407,172,308,19,1,0,0,1411,642,...,SG,1809,"ALLEN, RAY",110,82,2,183,359,mil,99-00
4,,14,3,70,0,0,0,0,95,27,...,PG,60,"ALSTON, RAF",12,0,0,29,23,mil,99-00
5,,6,1,95,37,1,0,0,700,306,...,C,836,"AMAECHI, JOH",35,53,0,139,266,orl,99-00
6,,178,55,220,11,2,0,0,860,377,...,SG,1080,"ANDERSON, DER",90,58,0,167,258,lac,99-00
7,,220,85,420,8,4,0,0,986,434,...,PG,1149,"ANDERSON, KEN",139,82,11,130,225,bos,99-00
8,,397,132,123,16,0,0,0,782,306,...,SG,781,"ANDERSON, NIC",94,72,5,95,339,sac,99-00
9,,225,79,239,32,0,0,0,778,368,...,SG,1009,"ANDERSON, SHA",96,82,0,194,384,hou,99-00


In [52]:
from bs4 import BeautifulSoup

all_star_players = []

print "Scraping all star data from basketball.realgm.com"
for year in range(2000, 2017):
    url = "http://basketball.realgm.com/nba/allstar/game/rosters/{}".format(str(year))
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "lxml")
    tables = soup.find_all("table")
    print year

    for table in tables:
        rows = table.find_all('tr')[1:]
        for row in rows:
            cols = row.find_all('td')
            name = cols[0].get_text().split()
            name = name[1] + ", " + name[0][0:3]
            start = str((year - 1) % 100)
            end = str(year % 100)
            
            if len(start) == 1:
                start = "0" + start
            
            if len(end) == 1:
                end = "0" + end
                
            yr = start + "-" + end
            all_star_players.append([str.upper(str(name)), yr, 1])

print ""
all_star_players = pd.DataFrame(all_star_players, columns = ["Player", "YEAR", "AS?"]) 

Scraping all star data from basketball.realgm.com
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016



In [54]:
all_star_players

Unnamed: 0,Player,YEAR,AS?
0,"BRYANT, KOB",99-00,1
1,"DUNCAN, TIM",99-00,1
2,"GARNETT, KEV",99-00,1
3,"KIDD, JAS",99-00,1
4,"O'NEAL, SHA",99-00,1
5,"MALONE, KAR",99-00,1
6,"FINLEY, MIC",99-00,1
7,"PAYTON, GAR",99-00,1
8,"WALLACE, RAS",99-00,1
9,"WEBBER, CHR",99-00,1


In [56]:
master_df = pd.merge(df, all_star_players, how='left', on=["Player", "YEAR"])

In [57]:
master_df.columns

Index([u'+/-', u'3A', u'3M', u'AS', u'BK', u'DQ', u'EJ', u'FF', u'FGA', u'FGM',
       u'FTA', u'FTM', u'GP', u'Min', u'OR', u'PF', u'PS', u'PTS', u'Player',
       u'ST', u'Sta', u'TC', u'TO', u'TR', u'Team', u'YEAR', u'AS?'],
      dtype='object')

In [58]:
team_dfs = []
print "Scraping team data from dougstats.com:"
for year in years:
    print year
    url = "http://www.dougstats.com/{}RD.Team.txt".format(year)
    data = urllib2.urlopen(url)
    team_dfs.append(build_df(data))

records_df = pd.concat(team_dfs) # Join all the data frames together

Scraping team data from dougstats.com:
99-00
00-01
01-02
02-03
03-04
04-05
05-06
06-07
07-08
08-09
09-10
10-11
11-12
12-13
13-14
14-15
15-16
16-17


In [59]:
records_df["Team"] = records_df["team"].map(lambda x: str.lower(x[0:3]))
records_df["win_pct"] = pd.to_numeric(pd.to_numeric(records_df["won"]) / 82.0)
records_df = records_df[["Team", "win_pct", "YEAR"]]
records_df

Unnamed: 0,Team,win_pct,YEAR
0,atl,0.341463,99-00
1,bos,0.426829,99-00
2,cha,0.597561,99-00
3,chi,0.207317,99-00
4,cle,0.390244,99-00
5,dal,0.487805,99-00
6,den,0.426829,99-00
7,det,0.512195,99-00
8,gsw,0.231707,99-00
9,hou,0.414634,99-00


In [60]:
master_df = pd.merge(master_df, records_df, how='left', on=["Team", "YEAR"])

In [61]:
master_df['AS?'] = master_df['AS?'].fillna(0)
del master_df['+/-']

In [62]:
good_columns = ['GP', '3A', '3M', 'AS', 'BK', 'FGA', 'FGM', 'FTM', 'FTA', 'OR', 'PF', 'PTS', 'ST', 'Sta', 'TO', 'TR']
master_df[good_columns] = master_df[good_columns].apply(pd.to_numeric)
good_columns.append('win_pct')

In [63]:
master_df['3A'] /= master_df['GP']
master_df['3M'] /= master_df['GP']
master_df['AS'] /= master_df['GP']
master_df['BK'] /= master_df['GP']
master_df['FGA'] /= master_df['GP']
master_df['FGM'] /= master_df['GP']
master_df['FTM'] /= master_df['GP']
master_df['FTA'] /= master_df['GP']
master_df['OR'] /= master_df['GP']
master_df['PF'] /= master_df['GP']
master_df['PTS'] /= master_df['GP']
master_df['ST'] /= master_df['GP']
master_df['Sta'] /= master_df['GP']
master_df['TO'] /= master_df['GP']
master_df['TR'] /= master_df['GP']

In [64]:
master_df['win_pct'] = np.nan_to_num(master_df['win_pct'])

In [65]:
target = "AS?"

In [66]:
from sklearn.cross_validation import train_test_split

In [68]:
old_data = master_df[master_df["YEAR"] != "16-17"]
new_data = master_df[master_df["YEAR"] == "16-17"]

train = old_data.sample(frac=0.8, random_state=1)
test = old_data.loc[~old_data.index.isin(train.index)]

In [69]:
print(train.shape)
print(test.shape)

(6204, 27)
(1551, 27)


In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

model = LinearRegression()
# Fit the model to the training data.
model.fit(train[good_columns], train[target])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [71]:
# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error

# Generate our predictions for the test set.
predictions = model.predict(test[good_columns])

# Compute error between our test predictions and the actual values.
error = mean_squared_error(predictions, test[target])

print "ACCURACY: " + str(1 - error)

ACCURACY: 0.96835814773


In [76]:
new_data = new_data.reset_index()
arr = model.predict(new_data[good_columns])
top10 = arr.argsort()[-30:][::-1]

for i in top10:
    print new_data.ix[i]["Player"]

DAVIS, ANT
WESTBROOK, RUS
DURANT, KEV
HARDEN, JAM
JAMES, LEB
ANTETOKOUNMPO, GI
BUTLER, JIM
DEROZAN, DEM
THOMAS, ISA
COUSINS, DEM
LEONARD, KAW
LILLARD, DAM
HAYWARD, GOR
WALL, JOH
GRIFFIN, BLA
LOVE, KEV
GOBERT, RUD
GASOL, MAR
WALKER, KEM
IRVING, KYR
PAUL, CHR
CURRY, STE
LOWRY, KYL
WHITESIDE, HAS
HILL, GEO
WILLIAMS, LOU
JORDAN, DEA
CONLEY, MIK
TOWNS, KAR
GALLINARI, DAN


In [77]:
master_df.to_csv('player_data.csv')

In [78]:
new_data

Unnamed: 0,level_0,index,3A,3M,AS,BK,DQ,EJ,FF,FGA,...,Player,ST,Sta,TC,TO,TR,Team,YEAR,AS?,win_pct
0,0,7755,3.655172,1.310345,0.379310,0.103448,0,0,0,4.965517,...,"ABRINES, ALE",0.448276,0.000000,0,0.586207,1.172414,okl,16-17,0.0,0.256098
1,1,7756,1.333333,0.166667,0.000000,0.000000,0,0,0,2.833333,...,"ACY, QUI",0.000000,0.000000,0,0.333333,1.333333,dal,16-17,0.0,0.134146
2,2,7757,0.027778,0.000000,1.027778,0.944444,0,0,0,8.194444,...,"ADAMS, STE",1.250000,1.000000,2,1.916667,7.750000,okl,16-17,0.0,0.256098
3,3,7758,2.142857,0.785714,1.107143,0.035714,0,0,0,6.607143,...,"AFFLALO, ARR",0.285714,0.642857,0,0.500000,2.142857,sac,16-17,0.0,0.182927
4,4,7759,0.150000,0.000000,0.350000,0.700000,0,0,0,4.300000,...,"AJINCA, ALE",0.350000,0.500000,1,0.800000,4.500000,nor,16-17,0.0,0.170732
5,5,7760,0.000000,0.000000,0.516129,0.580645,1,0,0,1.935484,...,"ALDRICH, COL",0.612903,0.000000,0,0.225806,3.645161,min,16-17,0.0,0.134146
6,6,7761,0.781250,0.375000,2.093750,1.031250,0,0,0,13.812500,...,"ALDRIDGE, LAM",0.687500,1.000000,0,1.531250,7.250000,san,16-17,0.0,0.341463
7,7,7762,0.035714,0.000000,1.000000,0.285714,0,0,0,2.107143,...,"ALLEN, LAV",0.250000,0.000000,0,0.285714,2.500000,ind,16-17,0.0,0.219512
8,8,7763,0.612903,0.193548,1.032258,0.483871,0,0,0,8.580645,...,"ALLEN, TON",1.709677,0.935484,1,1.419355,5.419355,mem,16-17,0.0,0.268293
9,9,7764,3.526316,0.947368,2.157895,0.473684,1,0,0,6.842105,...,"AMINU, AL-",1.157895,0.842105,0,1.473684,6.157895,por,16-17,0.0,0.182927


In [39]:
new_df.columns

Index([u'Player', u'Team', u'PS', u'GP', u'Min', u'FGM', u'FGA', u'3M', u'3A',
       u'FTM', u'FTA', u'OR', u'TR', u'AS', u'ST', u'TO', u'BK', u'PF', u'DQ',
       u'PTS', u'TC', u'EJ', u'FF', u'Sta', u'+/-', u'YEAR', u'team', u'won',
       u'lost', u'min', u'fgm', u'fga', u'3m', u'3a', u'ftm', u'fta', u'or',
       u'tr', u'as', u'st', u'to', u'bk', u'pf', u'pts', u'tc', u'ej', u'ff',
       u'win_pct'],
      dtype='object')