In [1]:
from bs4 import BeautifulSoup
import requests
import re
import sys
import pandas as pd
from sklearn import linear_model

In [2]:
def get_player_urls():
    """Collect the link of every player listed on the 2kmtcentral index.

    Requests the paginated player index starting at page 0 and keeps going
    until a page contains no ``<a class="name" href=...>`` elements.

    Returns:
        list: the ``href`` of each player link, UTF-8 encoded, in the order
        the index pages list them.
    """
    base_url = "http://www.2kmtcentral.com/17/players/page/"
    collected = []
    page = 0
    while True:
        markup = requests.get(base_url + str(page)).text
        links = BeautifulSoup(markup, "html.parser").find_all("a", class_ = "name", href = True)
        if not links:
            # An index page without player links marks the end of the listing.
            break
        # Encode to UTF-8 because Nene has a circumflex in his name
        collected.extend(anchor["href"].encode("utf-8") for anchor in links)
        page += 1
    return collected

In [3]:
def get_player_data(player_urls):
    """Scrape each player page and return every rating in one DataFrame.

    For each URL the page is downloaded and parsed into a dict holding the
    card id (sixth "/"-separated piece of the URL), name, primary and
    optional secondary position, height in inches, overall rating and one
    entry per element of class ``attribute`` inside ``container-attributes``.
    A page that raises while being fetched or parsed is reported on stdout
    (message, then line number) and skipped.

    Args:
        player_urls: sequence of player page URLs, e.g. the list returned by
            get_player_urls().

    Returns:
        pandas.DataFrame: one row per successfully parsed player. Columns are
        put in the fixed order listed in ``column_order``; any column not in
        that list is kept after them.

    Raises:
        KeyError: if one of the expected columns was never scraped.
    """
    # Explicit column order. This replaces a positional permutation that only
    # produced this order when pandas sorted the dict keys alphabetically.
    column_order = [
        "_id", "Name", "Position", "Secondary position", "Height", "Overall",
        "Open_shot_mid", "Contested_shot_mid", "Off_dribble_shot_mid",
        "Open_shot_3pt", "Contested_shot_3pt", "Off_dribble_shot_3pt",
        "Shot_IQ", "Free_throw", "Offensive_consistency", "Shot_close",
        "Standing_layup", "Driving_layup", "Standing_dunk", "Driving_dunk",
        "Contact_dunk", "Draw_foul", "Post_control", "Post_hook",
        "Post_fadeaway", "Hands", "Ball_control", "Passing_accuracy",
        "Passing_vision", "Passing_IQ", "Speed_with_ball", "Speed",
        "Acceleration", "Vertical", "Strength", "Stamina", "Hustle",
        "Overall_durability", "On-ball_defense_IQ", "Low_post_defense_IQ",
        "Pick_&_roll_defense_IQ", "Help_defense_IQ", "Lateral_quickness",
        "Pass_perception", "Reaction_time", "Steal", "Block", "Shot_contest",
        "Defensive_consistency", "Offensive_rebound", "Defensive_rebound",
        "Boxout", "Potential", "Intangibles",
    ]
    # Height as printed on the page, e.g. 6'10"
    height_pattern = re.compile(r"[5-7]\'1?[0-9]\"")
    # 2kmtcentral has some hidden stuff when you shrink the screen
    # Match def. or off. or uppercase versions
    side_prefix = r"([Dd]ef|[Oo]ff)\."

    total_players = str(len(player_urls))
    player_obj_list = []
    for i, link in enumerate(player_urls):
        try:
            html = requests.get(link).text
            soup = BeautifulSoup(html, "html.parser")
            player_obj = {}
            player_obj["_id"] = int(link.split("/")[5])
            player_obj["Name"] = str(soup.find("table").find("td").text.encode("UTF-8"))
            # Provisional value; replaced by the attribute-header value below.
            player_obj["Overall"] = int(soup.find_all(class_ = "overall")[0].text)
            pos = soup.find("span", "position-primary")

            # Onyx cards don't have position-primary. Have to check old school.
            if pos is not None:
                player_obj["Position"] = pos.text
                secondary = soup.find("sup", "position-secondary")
                if secondary is not None:
                    player_obj["Secondary position"] = secondary.text.replace(u"\ufeff", "").strip()
            else:
                info_table = soup.find(class_ = "table-striped")
                # Dynamic duos may mess up the order
                if info_table.find_all("th")[3].text == "Position":
                    pos = info_table.find_all("td")[3]
                else:
                    pos = info_table.find_all("td")[4]
                positions = pos.text.split("/")
                player_obj["Position"] = positions[0]
                if len(positions) > 1:
                    # Get rid of feff
                    player_obj["Secondary position"] = positions[1].replace(u"\ufeff", "").strip()

            player_obj["Position"] = player_obj["Position"].replace(u"\ufeff", "").strip()

            # Some bs4 tables get wonky, so search the text nodes for the height.
            height_nodes = soup(text = height_pattern)
            if not height_nodes:
                # Fail explicitly instead of reusing a value left over from
                # the previous player.
                raise ValueError("no height found on " + str(link))
            height_parts = re.split("[^0-9]", height_nodes[0].parent.text)
            feet = height_parts[0]
            inches = height_parts[1]
            player_obj["Height"] = int(feet) * 12 + int(inches)

            # Look through blocks of attributes and parse out rating and name of rating
            player_obj["Overall"] = int(soup.find(class_ = "attribute-header").text[:2])
            for attributes in soup.find(class_ = "container-attributes").find_all(class_ = "attribute"):
                stats = re.sub(side_prefix, "", attributes.text)
                # Actual rating i.e. 89
                statVal = int(stats[:2])
                # Gotta account for on-ball d and pick & roll
                # Type of stat i.e. contested 3
                statName = str(re.search(r"\d\d((\s[^\+]+)+)", stats).group(1)).strip().replace(" ", "_")
                player_obj[statName] = statVal
            player_obj_list.append(player_obj)
            if i % 100 == 0:
                print(str(i) + " players added out of " + total_players)
        except Exception as e:
            # Best effort: report the failure and carry on with the next player.
            print(str(e))
            print(sys.exc_info()[2].tb_lineno)

    player_df = pd.DataFrame(player_obj_list)
    # Keep any unexpected extra columns at the end rather than dropping them.
    extra_columns = [name for name in player_df.columns if name not in column_order]
    player_df = player_df[column_order + extra_columns]
    return player_df

In [4]:
# One-off scrape that produced the cached CSV (left disabled):
# player_urls = get_player_urls()
# player_df = get_player_data(player_urls)
# player_df.to_csv("031417.csv", encoding = "UTF-8")

# Load the cached scrape and discard its first column
# (presumably the index written out by to_csv above).
player_df = pd.read_csv("031417.csv")
player_df = player_df.iloc[:, 1:]

In [5]:
# Show each column's positional index; the modelling cells select columns by position.
for column_index, column_name in enumerate(player_df.columns):
    print("%s %s" % (column_index, column_name))

0 _id
1 Name
2 Position
3 Secondary position
4 Height
5 Overall
6 Open_shot_mid
7 Contested_shot_mid
8 Off_dribble_shot_mid
9 Open_shot_3pt
10 Contested_shot_3pt
11 Off_dribble_shot_3pt
12 Shot_IQ
13 Free_throw
14 Offensive_consistency
15 Shot_close
16 Standing_layup
17 Driving_layup
18 Standing_dunk
19 Driving_dunk
20 Contact_dunk
21 Draw_foul
22 Post_control
23 Post_hook
24 Post_fadeaway
25 Hands
26 Ball_control
27 Passing_accuracy
28 Passing_vision
29 Passing_IQ
30 Speed_with_ball
31 Speed
32 Acceleration
33 Vertical
34 Strength
35 Stamina
36 Hustle
37 Overall_durability
38 On-ball_defense_IQ
39 Low_post_defense_IQ
40 Pick_&_roll_defense_IQ
41 Help_defense_IQ
42 Lateral_quickness
43 Pass_perception
44 Reaction_time
45 Steal
46 Block
47 Shot_contest
48 Defensive_consistency
49 Offensive_rebound
50 Defensive_rebound
51 Boxout
52 Potential
53 Intangibles


We proceed by using LASSO regression, which performs variable selection: the coefficients of variables that are spurious or, in our case, redundant are set to exactly 0. We will use 10-fold cross-validation to find the best model. Note that I'm being lazy and not creating a separate test set. In this initial test, we are using intangibles, which is supposedly a hidden statistic. (Caveat: the feature slice `iloc[:, 6:53]` used in the code below ends at `Potential`, column 52; `Intangibles` is column 53, so it is worth double-checking that the slice matches this description.) In addition, it should be mentioned that any model created will have some error; this is primarily because overall ratings are discrete, i.e. 95 and 96 instead of 95.1451765 and 96.51668918.

In [6]:
# Split the players by primary position. Each frame gets a fresh 0..n-1 index
# so it can be concatenated column-wise with prediction frames.
PGs = player_df[player_df["Position"] == "PG"].reset_index(drop = True)
SGs = player_df[player_df["Position"] == "SG"].reset_index(drop = True)
SFs = player_df[player_df["Position"] == "SF"].reset_index(drop = True)
PFs = player_df[player_df["Position"] == "PF"].reset_index(drop = True)
Cs = player_df[player_df["Position"] == "C"].reset_index(drop = True)

# One 10-fold cross-validated LASSO per position; the target is column 5 (Overall).
# NOTE(review): iloc[:, 6:53] selects columns 6..52 (Open_shot_mid through
# Potential). Intangibles is column 53 and is therefore not a predictor here.
# The cells that call predict() slice the same range, so change them together.
overall_PG = linear_model.LassoCV(cv = 10)
overall_PG.fit(PGs.iloc[:, 6:53], PGs.iloc[:, 5])
print(overall_PG.coef_)

overall_SG = linear_model.LassoCV(cv = 10)
overall_SG.fit(SGs.iloc[:, 6:53], SGs.iloc[:, 5])
print(overall_SG.coef_)

overall_SF = linear_model.LassoCV(cv = 10)
overall_SF.fit(SFs.iloc[:, 6:53], SFs.iloc[:, 5])
print(overall_SF.coef_)

overall_PF = linear_model.LassoCV(cv = 10)
overall_PF.fit(PFs.iloc[:, 6:53], PFs.iloc[:, 5])
print(overall_PF.coef_)

overall_C = linear_model.LassoCV(cv = 10)
overall_C.fit(Cs.iloc[:, 6:53], Cs.iloc[:, 5])
# Print the centre model's coefficients (the SG model's were printed here by mistake).
print(overall_C.coef_)

[ 0.07817677  0.01015573  0.          0.02104881  0.02401038  0.01991215
  0.04327408  0.00616868  0.08275802  0.03124395  0.          0.0337936  -0.
  0.0025231   0.          0.03406082  0.          0.01168954 -0.00039023
  0.01122173  0.03657617  0.08025324  0.05463707  0.02059106  0.00611249
  0.04899975  0.0121218   0.02968586  0.          0.          0.          0.
  0.01749185 -0.          0.04807432  0.02802564  0.          0.
  0.01701221  0.00069531  0.          0.          0.02859138  0.02646366
 -0.         -0.          0.15773216]
[ 0.05238117  0.02549295  0.02560605  0.04702443  0.01761466  0.03616396
  0.05998606  0.          0.07153649  0.03459132  0.          0.05658938
  0.00241024  0.01285332  0.          0.04659009  0.          0.         -0.
  0.01468108  0.010776    0.06843315  0.01668616  0.01483096  0.
  0.04624956  0.03212556  0.00395607  0.00675918  0.          0.         -0.
  0.04413675  0.00389872  0.          0.042725    0.00939164  0.02020946
  0.01196974 

To get a sense of what our models are doing, we can compare the predicted and actual overall ratings for the top players at each position.

In [7]:
# For each position, show name, actual overall and model prediction side by side
# for the first 15 rows.
for position_players, position_model in [(PGs, overall_PG), (SGs, overall_SG), (SFs, overall_SF),
                                         (PFs, overall_PF), (Cs, overall_C)]:
    predicted = pd.DataFrame(position_model.predict(position_players.iloc[:, 6:53]),
                             columns = ["Prediction"])
    # Columns 1 and 5 are Name and Overall.
    comparison = pd.concat([position_players.iloc[:, [1, 5]], predicted], axis = 1)
    print(comparison.head(15))

                 Name  Overall  Prediction
0          Jerry West       99   98.612331
1        Isiah Thomas       99   97.993831
2       Magic Johnson       98   96.276543
3        Walt Frazier       97   95.871682
4          Chris Paul       96   97.060187
5           Dave Bing       96   94.968564
6         Tony Parker       96   95.216865
7          Steve Nash       96   94.934872
8          Mark Price       96   93.893925
9      Penny Hardaway       95   94.915956
10  Russell Westbrook       95   94.746353
11      Magic Johnson       95   94.168638
12         Jason Kidd       95   95.863183
13     Nate Archibald       95   97.728564
14       Derrick Rose       95   94.645791
                Name  Overall  Prediction
0        Kobe Bryant       99  100.789269
1     Michael Jordan       99   98.445875
2        Kobe Bryant       99   98.190459
3      Tracy McGrady       97   96.757140
4     David Thompson       97   95.856410
5    Sidney Moncrief       97   95.412638
6     Mitch Richmo

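Suppose, however, that intangibles is omitted from the modeling process. We can create a new set of models that achieves this. (Caveat: the slice `iloc[:, 6:52]` used below ends at `Boxout`, column 51, so it also leaves out `Potential`, column 52.)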

In [8]:
# Refit one 10-fold cross-validated LASSO per position on a narrower feature
# range; the target is still column 5 (Overall).
# NOTE(review): iloc[:, 6:52] selects columns 6..51 (Open_shot_mid through
# Boxout), so both Potential (52) and Intangibles (53) are left out.
# The cells that call predict() slice the same range, so change them together.
overall_PG = linear_model.LassoCV(cv = 10)
overall_PG.fit(PGs.iloc[:, 6:52], PGs.iloc[:, 5])
print(overall_PG.coef_)

overall_SG = linear_model.LassoCV(cv = 10)
overall_SG.fit(SGs.iloc[:, 6:52], SGs.iloc[:, 5])
print(overall_SG.coef_)

overall_SF = linear_model.LassoCV(cv = 10)
overall_SF.fit(SFs.iloc[:, 6:52], SFs.iloc[:, 5])
print(overall_SF.coef_)

overall_PF = linear_model.LassoCV(cv = 10)
overall_PF.fit(PFs.iloc[:, 6:52], PFs.iloc[:, 5])
print(overall_PF.coef_)

overall_C = linear_model.LassoCV(cv = 10)
overall_C.fit(Cs.iloc[:, 6:52], Cs.iloc[:, 5])
# Print the centre model's coefficients (the SG model's were printed here by mistake).
print(overall_C.coef_)

[ 0.0600623   0.0244821   0.          0.          0.02663962  0.03316288
  0.03031623  0.          0.11618497  0.02000584  0.          0.05603219
 -0.          0.          0.          0.04321719  0.          0.02382115
 -0.          0.01285527  0.03133616  0.09313447  0.06255095  0.          0.
  0.0535281   0.0041318   0.02807961  0.          0.          0.          0.
  0.01003494 -0.          0.03859342  0.02004088  0.01112598  0.
  0.02600086  0.00373369 -0.          0.          0.0516748   0.01330536
 -0.          0.        ]
[ 0.02253349  0.03993026  0.03511985  0.0321898   0.01815853  0.04480382
  0.04191407  0.          0.0933267   0.04038259  0.          0.04589727
  0.          0.01280721  0.          0.05708485  0.          0.          0.
  0.01628787  0.          0.06269841  0.02082858  0.02982099  0.
  0.05502139  0.01784884  0.01528875  0.00474476  0.          0.         -0.
  0.02629758  0.00135626  0.          0.03259355  0.01781871  0.02141252
  0.01960554  0.00113247 

In [9]:
# For each position, show name, actual overall and the refitted model's
# prediction side by side for the first 15 rows.
for position_players, position_model in [(PGs, overall_PG), (SGs, overall_SG), (SFs, overall_SF),
                                         (PFs, overall_PF), (Cs, overall_C)]:
    predicted = pd.DataFrame(position_model.predict(position_players.iloc[:, 6:52]),
                             columns = ["Prediction"])
    # Columns 1 and 5 are Name and Overall.
    comparison = pd.concat([position_players.iloc[:, [1, 5]], predicted], axis = 1)
    print(comparison.head(15))

                 Name  Overall  Prediction
0          Jerry West       99   97.992870
1        Isiah Thomas       99   97.303108
2       Magic Johnson       98   96.089703
3        Walt Frazier       97   95.063575
4          Chris Paul       96   96.500448
5           Dave Bing       96   94.418843
6         Tony Parker       96   94.763453
7          Steve Nash       96   94.174706
8          Mark Price       96   93.034699
9      Penny Hardaway       95   94.480070
10  Russell Westbrook       95   93.830834
11      Magic Johnson       95   93.841797
12         Jason Kidd       95   95.659209
13     Nate Archibald       95   97.553960
14       Derrick Rose       95   94.302890
                Name  Overall  Prediction
0        Kobe Bryant       99  100.519730
1     Michael Jordan       99   97.999461
2        Kobe Bryant       99   97.418098
3      Tracy McGrady       97   96.614949
4     David Thompson       97   95.724900
5    Sidney Moncrief       97   95.045958
6     Mitch Richmo

From here, we can find the players whose predicted rating is much higher than their actual rating.

In [10]:
# For each position, list the 15 players whose prediction exceeds their actual
# overall by the widest margin, together with their Intangibles (column 53).
for position_players, position_model in [(PGs, overall_PG), (SGs, overall_SG), (SFs, overall_SF),
                                         (PFs, overall_PF), (Cs, overall_C)]:
    predicted = position_model.predict(position_players.iloc[:, 6:52])
    # Prediction minus actual overall (column 5); positive means over-predicted.
    difference = list(predicted - position_players.iloc[:, 5])
    report = pd.concat([position_players.iloc[:, [1, 5]],
                        pd.DataFrame(predicted, columns = ["Prediction"]),
                        position_players.iloc[:, 53],
                        pd.DataFrame(difference, columns = ["Difference"])],
                       axis = 1)
    print(report.sort_values(["Difference"], ascending = False).head(15))

                 Name  Overall  Prediction  Intangibles  Difference
192      Goran Dragic       79   82.349417           64    3.349417
247      Goran Dragic       75   78.228435           74    3.228435
90       Eric Bledsoe       86   89.151264           54    3.151264
279    Raymond Felton       70   73.084773           76    3.084773
207        Larry Drew       79   82.083555           70    3.083555
240      Jrue Holiday       76   79.017104           51    3.017104
280      Shelvin Mack       70   72.962962           74    2.962962
270         Ish Smith       72   74.785525           57    2.785525
138      Jrue Holiday       83   85.634143           51    2.634143
13     Nate Archibald       95   97.553960           40    2.553960
189       Jeff Teague       79   81.509012           45    2.509012
292  Tomas Satoransky       68   70.394436           66    2.394436
49       Eric Bledsoe       89   91.383238           54    2.383238
306        Tyler Ulis       66   68.379275      

Clearly, for the majority of these players, intangibles played a role in the difference between the actual and predicted overall rating.