In [291]:
import pandas as pd
import bs4
import requests
import re
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline



In [2]:
url = 'https://basketball.realgm.com/nba/hall-of-fame'
page = requests.get(url)
page.content

b'<!DOCTYPE html>\r\n<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\r\n<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->\r\n<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->\r\n<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->\r\n   <head>\r\n      <meta http-equiv="content-type" content="text/html; charset=utf-8">\n<title>NBA Hall of Famers - RealGM</title>\n<meta name="keywords" content="NBA Hall of Famers, RealGM">\n<meta name="description" content="NBA Hall of Famers - RealGM">\n<meta property="og:title" content="NBA Hall of Famers - RealGM" />\n<meta property="og:type" content="website" />\n<meta property="og:url" content="https://basketball.realgm.com/nba/hall-of-fame" />\n<meta property="og:description" content="NBA Hall of Famers - RealGM" />\n<meta property="article:publisher" content="https://www.facebook.com/RealGM">\n<meta property="fb:app_id" content="305642309526726">\n<meta name="viewport" conte

In [3]:
soup = bs4.BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<title>NBA Hall of Famers - RealGM</title>
<meta content="NBA Hall of Famers, RealGM" name="keywords"/>
<meta content="NBA Hall of Famers - RealGM" name="description"/>
<meta content="NBA Hall of Famers - RealGM" property="og:title">
<meta content="website" property="og:type">
<meta content="https://basketball.realgm.com/nba/hall-of-fame" property="og:url">
<meta content="NBA Hall of Famers - RealGM" property="og:description"/>
<meta content="https://www.facebook.com/RealGM" property="article:publisher"/>
<meta content="305642309526726" property="fb:app_id"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1

In [4]:
soup.find_all(href=re.compile("/player/"))

[<a href="/player/Kareem-Abdul-Jabbar/Summary/4629">Kareem Abdul-Jabbar</a>,
 <a href="/player/Ray-Allen/Summary/606">Ray Allen</a>,
 <a href="/player/Tiny-Archibald/Summary/64274">Tiny Archibald</a>,
 <a href="/player/Paul-Arizin/Summary/66196">Paul Arizin</a>,
 <a href="/player/Al-Attles/Summary/66847">Al Attles</a>,
 <a href="/player/Charles-Barkley/Summary/1259">Charles Barkley</a>,
 <a href="/player/Rick-Barry/Summary/65859">Rick Barry</a>,
 <a href="/player/Elgin-Baylor/Summary/65978">Elgin Baylor</a>,
 <a href="/player/Zelmo-Beaty/Summary/66186">Zelmo Beaty</a>,
 <a href="/player/Walt-Bellamy/Summary/66188">Walt Bellamy</a>,
 <a href="/player/Dave-Bing/Summary/65948">Dave Bing</a>,
 <a href="/player/Larry-Bird/Summary/4617">Larry Bird</a>,
 <a href="/player/Bill-Bradley/Summary/66051">Bill Bradley</a>,
 <a href="/player/Carl-Braun/Summary/66361">Carl Braun</a>,
 <a href="/player/Al-Cervi/Summary/66406">Al Cervi</a>,
 <a href="/player/Wilt-Chamberlain/Summary/65950">Wilt Chamberl

In [5]:
hof_players = []
for player in soup.find_all(href=re.compile("/player/")):
    hof_players.append(player.text)

In [46]:
hof_players.index('Frank Ramsey')

99

In [7]:
soup.find_all('h2')

[<h2 style="font-weight: bold;">RealGM Trade Checker™</h2>,
 <h2 class="page_title" style="line-height: 42px;">
 <img src="/images/basketball/5.0/team_logos/nba/nba_40.gif" style="float: left; margin-right: 0.5em;"> 
 Hall of Famers
 </img></h2>,
 <h2>Hall of Fame Players</h2>,
 <h2>Hall of Fame Coaches</h2>]

In [9]:
players_df = pd.read_csv('player_data.csv')
players_df.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [53]:
seasons_df = pd.read_csv('Seasons_stats.csv')
seasons_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


In [54]:
seasons_df.drop('Unnamed: 0', axis=1, inplace=True)

In [55]:
seasons_df.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,...,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,...,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,...,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,...,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,...,0.548,,,,20.0,,,,27.0,59.0


In [13]:
seasons_df.groupby(['Player']).sum()

Unnamed: 0_level_0,Year,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,35890.0,538.0,1361.0,905.0,39044.0,249.7,9.784,0.882,7.868,187.0,...,12.745,3576.0,6553.0,10129.0,1469.0,1103.0,562.0,1508.0,2581.0,12928.0
A.J. Bramlett,2000.0,23.0,8.0,0.0,61.0,-0.4,0.190,0.000,0.000,21.7,...,0.000,12.0,10.0,22.0,0.0,1.0,0.0,3.0,13.0,8.0
A.J. English,3983.0,47.0,151.0,18.0,3108.0,23.1,0.960,0.094,0.482,9.8,...,1.548,140.0,175.0,315.0,320.0,57.0,24.0,203.0,287.0,1502.0
A.J. Guyton,6006.0,69.0,80.0,14.0,1246.0,13.1,0.972,1.113,0.205,4.1,...,1.648,22.0,58.0,80.0,147.0,20.0,12.0,62.0,58.0,442.0
A.J. Hammons,2017.0,24.0,22.0,0.0,163.0,8.4,0.472,0.238,0.476,5.4,...,0.450,8.0,28.0,36.0,4.0,1.0,13.0,10.0,21.0,48.0
A.J. Price,18120.0,237.0,287.0,25.0,4253.0,93.5,3.915,4.104,1.412,17.9,...,5.058,74.0,335.0,409.0,613.0,132.0,9.0,243.0,252.0,1656.0
A.J. Wynder,1991.0,26.0,6.0,0.0,39.0,7.6,0.387,0.083,0.667,3.1,...,0.750,1.0,2.0,3.0,8.0,1.0,0.0,4.0,1.0,12.0
A.W. Holt,1971.0,24.0,6.0,0.0,14.0,-1.8,0.215,0.000,0.375,0.0,...,0.667,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,4.0
Aaron Brooks,30189.0,414.0,797.0,226.0,17088.0,184.9,7.684,6.326,2.710,33.6,...,11.732,323.0,1037.0,1360.0,2485.0,469.0,113.0,1272.0,1544.0,7839.0
Aaron Gordon,6048.0,60.0,205.0,117.0,4958.0,42.8,1.588,0.785,0.877,20.7,...,2.108,316.0,765.0,1081.0,311.0,144.0,117.0,193.0,408.0,1981.0


In [20]:
seasons_df.columns

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [23]:
avg_list = ['Player', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
           'WS/48', 'OBPM', 'DBPM', 'BPM', 'FG%', '3P%', '2P%', 'eFG%', 'FT%']

In [68]:
seasons_avg_df = seasons_df[avg_list]

In [25]:
seasons_avg_df.head()

Unnamed: 0,Player,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,...,USG%,WS/48,OBPM,DBPM,BPM,FG%,3P%,2P%,eFG%,FT%
0,Curly Armstrong,,0.368,,0.467,,,,,,...,,,,,,0.279,,0.279,0.279,0.705
1,Cliff Barker,,0.435,,0.387,,,,,,...,,,,,,0.372,,0.372,0.372,0.708
2,Leo Barnhorst,,0.394,,0.259,,,,,,...,,,,,,0.349,,0.349,0.349,0.698
3,Ed Bartels,,0.312,,0.395,,,,,,...,,,,,,0.256,,0.256,0.256,0.559
4,Ed Bartels,,0.308,,0.378,,,,,,...,,,,,,0.256,,0.256,0.256,0.548


In [69]:
seasons_avg_df.groupby(['Player']).mean()

Unnamed: 0_level_0,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,...,USG%,WS/48,OBPM,DBPM,BPM,FG%,3P%,2P%,eFG%,FT%
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,13.872222,0.543556,0.049000,0.437111,10.388889,18.811111,14.655556,5.216667,1.438889,0.850000,...,14.405556,0.122111,-0.505556,0.422222,-0.083333,0.487667,0.146833,0.501889,0.493167,0.708056
A.J. Bramlett,-0.400000,0.190000,0.000000,0.000000,21.700000,18.500000,20.100000,0.000000,0.800000,0.000000,...,17.100000,-0.129000,-10.500000,-6.300000,-16.800000,0.190000,,0.190000,0.190000,
A.J. English,11.550000,0.480000,0.047000,0.241000,4.900000,6.250000,5.550000,15.850000,0.900000,0.450000,...,23.700000,0.016500,-2.250000,-2.850000,-5.100000,0.436000,0.136500,0.450500,0.438500,0.774000
A.J. Guyton,4.366667,0.324000,0.371000,0.068333,1.366667,3.800000,2.533333,23.466667,2.466667,0.500000,...,20.766667,-0.141333,-6.066667,-4.933333,-11.000000,0.255667,0.255000,0.254000,0.310667,0.824000
A.J. Hammons,8.400000,0.472000,0.238000,0.476000,5.400000,20.900000,12.800000,3.800000,0.300000,7.200000,...,17.600000,-0.001000,-7.500000,1.900000,-5.600000,0.405000,0.500000,0.375000,0.464000,0.450000
A.J. Price,10.388889,0.435000,0.456000,0.156889,1.988889,9.200000,5.577778,22.733333,1.277778,0.088889,...,20.833333,0.021444,-1.511111,-2.655556,-4.144444,0.355222,0.242889,0.444667,0.412111,0.632250
A.J. Wynder,7.600000,0.387000,0.083000,0.667000,3.100000,5.200000,4.200000,24.000000,1.300000,0.000000,...,21.100000,-0.051000,-5.900000,-3.500000,-9.400000,0.250000,0.000000,0.273000,0.250000,0.750000
A.W. Holt,-1.800000,0.215000,,0.375000,,,13.500000,0.000000,,,...,,-0.398000,,,,0.125000,,0.125000,0.125000,0.667000
Aaron Brooks,12.326667,0.512267,0.421733,0.180667,2.240000,6.460000,4.353333,23.840000,1.400000,0.880000,...,22.153333,0.052333,-0.100000,-2.473333,-2.566667,0.403133,0.354467,0.437067,0.477667,0.838000
Aaron Gordon,14.266667,0.529333,0.261667,0.292333,6.900000,17.733333,12.233333,9.033333,1.433333,2.000000,...,17.633333,0.091667,-0.800000,0.266667,-0.566667,0.458000,0.285000,0.519667,0.495333,0.702667


In [27]:
sum_list = ['Player', 'G', 'GS', 'MP', 'OWS', 'DWS', 'WS', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB',
           'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

In [70]:
seasons_sum_df = seasons_df[sum_list]
seasons_sum_grouped_df = seasons_sum_df.groupby(['Player']).sum()

In [31]:
edited_player_df = players_df[['name', 'year_start', 'year_end', 'position']]
edited_player_df.head()

Unnamed: 0,name,year_start,year_end,position
0,Alaa Abdelnaby,1991,1995,F-C
1,Zaid Abdul-Aziz,1969,1978,C-F
2,Kareem Abdul-Jabbar,1970,1989,C
3,Mahmoud Abdul-Rauf,1991,2001,G
4,Tariq Abdul-Wahad,1998,2003,F


In [32]:
edited_player_df.rename(columns={'name': 'Player'}, inplace=True)
edited_player_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,Player,year_start,year_end,position
0,Alaa Abdelnaby,1991,1995,F-C
1,Zaid Abdul-Aziz,1969,1978,C-F
2,Kareem Abdul-Jabbar,1970,1989,C
3,Mahmoud Abdul-Rauf,1991,2001,G
4,Tariq Abdul-Wahad,1998,2003,F


In [33]:
edited_player_df.set_index('Player', inplace=True)
edited_player_df.head()

Unnamed: 0_level_0,year_start,year_end,position
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alaa Abdelnaby,1991,1995,F-C
Zaid Abdul-Aziz,1969,1978,C-F
Kareem Abdul-Jabbar,1970,1989,C
Mahmoud Abdul-Rauf,1991,2001,G
Tariq Abdul-Wahad,1998,2003,F


In [74]:
dataset_df = edited_player_df.join(seasons_sum_df.groupby(['Player']).sum(), on='Player', how='inner')
dataset_df

Unnamed: 0_level_0,year_start,year_end,position,G,GS,MP,OWS,DWS,WS,FG,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,F-C,385.0,105.0,5017.0,1.0,6.2,7.1,983.0,...,472.0,446.0,851.0,1297.0,125.0,111.0,107.0,389.0,777.0,2299.0
Zaid Abdul-Aziz,1969,1978,C-F,570.0,0.0,12126.0,4.8,12.5,17.4,1936.0,...,1536.0,522.0,1155.0,4524.0,648.0,134.0,208.0,28.0,1264.0,4978.0
Kareem Abdul-Jabbar,1970,1989,C,1560.0,625.0,57446.0,179.0,94.5,273.3,15837.0,...,9304.0,2975.0,9394.0,17440.0,5660.0,1160.0,3189.0,2527.0,4657.0,38387.0
Mahmoud Abdul-Rauf,1991,2001,G,586.0,336.0,15628.0,16.9,8.4,25.1,3514.0,...,1161.0,219.0,868.0,1087.0,2079.0,487.0,46.0,963.0,1106.0,8553.0
Tariq Abdul-Wahad,1998,2003,F,321.0,213.0,6826.0,-0.6,6.2,5.5,1049.0,...,755.0,428.0,723.0,1151.0,388.0,263.0,121.0,442.0,688.0,2662.0
Shareef Abdur-Rahim,1997,2008,F,915.0,760.0,31566.0,56.3,22.9,79.2,5935.0,...,5372.0,2058.0,4820.0,6878.0,2283.0,888.0,675.0,2320.0,2546.0,16412.0
Tom Abernethy,1977,1981,F,358.0,0.0,5732.0,8.6,5.1,13.7,749.0,...,465.0,394.0,665.0,1059.0,403.0,192.0,63.0,137.0,559.0,1842.0
Forest Able,1957,1957,G,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Alex Abrines,2017,2018,G-F,68.0,6.0,1055.0,1.2,0.9,2.1,134.0,...,49.0,18.0,68.0,86.0,40.0,37.0,8.0,33.0,114.0,406.0
Alex Acker,2006,2009,G,55.0,0.0,433.0,-0.4,0.1,-0.2,64.0,...,20.0,17.0,36.0,53.0,28.0,11.0,8.0,18.0,22.0,153.0


In [75]:
dataset_df = dataset_df.join(seasons_avg_df.groupby(['Player']).mean(), on='Player', how='inner')

In [47]:
seasons_df[seasons_df['Player'] == 'Kareem Abdul-Jabbar*']

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
2868,1970.0,Kareem Abdul-Jabbar*,C,22.0,MIL,82.0,,3534.0,22.5,0.552,...,0.653,,,1190.0,337.0,,,,283.0,2361.0
3070,1971.0,Kareem Abdul-Jabbar*,C,23.0,MIL,82.0,,3288.0,29.0,0.606,...,0.69,,,1311.0,272.0,,,,264.0,2596.0
3316,1972.0,Kareem Abdul-Jabbar*,C,24.0,MIL,81.0,,3583.0,29.9,0.603,...,0.689,,,1346.0,370.0,,,,235.0,2822.0
3582,1973.0,Kareem Abdul-Jabbar*,C,25.0,MIL,76.0,,3254.0,28.5,0.58,...,0.713,,,1224.0,379.0,,,,208.0,2292.0
3852,1974.0,Kareem Abdul-Jabbar*,C,26.0,MIL,81.0,,3548.0,24.4,0.564,...,0.702,287.0,891.0,1178.0,386.0,112.0,283.0,,238.0,2191.0
4098,1975.0,Kareem Abdul-Jabbar*,C,27.0,MIL,65.0,,2747.0,26.4,0.55,...,0.763,194.0,718.0,912.0,264.0,65.0,212.0,,205.0,1949.0
4375,1976.0,Kareem Abdul-Jabbar*,C,28.0,LAL,82.0,,3379.0,27.2,0.567,...,0.703,272.0,1111.0,1383.0,413.0,119.0,338.0,,292.0,2275.0
4650,1977.0,Kareem Abdul-Jabbar*,C,29.0,LAL,82.0,,3016.0,27.8,0.608,...,0.701,266.0,824.0,1090.0,319.0,101.0,261.0,,262.0,2152.0
5010,1978.0,Kareem Abdul-Jabbar*,C,30.0,LAL,62.0,,2265.0,29.2,0.589,...,0.783,186.0,615.0,801.0,269.0,103.0,185.0,208.0,182.0,1600.0
5382,1979.0,Kareem Abdul-Jabbar*,C,31.0,LAL,80.0,,3157.0,25.5,0.612,...,0.736,207.0,818.0,1025.0,431.0,76.0,316.0,282.0,230.0,1903.0


In [65]:
seasons_df['Player'] = seasons_df['Player'].apply(lambda x: re.sub('\*', "", x))

In [56]:
seasons_df.dropna(inplace=True, how='all')

In [67]:
seasons_df[seasons_df['Player'] == 'Kareem Abdul-Jabbar']

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
2868,1970.0,Kareem Abdul-Jabbar,C,22.0,MIL,82.0,,3534.0,22.5,0.552,...,0.653,,,1190.0,337.0,,,,283.0,2361.0
3070,1971.0,Kareem Abdul-Jabbar,C,23.0,MIL,82.0,,3288.0,29.0,0.606,...,0.69,,,1311.0,272.0,,,,264.0,2596.0
3316,1972.0,Kareem Abdul-Jabbar,C,24.0,MIL,81.0,,3583.0,29.9,0.603,...,0.689,,,1346.0,370.0,,,,235.0,2822.0
3582,1973.0,Kareem Abdul-Jabbar,C,25.0,MIL,76.0,,3254.0,28.5,0.58,...,0.713,,,1224.0,379.0,,,,208.0,2292.0
3852,1974.0,Kareem Abdul-Jabbar,C,26.0,MIL,81.0,,3548.0,24.4,0.564,...,0.702,287.0,891.0,1178.0,386.0,112.0,283.0,,238.0,2191.0
4098,1975.0,Kareem Abdul-Jabbar,C,27.0,MIL,65.0,,2747.0,26.4,0.55,...,0.763,194.0,718.0,912.0,264.0,65.0,212.0,,205.0,1949.0
4375,1976.0,Kareem Abdul-Jabbar,C,28.0,LAL,82.0,,3379.0,27.2,0.567,...,0.703,272.0,1111.0,1383.0,413.0,119.0,338.0,,292.0,2275.0
4650,1977.0,Kareem Abdul-Jabbar,C,29.0,LAL,82.0,,3016.0,27.8,0.608,...,0.701,266.0,824.0,1090.0,319.0,101.0,261.0,,262.0,2152.0
5010,1978.0,Kareem Abdul-Jabbar,C,30.0,LAL,62.0,,2265.0,29.2,0.589,...,0.783,186.0,615.0,801.0,269.0,103.0,185.0,208.0,182.0,1600.0
5382,1979.0,Kareem Abdul-Jabbar,C,31.0,LAL,80.0,,3157.0,25.5,0.612,...,0.736,207.0,818.0,1025.0,431.0,76.0,316.0,282.0,230.0,1903.0


In [165]:
hof_players

['Kareem Abdul-Jabbar',
 'Ray Allen',
 'Tiny Archibald',
 'Paul Arizin',
 'Al Attles',
 'Charles Barkley',
 'Rick Barry',
 'Elgin Baylor',
 'Zelmo Beaty',
 'Walt Bellamy',
 'Dave Bing',
 'Larry Bird',
 'Bill Bradley',
 'Carl Braun',
 'Al Cervi',
 'Wilt Chamberlain',
 'Mo Cheeks',
 'Chuck Cooper',
 'Bob Cousy',
 'Dave Cowens',
 'Billy Cunningham',
 'Louie Dampier',
 'Mel Daniels',
 'Adrian Dantley',
 'Bob Davies',
 'Dave DeBusschere',
 'Vlade Divac',
 'Clyde Drexler',
 'Joe Dumars',
 'Alex English',
 'Julius Erving',
 'Patrick Ewing',
 'Walt Frazier',
 'Joe Fulks',
 'Nick Galis',
 'Harry Gallatin',
 'George Gervin',
 'Artis Gilmore',
 'Tom Gola',
 'Gail Goodrich',
 'Hal Greer',
 'Richie Guerin',
 'Cliff Hagan',
 'John Havlicek',
 'Connie Hawkins',
 'Elvin Hayes',
 'Spencer Haywood',
 'Tommy Heinsohn',
 'Grant Hill',
 'Bob Houbregs',
 'Bailey Howell',
 'Dan Issel',
 'Allen Iverson',
 'Buddy Jeannette',
 'Dennis Johnson',
 'Gus Johnson',
 'Magic Johnson',
 'Neil Johnston',
 'Bobby Jones',

In [76]:
dataset_df

Unnamed: 0_level_0,year_start,year_end,position,G,GS,MP,OWS,DWS,WS,FG,...,USG%,WS/48,OBPM,DBPM,BPM,FG%,3P%,2P%,eFG%,FT%
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,F-C,385.0,105.0,5017.0,1.0,6.2,7.1,983.0,...,22.433333,-0.004000,-5.744444,-2.133333,-7.888889,0.449333,0.000000,0.451444,0.449333,0.671375
Zaid Abdul-Aziz,1969,1978,C-F,570.0,0.0,12126.0,4.8,12.5,17.4,1936.0,...,23.000000,0.045000,-2.171429,-0.328571,-2.485714,0.399500,,0.399500,0.399500,0.674714
Kareem Abdul-Jabbar,1970,1989,C,1560.0,625.0,57446.0,179.0,94.5,273.3,15837.0,...,24.208333,0.220500,3.518750,1.825000,5.331250,0.558350,0.033300,0.558750,0.558350,0.727700
Mahmoud Abdul-Rauf,1991,2001,G,586.0,336.0,15628.0,16.9,8.4,25.1,3514.0,...,25.588889,0.061556,0.333333,-3.377778,-3.055556,0.439778,0.316444,0.458333,0.466889,0.893111
Tariq Abdul-Wahad,1998,2003,F,321.0,213.0,6826.0,-0.6,6.2,5.5,1049.0,...,18.770000,0.015200,-3.520000,0.450000,-3.060000,0.369000,0.291333,0.373700,0.373700,0.617900
Shareef Abdur-Rahim,1997,2008,F,915.0,760.0,31566.0,56.3,22.9,79.2,5935.0,...,23.900000,0.111000,0.621429,-0.421429,0.207143,0.454929,0.286538,0.462357,0.460929,0.829500
Tom Abernethy,1977,1981,F,358.0,0.0,5732.0,8.6,5.1,13.7,749.0,...,11.600000,0.087714,-1.628571,-0.114286,-1.742857,0.452000,0.000000,0.454286,0.452000,0.691286
Forest Able,1957,1957,G,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,-2.062000,,,,0.000000,,0.000000,0.000000,
Alex Abrines,2017,2018,G-F,68.0,6.0,1055.0,1.2,0.9,2.1,134.0,...,15.900000,0.095000,-0.300000,-2.200000,-2.500000,0.393000,0.381000,0.426000,0.531000,0.898000
Alex Acker,2006,2009,G,55.0,0.0,433.0,-0.4,0.1,-0.2,64.0,...,23.700000,-0.061250,-5.200000,-1.525000,-6.750000,0.352250,0.247000,0.410750,0.385000,0.500000


In [77]:
def hall_of_fame(name):
    if name in hof_players:
        if name == 'Bill Bradley':
            if dataset_df['year_end']
        return 'Yes'
    else:
        return 'No'

In [172]:
dataset_df.index[]

'Alaa Abdelnaby'

In [82]:
dataset_df['Hall of Fame'] = [hall_of_fame(name) for name in dataset_df.index]

In [167]:
dataset_df[dataset_df['Hall of Fame'] == 'Yes'].duplicated(['Player'])

Player
Kareem Abdul-Jabbar    False
Ray Allen              False
Tiny Archibald         False
Paul Arizin            False
Al Attles              False
Charles Barkley        False
Rick Barry             False
Elgin Baylor           False
Zelmo Beaty            False
Walt Bellamy           False
Dave Bing              False
Larry Bird             False
Bill Bradley           False
Bill Bradley            True
Carl Braun             False
Al Cervi               False
Wilt Chamberlain       False
Chuck Cooper           False
Bob Cousy              False
Dave Cowens            False
Billy Cunningham       False
Louie Dampier          False
Mel Daniels            False
Adrian Dantley         False
Bob Davies             False
Dave DeBusschere       False
Vlade Divac            False
Clyde Drexler          False
Joe Dumars             False
Alex English           False
                       ...  
Arnie Risen            False
Oscar Robertson        False
David Robinson         False
Guy Rod

In [87]:
dataset_df.duplicated(['Player', 'year_start'])

KeyError: Index(['Player'], dtype='object')

In [88]:
dataset_df['Player'] = dataset_df.index

In [164]:
dataset_df[dataset_df['Player'] == 'Bill Bradley' and dataset_df['year_end'] == 1968]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

'Yes'

In [152]:
dataset_df.loc[dataset_df['year_end'] == 1968]

KeyError: 'Bill Bradley'

In [171]:
hall_of_fame_df[hall_of_fame_df.duplicated(['Player'])]

Unnamed: 0_level_0,year_start,year_end,position,G,GS,MP,OWS,DWS,WS,FG,...,OBPM,DBPM,BPM,FG%,3P%,2P%,eFG%,FT%,Hall of Fame,Player
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bill Bradley,1968,1977,F-G,742.0,0.0,22799.0,15.4,23.3,38.8,3927.0,...,-1.85,-0.65,-2.525,0.4466,,0.4466,0.4466,0.8347,Yes,Bill Bradley
Patrick Ewing,2011,2011,F,1190.0,1122.0,40613.0,44.8,81.5,126.4,9702.0,...,-1.416667,2.122222,0.733333,0.467833,0.083167,0.469833,0.468111,0.734111,Yes,Patrick Ewing
Bobby Jones,2007,2008,F,912.0,132.0,21575.0,40.7,34.8,75.3,3575.0,...,-0.564706,1.335294,0.770588,0.497059,0.113308,0.530765,0.513353,0.767467,Yes,Bobby Jones
Gary Payton,2017,2018,G,1421.0,1313.0,50424.0,103.3,51.1,154.4,9381.0,...,2.59,-0.16,2.43,0.4555,0.2751,0.4985,0.4837,0.71855,Yes,Gary Payton


In [203]:
players = list(dataset_df.index)

In [196]:
index_list.index('Gary Payton')

2723

In [197]:
dataset_df.index[2723:2725]

Index(['Gary Payton', 'Gary Payton'], dtype='object', name='Player')

In [191]:
dataset_df.iloc[1038, 47] = 'No'

In [195]:
dataset_df.iloc[1807, 47] = 'No'

In [199]:
dataset_df.iloc[2724, 47] = 'No'

In [201]:
dataset_df.drop('Player', axis=1, inplace=True)

In [202]:
dataset_df

Unnamed: 0_level_0,year_start,year_end,position,G,GS,MP,OWS,DWS,WS,FG,...,WS/48,OBPM,DBPM,BPM,FG%,3P%,2P%,eFG%,FT%,Hall of Fame
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,F-C,385.0,105.0,5017.0,1.0,6.2,7.1,983.0,...,-0.004000,-5.744444,-2.133333,-7.888889,0.449333,0.000000,0.451444,0.449333,0.671375,No
Zaid Abdul-Aziz,1969,1978,C-F,570.0,0.0,12126.0,4.8,12.5,17.4,1936.0,...,0.045000,-2.171429,-0.328571,-2.485714,0.399500,,0.399500,0.399500,0.674714,No
Kareem Abdul-Jabbar,1970,1989,C,1560.0,625.0,57446.0,179.0,94.5,273.3,15837.0,...,0.220500,3.518750,1.825000,5.331250,0.558350,0.033300,0.558750,0.558350,0.727700,Yes
Mahmoud Abdul-Rauf,1991,2001,G,586.0,336.0,15628.0,16.9,8.4,25.1,3514.0,...,0.061556,0.333333,-3.377778,-3.055556,0.439778,0.316444,0.458333,0.466889,0.893111,No
Tariq Abdul-Wahad,1998,2003,F,321.0,213.0,6826.0,-0.6,6.2,5.5,1049.0,...,0.015200,-3.520000,0.450000,-3.060000,0.369000,0.291333,0.373700,0.373700,0.617900,No
Shareef Abdur-Rahim,1997,2008,F,915.0,760.0,31566.0,56.3,22.9,79.2,5935.0,...,0.111000,0.621429,-0.421429,0.207143,0.454929,0.286538,0.462357,0.460929,0.829500,No
Tom Abernethy,1977,1981,F,358.0,0.0,5732.0,8.6,5.1,13.7,749.0,...,0.087714,-1.628571,-0.114286,-1.742857,0.452000,0.000000,0.454286,0.452000,0.691286,No
Forest Able,1957,1957,G,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-2.062000,,,,0.000000,,0.000000,0.000000,,No
Alex Abrines,2017,2018,G-F,68.0,6.0,1055.0,1.2,0.9,2.1,134.0,...,0.095000,-0.300000,-2.200000,-2.500000,0.393000,0.381000,0.426000,0.531000,0.898000,No
Alex Acker,2006,2009,G,55.0,0.0,433.0,-0.4,0.1,-0.2,64.0,...,-0.061250,-5.200000,-1.525000,-6.750000,0.352250,0.247000,0.410750,0.385000,0.500000,No


In [204]:
players

['Alaa Abdelnaby',
 'Zaid Abdul-Aziz',
 'Kareem Abdul-Jabbar',
 'Mahmoud Abdul-Rauf',
 'Tariq Abdul-Wahad',
 'Shareef Abdur-Rahim',
 'Tom Abernethy',
 'Forest Able',
 'Alex Abrines',
 'Alex Acker',
 'Don Ackerman',
 'Mark Acres',
 'Bud Acton',
 'Quincy Acy',
 'Alvan Adams',
 'Don Adams',
 'Hassan Adams',
 'Jordan Adams',
 'Michael Adams',
 'Steven Adams',
 'Rafael Addison',
 'Rick Adelman',
 'Jeff Adrien',
 'Arron Afflalo',
 'Maurice Ager',
 'Mark Aguirre',
 'Blake Ahearn',
 'Danny Ainge',
 'Alexis Ajinca',
 'Henry Akin',
 'Josh Akognon',
 'Solomon Alabi',
 'Mark Alarie',
 'Gary Alcorn',
 'Furkan Aldemir',
 'Cole Aldrich',
 'LaMarcus Aldridge',
 'Chuck Aleksinas',
 'Cliff Alexander',
 'Cory Alexander',
 'Courtney Alexander',
 'Gary Alexander',
 'Joe Alexander',
 'Victor Alexander',
 'Steve Alford',
 'Bob Allen',
 'Jerome Allen',
 'Lavoy Allen',
 'Lucius Allen',
 'Malik Allen',
 'Randy Allen',
 'Ray Allen',
 'Tony Allen',
 'Odis Allison',
 'Lance Allred',
 'Darrell Allums',
 'Morris Alm

In [207]:
all_nba_url = 'https://www.nba.com/history/awards/all-nba-team'
all_nba_page = requests.get(all_nba_url)
all_nba_page.content

b'\n\n\n<!DOCTYPE html><html class="" lang="en" dir="ltr"><head><link rel="dns-prefetch" href="//fastlane.rubiconproject.com"><link rel="dns-prefetch" href="//optimized-by.rubiconproject.com"><link rel="dns-prefetch" href="//ads.rubiconproject.com"><link rel="dns-prefetch" href="//aax.amazon-adsystem.com"><link rel="dns-prefetch" href="//c.amazon-adsystem.com"><link rel="dns-prefetch" href="//rtax.criteo.com"><meta charset="utf-8" />\n<meta name="title" property="title" content="Year-by-year All-NBA Teams | NBA.com" />\n<link rel="canonical" href="http://www.nba.com/history/awards/all-nba-team" />\n<meta name="Generator" content="Drupal 8 (https://www.drupal.org)" />\n<meta name="MobileOptimized" content="width" />\n<meta name="HandheldFriendly" content="true" />\n<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n<meta name="viewport" content="width=device-width" />\n<meta content="ie=edge, chrome=1" http-equiv="x-ua-compatible" /><script type="text/javascript">

In [208]:
soup = bs4.BeautifulSoup(all_nba_page.content, 'html.parser')
soup


<!DOCTYPE html>
<html class="" dir="ltr" lang="en"><head><link href="//fastlane.rubiconproject.com" rel="dns-prefetch"/><link href="//optimized-by.rubiconproject.com" rel="dns-prefetch"/><link href="//ads.rubiconproject.com" rel="dns-prefetch"/><link href="//aax.amazon-adsystem.com" rel="dns-prefetch"/><link href="//c.amazon-adsystem.com" rel="dns-prefetch"/><link href="//rtax.criteo.com" rel="dns-prefetch"/><meta charset="utf-8"/>
<meta content="Year-by-year All-NBA Teams | NBA.com" name="title" property="title"/>
<link href="http://www.nba.com/history/awards/all-nba-team" rel="canonical">
<meta content="Drupal 8 (https://www.drupal.org)" name="Generator"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="width=device-width" name="viewport"/>
<meta content="ie=edge, chrome=1" http-equiv="x-ua-compatible"/><script type="text/javascript">(window.NREUM||(NR

In [213]:
forward_tag = soup.find('strong', string="F:")

In [215]:
forward_tag.next_sibling

'\xa0Giannis Antetokounmpo, Milwaukee Bucks'

In [224]:
soup.find('h3', string='1946-47').next_sibling

<p><strong><u>FIRST TEAM</u></strong></p>

In [259]:
test_year = soup.find('h3', string='2016-17')

In [266]:
all_nba_dict = {'FIRST TEAM': [], 'SECOND TEAM': [], 'THIRD TEAM': []}
team = ''
for sibling in test_year.next_siblings:
    if sibling.string and sibling.string in ['FIRST TEAM', 'SECOND TEAM', 'THIRD TEAM']:
        team = sibling.string
        for tag in sibling.next_sibling.children:
            if tag.string and tag.string.strip() in ['C:', 'F:', 'G:']:
                continue
            elif tag.string and tag.string.strip():
                if tag.string.strip().find(',') != -1:
                    end = tag.string.strip().find(',')
                    all_nba_dict[team].append(tag.string.strip()[0:end])
                else: 
                    all_nba_dict[team].append(tag.string.strip())

In [267]:
all_nba_dict

{'FIRST TEAM': ['LeBron James',
  'Kawhi Leonard',
  'Anthony Davis',
  'James Harden',
  'Russell Westbrook',
  'LeBron James',
  'Kawhi Leonard',
  'DeAndre Jordan',
  'Stephen Curry',
  'Russell Westbrook',
  'LeBron James',
  'Anthony Davis',
  'Marc Gasol',
  'Stephen Curry',
  'James Harden',
  'Kevin Durant',
  'LeBron James',
  'Joakim Noah',
  'James Harden',
  'Chris Paul',
  'LeBron James',
  'Kevin Durant',
  'Tim Duncan',
  'Kobe Bryant',
  'Chris Paul',
  'Kevin Durant',
  'LeBron James',
  'Dwight Howard',
  'Kobe Bryant',
  'Derrick Rose',
  'Kevin Durant',
  'LeBron James',
  'Dwight Howard',
  'Kobe Bryant',
  'Derrick Rose',
  'Kevin Durant',
  'LeBron James',
  'Dwight Howard',
  'Kobe Bryant',
  'Dwyane Wade',
  'Dirk Nowitzki',
  'LeBron James',
  'Dwight Howard',
  'Kobe Bryant',
  'Dwyane Wade',
  'Kevin Garnett',
  'LeBron James',
  'Dwight Howard',
  'Kobe Bryant',
  'Chris Paul',
  'Tim Duncan',
  'Dirk Nowitzki',
  "Amar'e Stoudemire",
  'Kobe Bryant',
  'St

In [258]:
seasons_df[seasons_df['Year'] == 2017]

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
24096,2017.0,Alex Abrines,SG,23.0,OKC,68.0,6.0,1055.0,10.1,0.560,...,0.898,18.0,68.0,86.0,40.0,37.0,8.0,33.0,114.0,406.0
24097,2017.0,Quincy Acy,PF,26.0,TOT,38.0,1.0,558.0,11.8,0.565,...,0.750,20.0,95.0,115.0,18.0,14.0,15.0,21.0,67.0,222.0
24098,2017.0,Quincy Acy,PF,26.0,DAL,6.0,0.0,48.0,-1.4,0.355,...,0.667,2.0,6.0,8.0,0.0,0.0,0.0,2.0,9.0,13.0
24099,2017.0,Quincy Acy,PF,26.0,BRK,32.0,1.0,510.0,13.1,0.587,...,0.754,18.0,89.0,107.0,18.0,14.0,15.0,19.0,58.0,209.0
24100,2017.0,Steven Adams,C,23.0,OKC,80.0,80.0,2389.0,16.5,0.589,...,0.611,282.0,333.0,615.0,86.0,88.0,78.0,146.0,195.0,905.0
24101,2017.0,Arron Afflalo,SG,31.0,SAC,61.0,45.0,1580.0,9.0,0.559,...,0.892,9.0,116.0,125.0,78.0,21.0,7.0,42.0,104.0,515.0
24102,2017.0,Alexis Ajinca,C,28.0,NOP,39.0,15.0,584.0,12.9,0.529,...,0.725,46.0,131.0,177.0,12.0,20.0,22.0,31.0,77.0,207.0
24103,2017.0,Cole Aldrich,C,28.0,MIN,62.0,0.0,531.0,12.7,0.549,...,0.682,51.0,107.0,158.0,25.0,25.0,23.0,17.0,85.0,105.0
24104,2017.0,LaMarcus Aldridge,PF,31.0,SAS,72.0,72.0,2335.0,18.6,0.532,...,0.812,174.0,350.0,524.0,139.0,46.0,89.0,98.0,158.0,1243.0
24105,2017.0,Lavoy Allen,PF,27.0,IND,61.0,5.0,871.0,11.6,0.485,...,0.697,105.0,115.0,220.0,57.0,18.0,24.0,29.0,78.0,177.0


Player
Alaa Abdelnaby         0
Zaid Abdul-Aziz        0
Kareem Abdul-Jabbar    0
Mahmoud Abdul-Rauf     0
Tariq Abdul-Wahad      0
Shareef Abdur-Rahim    0
Tom Abernethy          0
Forest Able            0
Alex Abrines           0
Alex Acker             0
Don Ackerman           0
Mark Acres             0
Bud Acton              0
Quincy Acy             0
Alvan Adams            0
Don Adams              0
Hassan Adams           0
Jordan Adams           0
Michael Adams          0
Steven Adams           0
Rafael Addison         0
Rick Adelman           0
Jeff Adrien            0
Arron Afflalo          0
Maurice Ager           0
Mark Aguirre           0
Blake Ahearn           0
Danny Ainge            0
Alexis Ajinca          0
Henry Akin             0
                      ..
Charlie Yelverton      0
Rich Yonakor           0
Danny Young            0
James Young            0
Joe Young              0
Korleone Young         0
Michael Young          0
Nick Young             0
Perry Young       

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,...,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,...,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,...,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,...,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,...,0.548,,,,20.0,,,,27.0,59.0
5,1950.0,Ed Bartels,F,24.0,NYK,2.0,,,,0.376,...,0.667,,,,0.0,,,,2.0,4.0
6,1950.0,Ralph Beard,G,22.0,INO,60.0,,,,0.422,...,0.762,,,,233.0,,,,132.0,895.0
7,1950.0,Gene Berce,G-F,23.0,TRI,3.0,,,,0.275,...,0.000,,,,2.0,,,,6.0,10.0
8,1950.0,Charlie Black,F-C,28.0,TOT,65.0,,,,0.346,...,0.651,,,,163.0,,,,273.0,661.0
9,1950.0,Charlie Black,F-C,28.0,FTW,36.0,,,,0.362,...,0.632,,,,75.0,,,,140.0,382.0


In [273]:
dataset_df.drop(['NBA First Team'], axis=1, inplace=True)

In [274]:
dataset_df

Unnamed: 0_level_0,year_start,year_end,position,G,GS,MP,OWS,DWS,WS,FG,...,WS/48,OBPM,DBPM,BPM,FG%,3P%,2P%,eFG%,FT%,Hall of Fame
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,F-C,385.0,105.0,5017.0,1.0,6.2,7.1,983.0,...,-0.004000,-5.744444,-2.133333,-7.888889,0.449333,0.000000,0.451444,0.449333,0.671375,No
Zaid Abdul-Aziz,1969,1978,C-F,570.0,0.0,12126.0,4.8,12.5,17.4,1936.0,...,0.045000,-2.171429,-0.328571,-2.485714,0.399500,,0.399500,0.399500,0.674714,No
Kareem Abdul-Jabbar,1970,1989,C,1560.0,625.0,57446.0,179.0,94.5,273.3,15837.0,...,0.220500,3.518750,1.825000,5.331250,0.558350,0.033300,0.558750,0.558350,0.727700,Yes
Mahmoud Abdul-Rauf,1991,2001,G,586.0,336.0,15628.0,16.9,8.4,25.1,3514.0,...,0.061556,0.333333,-3.377778,-3.055556,0.439778,0.316444,0.458333,0.466889,0.893111,No
Tariq Abdul-Wahad,1998,2003,F,321.0,213.0,6826.0,-0.6,6.2,5.5,1049.0,...,0.015200,-3.520000,0.450000,-3.060000,0.369000,0.291333,0.373700,0.373700,0.617900,No
Shareef Abdur-Rahim,1997,2008,F,915.0,760.0,31566.0,56.3,22.9,79.2,5935.0,...,0.111000,0.621429,-0.421429,0.207143,0.454929,0.286538,0.462357,0.460929,0.829500,No
Tom Abernethy,1977,1981,F,358.0,0.0,5732.0,8.6,5.1,13.7,749.0,...,0.087714,-1.628571,-0.114286,-1.742857,0.452000,0.000000,0.454286,0.452000,0.691286,No
Forest Able,1957,1957,G,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-2.062000,,,,0.000000,,0.000000,0.000000,,No
Alex Abrines,2017,2018,G-F,68.0,6.0,1055.0,1.2,0.9,2.1,134.0,...,0.095000,-0.300000,-2.200000,-2.500000,0.393000,0.381000,0.426000,0.531000,0.898000,No
Alex Acker,2006,2009,G,55.0,0.0,433.0,-0.4,0.1,-0.2,64.0,...,-0.061250,-5.200000,-1.525000,-6.750000,0.352250,0.247000,0.410750,0.385000,0.500000,No


In [278]:
dataset_df['position']['George Karl'] = 'G'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [282]:
dataset_df.fillna(0, inplace=True)

In [283]:
dataset_df.isna().sum()

year_start      0
year_end        0
position        0
G               0
GS              0
MP              0
OWS             0
DWS             0
WS              0
FG              0
FGA             0
3P              0
3PA             0
2P              0
2PA             0
FT              0
FTA             0
ORB             0
DRB             0
TRB             0
AST             0
STL             0
BLK             0
TOV             0
PF              0
PTS             0
PER             0
TS%             0
3PAr            0
FTr             0
ORB%            0
DRB%            0
TRB%            0
AST%            0
STL%            0
BLK%            0
TOV%            0
USG%            0
WS/48           0
OBPM            0
DBPM            0
BPM             0
FG%             0
3P%             0
2P%             0
eFG%            0
FT%             0
Hall of Fame    0
dtype: int64

In [284]:
dataset_df['position'].astype('category')

Player
Alaa Abdelnaby         F-C
Zaid Abdul-Aziz        C-F
Kareem Abdul-Jabbar      C
Mahmoud Abdul-Rauf       G
Tariq Abdul-Wahad        F
Shareef Abdur-Rahim      F
Tom Abernethy            F
Forest Able              G
Alex Abrines           G-F
Alex Acker               G
Don Ackerman             G
Mark Acres             F-C
Bud Acton                F
Quincy Acy               F
Alvan Adams            C-F
Don Adams                F
Hassan Adams             G
Jordan Adams             G
Michael Adams            G
Steven Adams             C
Rafael Addison         F-G
Rick Adelman             G
Jeff Adrien              F
Arron Afflalo            G
Maurice Ager             G
Mark Aguirre           F-G
Blake Ahearn             G
Danny Ainge              G
Alexis Ajinca            C
Henry Akin             C-F
                      ... 
Charlie Yelverton      G-F
Rich Yonakor             F
Danny Young              G
James Young            G-F
Joe Young                G
Korleone Young       

In [285]:
dummy_df = pd.get_dummies(dataset_df['position'], drop_first=True)
dummy_df

Unnamed: 0_level_0,C-F,F,F-C,F-G,G,G-F
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alaa Abdelnaby,0,0,1,0,0,0
Zaid Abdul-Aziz,1,0,0,0,0,0
Kareem Abdul-Jabbar,0,0,0,0,0,0
Mahmoud Abdul-Rauf,0,0,0,0,1,0
Tariq Abdul-Wahad,0,1,0,0,0,0
Shareef Abdur-Rahim,0,1,0,0,0,0
Tom Abernethy,0,1,0,0,0,0
Forest Able,0,0,0,0,1,0
Alex Abrines,0,0,0,0,0,1
Alex Acker,0,0,0,0,1,0


In [287]:
dataset_df = pd.concat([dataset_df, dummy_df], axis=1)
dataset_df.drop(['position'], axis=1, inplace=True)
dataset_df

Unnamed: 0_level_0,year_start,year_end,G,GS,MP,OWS,DWS,WS,FG,FGA,...,2P%,eFG%,FT%,Hall of Fame,C-F,F,F-C,F-G,G,G-F
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,385.0,105.0,5017.0,1.0,6.2,7.1,983.0,1940.0,...,0.451444,0.449333,0.671375,No,0,0,1,0,0,0
Zaid Abdul-Aziz,1969,1978,570.0,0.0,12126.0,4.8,12.5,17.4,1936.0,4588.0,...,0.399500,0.399500,0.674714,No,1,0,0,0,0,0
Kareem Abdul-Jabbar,1970,1989,1560.0,625.0,57446.0,179.0,94.5,273.3,15837.0,28307.0,...,0.558750,0.558350,0.727700,Yes,0,0,0,0,0,0
Mahmoud Abdul-Rauf,1991,2001,586.0,336.0,15628.0,16.9,8.4,25.1,3514.0,7943.0,...,0.458333,0.466889,0.893111,No,0,0,0,0,1,0
Tariq Abdul-Wahad,1998,2003,321.0,213.0,6826.0,-0.6,6.2,5.5,1049.0,2519.0,...,0.373700,0.373700,0.617900,No,0,1,0,0,0,0
Shareef Abdur-Rahim,1997,2008,915.0,760.0,31566.0,56.3,22.9,79.2,5935.0,12569.0,...,0.462357,0.460929,0.829500,No,0,1,0,0,0,0
Tom Abernethy,1977,1981,358.0,0.0,5732.0,8.6,5.1,13.7,749.0,1531.0,...,0.454286,0.452000,0.691286,No,0,1,0,0,0,0
Forest Able,1957,1957,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,...,0.000000,0.000000,0.000000,No,0,0,0,0,1,0
Alex Abrines,2017,2018,68.0,6.0,1055.0,1.2,0.9,2.1,134.0,341.0,...,0.426000,0.531000,0.898000,No,0,0,0,0,0,1
Alex Acker,2006,2009,55.0,0.0,433.0,-0.4,0.1,-0.2,64.0,168.0,...,0.410750,0.385000,0.500000,No,0,0,0,0,1,0


In [288]:
dataset_df.columns

Index(['year_start', 'year_end', 'G', 'GS', 'MP', 'OWS', 'DWS', 'WS', 'FG',
       'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'WS/48',
       'OBPM', 'DBPM', 'BPM', 'FG%', '3P%', '2P%', 'eFG%', 'FT%',
       'Hall of Fame', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F'],
      dtype='object')

In [292]:
X = dataset_df.drop(['Hall of Fame'], axis=1)
y = dataset_df['Hall of Fame']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
random_state = 4
classifiers = [LogisticRegression(random_state=random_state), 
               SVC(random_state=random_state), 
               DecisionTreeClassifier(random_state=random_state)]
params = [
    {'clf__C':[0.25, 0.50, 0.75, 1.0],
     'clf__class_weight': [None, 'balanced', {'Yes':2, 'No':1}, {'Yes':10, 'No':1}, {'Yes':100, 'No':1}, {'Yes':1000, 'No':1}]},
    
    {'clf__kernel': ['poly', 'sigmoid'],
     'clf__C': [0.01, 1, 100],
     'clf__degree': [2,3,4,5],
     'clf__gamma': [0.001, 0.01]},
    
    {'clf__criterion':['gini','entropy'],
     'clf__max_depth':[1,2,3,4],
     'clf__min_impurity_decrease':[0, 0.25, 0.50, 0.75]}
]

In [295]:
pipelines = []
best_clfs = []
for clf,param in zip(classifiers,params):
    pipe = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', clf)])
    gs = GridSearchCV(estimator=pipe,
                  param_grid=param,
                  scoring='f1_weighted',
                  cv=3)
    gs.fit(X_train, y_train)
    best_clfs.append(gs.best_estimator_)
    pipelines.append(pipe)  
    print(clf.__class__.__name__)
    print('Best params:', gs.best_params_)
    print('Train Accuracy:',gs.best_estimator_.score(X_train, y_train))
    print('Test Accuracy:',gs.best_estimator_.score(X_test, y_test))
    print('')

LogisticRegression
Best params: {'clf__C': 0.5, 'clf__class_weight': {'Yes': 2, 'No': 1}}
Train Accuracy: 0.9714467005076142
Test Accuracy: 0.9758883248730964

SVC
Best params: {'clf__C': 100, 'clf__degree': 3, 'clf__gamma': 0.01, 'clf__kernel': 'poly'}
Train Accuracy: 0.9739847715736041
Test Accuracy: 0.9720812182741116

DecisionTreeClassifier
Best params: {'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__min_impurity_decrease': 0}
Train Accuracy: 0.9803299492385786
Test Accuracy: 0.9720812182741116



In [296]:
best_clfs

[Pipeline(memory=None,
          steps=[('scl',
                  StandardScaler(copy=True, with_mean=True, with_std=True)),
                 ('pca',
                  PCA(copy=True, iterated_power='auto', n_components=2,
                      random_state=None, svd_solver='auto', tol=0.0,
                      whiten=False)),
                 ('clf',
                  LogisticRegression(C=0.5, class_weight={'No': 1, 'Yes': 2},
                                     dual=False, fit_intercept=True,
                                     intercept_scaling=1, l1_ratio=None,
                                     max_iter=100, multi_class='auto',
                                     n_jobs=None, penalty='l2', random_state=4,
                                     solver='lbfgs', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False), Pipeline(memory=None,
          steps=[('scl',
                  StandardScaler(copy=True, with_mean=True, with_std