# Scraping Data from a Website using Python/Pandas

- web scraper using python and pandas, took data from Wikipedia page
- Data: NBA's all time points list 
- url: https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_career_scoring_leaders


In [75]:
#use beautiful soup to scrape 
from bs4 import BeautifulSoup 
import requests

In [76]:
#url for website 
url = 'https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_career_scoring_leaders'

page = requests.get(url)

soup = BeautifulSoup(page.text, 'html')

In [77]:
#list all the tables in the webpage with class as wikitable sortable
soup.find('table', class_ = 'wikitable sortable')





In [74]:
#find appropriate table from the url, table I want is second in the list
table = soup.find_all('table')[1]


In [71]:
#pull out all the headers of the table 
world_titles = table.find_all('th')
print(world_titles)

[<th>Rank
</th>, <th>Player
</th>, <th><a class="mw-redirect" href="/wiki/Basketball_position" title="Basketball position">Pos</a>
</th>, <th class="unsortable">Team(s) played for (years)<sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[b]</a></sup>
</th>, <th>Total <a href="/wiki/Point_(basketball)" title="Point (basketball)">points</a>
</th>, <th>Games <br/> played
</th>, <th data-sort-type="number"><a href="/wiki/Points_per_game" title="Points per game">Points <br/> per game</a><sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[c]</a></sup>
</th>, <th><a href="/wiki/Field_goal_(basketball)" title="Field goal (basketball)">Field goals</a><br/>made
</th>, <th><a href="/wiki/Three-point_field_goal" title="Three-point field goal">Three-point<br/>field goals</a><br/>made<sup class="reference" id="cite_ref-7"><a href="#cite_note-7">[d]</a></sup>
</th>, <th><a href="/wiki/Free_throw" title="Free throw">Free <br/> throws</a><br/> made
</th>]


In [37]:
#get the text of the headers 
world_table_titles = [title.text.strip() for title in world_titles]

print(world_table_titles)

['Rank', 'Player', 'Pos', 'Team(s) played for (years)[b]', 'Total points', 'Games  played', 'Points  per game[c]', 'Field goalsmade', 'Three-pointfield goalsmade[d]', 'Free  throws made']


In [39]:
#import pandas to make dataframe
import pandas as pd

In [40]:
df = pd.DataFrame(columns = world_table_titles)

df

Unnamed: 0,Rank,Player,Pos,Team(s) played for (years)[b],Total points,Games played,Points per game[c],Field goalsmade,Three-pointfield goalsmade[d],Free throws made


In [43]:
col_data = table.find_all('tr')

In [48]:
for row in col_data[1:]: 
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    
    length = len(df)
    df.loc[length] = individual_row_data
    


In [49]:
df

Unnamed: 0,Rank,Player,Pos,Team(s) played for (years)[b],Total points,Games played,Points per game[c],Field goalsmade,Three-pointfield goalsmade[d],Free throws made
0,1,LeBron James^,SF,"Cleveland Cavaliers (2003–2010, 2014–2018)Miam...",38652,1421,27.2,14152,2261,8087
1,2,Kareem Abdul-Jabbar*,C,Milwaukee Bucks (1969–1975)Los Angeles Lakers ...,38387,1560,24.6,15837,1,6712
2,3,Karl Malone*,PF,Utah Jazz (1985–2003)Los Angeles Lakers (2003–...,36928,1476,25.0,13528,85,9787
3,4,Kobe Bryant*,SG,Los Angeles Lakers (1996–2016),33643,1346,25.0,11719,1827,8378
4,5,Michael Jordan*,SG,"Chicago Bulls (1984–1993, 1995–1998)Washington...",32292,1072,30.1,12192,581,7327
...,...,...,...,...,...,...,...,...,...,...
95,46,LaMarcus Aldridge†,PF/C,Portland Trail Blazers (2006–2015)San Antonio ...,20558,1076,19.1,8311,227,3709
96,47,Mitch Richmond*,SG,Golden State Warriors (1988–1991)Sacramento Ki...,20497,976,21.0,7305,1326,4561
97,48,Joe Johnson†,SG/SF,"Boston Celtics (2001–2002, 2021–2022)Phoenix S...",20407,1277,16.0,7823,1978,2783
98,49,Tom Chambers,PF,San Diego Clippers (1981–1983)Seattle SuperSon...,20049,1107,18.1,7378,227,5066


In [54]:
#rename columns 
df = df.rename(columns= {'Team(s) played for (years)[b]':'Team(s) played for (years)', 
                         'Points per game[c]': 'Points per game', 
                         'Three-pointfield goalsmade[d]': 'Three-pointers Made', 
                         'Field goalsmade':'Field goals Made'})
df

Unnamed: 0,Rank,Player,Pos,Team(s) played for (years),Total points,Games played,Points per game[c],Field goals Made,Three-pointers Made,Free throws made
0,1,LeBron James^,SF,"Cleveland Cavaliers (2003–2010, 2014–2018)Miam...",38652,1421,27.2,14152,2261,8087
1,2,Kareem Abdul-Jabbar*,C,Milwaukee Bucks (1969–1975)Los Angeles Lakers ...,38387,1560,24.6,15837,1,6712
2,3,Karl Malone*,PF,Utah Jazz (1985–2003)Los Angeles Lakers (2003–...,36928,1476,25.0,13528,85,9787
3,4,Kobe Bryant*,SG,Los Angeles Lakers (1996–2016),33643,1346,25.0,11719,1827,8378
4,5,Michael Jordan*,SG,"Chicago Bulls (1984–1993, 1995–1998)Washington...",32292,1072,30.1,12192,581,7327
...,...,...,...,...,...,...,...,...,...,...
95,46,LaMarcus Aldridge†,PF/C,Portland Trail Blazers (2006–2015)San Antonio ...,20558,1076,19.1,8311,227,3709
96,47,Mitch Richmond*,SG,Golden State Warriors (1988–1991)Sacramento Ki...,20497,976,21.0,7305,1326,4561
97,48,Joe Johnson†,SG/SF,"Boston Celtics (2001–2002, 2021–2022)Phoenix S...",20407,1277,16.0,7823,1978,2783
98,49,Tom Chambers,PF,San Diego Clippers (1981–1983)Seattle SuperSon...,20049,1107,18.1,7378,227,5066


In [65]:
#export into csv file 
df.to_csv('TopNBAPlayerPoints.csv', index = False)