# Import Statements 

In [189]:
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup 
import requests
from IPython.display import Image
from IPython.core.display import HTML 

In [190]:
# Download the Wikipedia page and parse it so we can search through its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_world_records_in_swimming'
page = requests.get(url, timeout=30)  # a timeout keeps the cell from hanging on a stalled connection
page.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
# Name the parser explicitly: "html" makes BeautifulSoup pick whichever HTML parser
# happens to be installed, so results can differ between machines (and it warns about it).
html_text = BeautifulSoup(page.text, "html.parser")  # parsed HTML of the page

# This is a title haha. Remember to make this a markdown cell and use '#'

## This is a subtitle. Use '##'

### This is a subsubtitle. Use '###'

You can also use "<.br>" to add <br> line breaks (without the period, of course).

# Getting all the HTML text for our specific table. 

Both of the methods below accomplish the same thing. <br>
Recall that __find( )__ returns the first instance, and __find_all( )__ returns a list of all instances.

In [191]:
# find() returns the first matching tag; find_all() returns a list of every match,
# so we take element [0] to get that same first table.
# (To target the table by its CSS class instead, pass class_="wikitable sortable".)
table = html_text.find("table") #or
table = html_text.find_all("table")[0]

## Getting all the titles of our columns

In [192]:
# Every header cell (<th>) inside the table. These are still tag objects,
# so the output below shows the raw HTML around each title.
col_titles = table.find_all("th")
col_titles

[<th>Event
 </th>,
 <th class="unsortable" style="width:4em">Time
 </th>,
 <th class="unsortable">
 </th>,
 <th>Name</th>,
 <th>Nationality</th>,
 <th>Date</th>,
 <th>Meet</th>,
 <th>Location
 </th>,
 <th class="unsortable" style="width:2em">Ref
 </th>]

### Strips away the surrounding HTML syntax, leaving each column title as a plain string

In [193]:
# Pull the text out of each <th> tag and trim the whitespace/newlines around it.
header_cells = table.find_all("th")
col_titles = [header_cell.text.strip() for header_cell in header_cells]
col_titles

['Event', 'Time', '', 'Name', 'Nationality', 'Date', 'Meet', 'Location', 'Ref']

## Adds all our column titles to the dataframe

In [194]:
# Start with an empty data frame whose columns are the scraped titles;
# the rows get filled in further down.
swimming_world_record_data_frame = pd.DataFrame(columns = col_titles)
swimming_world_record_data_frame 

Unnamed: 0,Event,Time,Unnamed: 3,Name,Nationality,Date,Meet,Location,Ref


### In the wikipedia table, each row is divided by its "<.tr>" tag. Thus, by finding all the "<.tr>" tags we get all the HTML for each row 

In [195]:
# Every <tr> (table row) tag inside the table, in document order.
all_row_values = table.find_all("tr")
# Peek at the row at index 1 to see what the raw HTML of one row looks like.
all_row_values[1]

<tr>
<td><span data-sort-value="01  !"> <a href="/wiki/World_record_progression_50_metres_freestyle" title="World record progression 50 metres freestyle">50m freestyle</a> </span>
</td>
<td style="text-align:right; padding-left:0.5em; padding-right:0.5em;">20.91
</td>
<td style="font-size:smaller">ss</td>
<td><span class="nowrap"><span data-sort-value="Cielo, César"><span class="vcard"><span class="fn"><a href="/wiki/C%C3%A9sar_Cielo" title="César Cielo">César Cielo</a></span></span></span></span>
</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="504" data-file-width="720" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/22px-Flag_of_Brazil.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/33px-Flag_of_Brazil.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/0/05/Flag_of_Brazil.svg/43px-Flag_of_Brazil.svg.png

### We use a list comprehension to get the text within each row, treating each <.td> cell as one item in the list

The first row (index 0) has no data cells, so its list would be empty; that's why we start from index 1 onward.

We take the (cleaned) text of every cell in a row and add it as a new row at the end of our data frame.

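Recall that __.loc[ ]__ is used to select rows or columns of the dataframe by label (note the square brackets). With a single label it selects a row, and assigning to a label that does not exist yet adds a new row.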

In [196]:
# Collect the cleaned text of every <td> cell, one list per table row.
# The rows are gathered first and the data frame is built once at the end:
# adding rows with .loc[len(df)] inside the loop grows the frame one row at a
# time (slow) and appends duplicate rows whenever this cell is re-run.
scraped_rows = []
for specific_row in all_row_values[1:]:
    individual_row_data = [col_in_row.text.strip() for col_in_row in specific_row.find_all('td')]
    if individual_row_data:  # skip rows that contain no <td> cells at all
        scraped_rows.append(individual_row_data)

# One row per scraped list, using the column titles collected earlier.
swimming_world_record_data_frame = pd.DataFrame(scraped_rows, columns=col_titles)

In [197]:
# Display the filled-in data frame.
swimming_world_record_data_frame

Unnamed: 0,Event,Time,Unnamed: 3,Name,Nationality,Date,Meet,Location,Ref
0,50m freestyle,20.91,ss,César Cielo,Brazil,18 December 2009,Brazilian Championships,"São Paulo, Brazil",[9][10][11][12]
1,100m freestyle,46.86,,David Popovici,Romania,13 August 2022,European Championships,"Rome, Italy",[13][14]
2,200m freestyle,1:42.00,ss,Paul Biedermann,Germany,28 July 2009,World Championships,"Rome, Italy",[15][16][17]
3,400m freestyle,3:40.07,ss,Paul Biedermann,Germany,26 July 2009,World Championships,"Rome, Italy",[18][19][20]
4,800m freestyle,7:32.12,ss,Zhang Lin,China,29 July 2009,World Championships,"Rome, Italy",[21][22]
5,1500m freestyle,14:31.02,,Sun Yang,China,4 August 2012,Olympic Games,"London, United Kingdom",[23][24]
6,50m backstroke,23.71,,Hunter Armstrong,United States,28 April 2022,U.S. International Team Trials,"Greensboro, United States",[25][26]
7,50m backstroke,23.55,"sf, #",Kliment Kolesnikov,Russia,27 July 2023,Russian Cup,"Kazan, Russia",[27]
8,100m backstroke,51.60,,Thomas Ceccon,Italy,20 June 2022,World Championships,"Budapest, Hungary",[29][30]
9,200m backstroke,1:51.92,ss,Aaron Peirsol,United States,31 July 2009,World Championships,"Rome, Italy",[31][32][33]


## Cleaning our data a bit more 

Dropping the column with 'ss' and the references. We can get rid of these by using the __.drop( )__ method in pandas. Then we specify the columns we want to get rid of. Remember that axis = 1 indicates columns, not rows

In [198]:
# Drop the unnamed marker column (the one holding values like 'ss') and the
# 'Ref' column by name rather than by position. With errors="ignore" this cell
# can be re-run safely: a second run simply finds nothing left to drop, whereas
# the positional indices [2, 8] would raise an IndexError on the smaller frame.
swimming_world_record_data_frame = (
    swimming_world_record_data_frame
    .drop(["", "Ref"], axis=1, errors="ignore")
    .rename(columns={"Date": "Date Set"})
)
swimming_world_record_data_frame

Unnamed: 0,Event,Time,Name,Nationality,Date Set,Meet,Location
0,50m freestyle,20.91,César Cielo,Brazil,18 December 2009,Brazilian Championships,"São Paulo, Brazil"
1,100m freestyle,46.86,David Popovici,Romania,13 August 2022,European Championships,"Rome, Italy"
2,200m freestyle,1:42.00,Paul Biedermann,Germany,28 July 2009,World Championships,"Rome, Italy"
3,400m freestyle,3:40.07,Paul Biedermann,Germany,26 July 2009,World Championships,"Rome, Italy"
4,800m freestyle,7:32.12,Zhang Lin,China,29 July 2009,World Championships,"Rome, Italy"
5,1500m freestyle,14:31.02,Sun Yang,China,4 August 2012,Olympic Games,"London, United Kingdom"
6,50m backstroke,23.71,Hunter Armstrong,United States,28 April 2022,U.S. International Team Trials,"Greensboro, United States"
7,50m backstroke,23.55,Kliment Kolesnikov,Russia,27 July 2023,Russian Cup,"Kazan, Russia"
8,100m backstroke,51.60,Thomas Ceccon,Italy,20 June 2022,World Championships,"Budapest, Hungary"
9,200m backstroke,1:51.92,Aaron Peirsol,United States,31 July 2009,World Championships,"Rome, Italy"


### Yay!!!! We have successfully webscraped a table from Wikipedia and turned it into a dataframe!

I got the idea from Alex the Analyst on YouTube. Here's the [link!](https://www.youtube.com/watch?v=8dTpNajxaH0&list=LL&index=1&t) to his video :)