Web Scraping using Python (and Beautiful Soup)

Web Scraping using Beautiful Soup

In a Jupyter Notebook, start by importing the necessary modules (pandas, numpy, matplotlib.pyplot, seaborn). If you don't have Jupyter Notebook installed, I recommend installing it via the Anaconda Python distribution, which is freely available online. To display the plots inline in the notebook, include the line %matplotlib inline, as shown below.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


In [None]:
# Address of the results page that the rest of the notebook scrapes.
url = "http://www.hubertiming.com/results/2017GPTR10K"

# Download the page inside a context manager so the HTTP connection is closed
# as soon as the body has been read. `html` holds the raw bytes rather than a
# one-shot response stream, so the parsing cell can be re-run without getting
# an empty document from an already-consumed response.
with urlopen(url) as response:
    html = response.read()


In [None]:
# Parse the downloaded page with the lxml parser. The bare expression on the
# last line makes the notebook display the type of the parsed object.
soup = BeautifulSoup(markup=html, features='lxml')
type(soup)


In [None]:
# Expected result of `type(soup)` from the previous cell, kept as a comment:
# only `BeautifulSoup` is imported from bs4, so evaluating the bare module
# name `bs4` here would raise NameError.
#     bs4.BeautifulSoup


In [None]:
# Look up the page's <title> element and print it.
title = soup.find('title')
print(title)


In [None]:
# Collect the text content of the whole page. It is stored in `text` but not
# printed, because the output is long.
text = soup.text


In [None]:
# Display every <a> (hyperlink) tag found in the page.
soup.find_all(name='a')


In [None]:
# Print the href attribute of each <a> tag, one per line. Tag.get() yields
# None for anchors that have no href, so those print as "None".
all_links = soup.find_all(name="a")
for link in all_links:
    href = link.get("href")
    print(href)


In [None]:
# Collect every table row (<tr>) of the page, then print the first ten as a
# sanity check.
rows = soup.find_all(name='tr')
first_rows = rows[:10]
print(first_rows)


In [None]:
# Cells (<td>) of the final table row. Only the last row's cells are used here
# and by the following cell, so index that row directly instead of parsing
# every row and discarding all but the last result. Fall back to an empty list
# when the page has no <tr> at all, so row_td is always defined.
row_td = rows[-1].find_all('td') if rows else []
print(row_td)
type(row_td)


In [None]:
# Strip the HTML tags from the cells by turning them into a string, parsing
# that string again, and extracting the plain text.
str_cells = str(row_td)
cell_soup = BeautifulSoup(str_cells, "lxml")
cleantext = cell_soup.get_text()
print(cleantext)


In [None]:
import re

# Non-greedy match for anything that looks like an HTML tag. Compiled once,
# outside the loop, because the pattern never changes between rows.
clean = re.compile(r'<.*?>')

# Build one tag-free string per table row: the row's <td> cells are turned
# into their list representation and every tag is removed from it.
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean2 = clean.sub('', str_cells)
    list_rows.append(clean2)

# Show the last cleaned row and its type as a quick check.
print(clean2)
type(clean2)


In [None]:
# Load the cleaned row strings into a single-column DataFrame and preview it.
df = pd.DataFrame(data=list_rows)
df.head(n=10)


In [None]:
# Break each row string apart at the commas, one resulting column per field.
raw_rows = df[0]
df1 = raw_rows.str.split(pat=',', expand=True)
df1.head(n=10)


In [None]:
# Remove the '[' left over from the list representation in the first column.
df1[0] = df1[0].str.strip(to_strip='[')
df1.head(n=10)


In [None]:
# Table header cells (<th>), used below to build the column names.
col_labels = soup.find_all(name='th')


In [None]:
# Turn the header cells into one tag-free string and keep it as the single
# element of all_header.
col_str = str(col_labels)
header_soup = BeautifulSoup(col_str, "lxml")
cleantext2 = header_soup.get_text()
all_header = [cleantext2]
print(all_header)


In [None]:
# One-row, one-column DataFrame holding the header string.
df2 = pd.DataFrame(data=all_header)
df2.head()


In [None]:
# Split the header string at the commas so each label gets its own column.
header_text = df2[0]
df3 = header_text.str.split(pat=',', expand=True)
df3.head()


In [None]:
# Stack the header row (df3) on top of the data rows (df1).
frames = [df3, df1]
df4 = pd.concat(objs=frames)
df4.head(n=10)


In [None]:
# Use the values of the first row (the header labels) as the column names.
header_row = df4.iloc[0]
df5 = df4.rename(columns=header_row)
df5.head()


In [None]:
# Print a summary of the frame, then display its shape as the cell's result.
df5.info()
df5.shape


In [None]:
# Discard every row that contains at least one missing value. dropna()'s
# defaults are row-wise with "any", which is what is wanted here.
df6 = df5.dropna()


In [None]:
# Drop the first row, which repeats the header labels now used as the column
# names. Slice by position rather than by label: the concatenated frame was
# built without resetting the index, so the first row's label is not
# guaranteed to be unique and a label-based drop would remove every row that
# carries it. copy() keeps df7 an independent frame for the in-place edits
# made in the following cells.
df7 = df6.iloc[1:].copy()
df7.head()


In [None]:
# Tidy the first and last column names, which still carry the '[' and ']'
# (and a leading space) from the list representation of the header.
column_fixes = {'[Place': 'Place', ' Team]': 'Team'}
df7.rename(columns=column_fixes, inplace=True)
df7.head()


In [None]:
# Remove the closing ']' that the list representation left in the Team values.
team = df7['Team']
df7['Team'] = team.str.strip(to_strip=']')
df7.head()


In [None]:
time_list = df7[' Chip Time'].tolist()


def _chip_time_to_minutes(chip_time):
    """Return a chip-time string expressed in minutes.

    Accepts both 'h:mm:ss' and 'mm:ss' (hours omitted) and tolerates
    surrounding whitespace.

    Raises:
        ValueError: if the string does not consist of two or three
            ':'-separated integer fields.
    """
    fields = [int(field) for field in chip_time.strip().split(':')]
    if len(fields) == 2:
        # No hours component, e.g. '36:21': treat it as zero hours.
        fields.insert(0, 0)
    if len(fields) != 3:
        raise ValueError('unrecognised chip time: {!r}'.format(chip_time))
    hours, minutes, seconds = fields
    return (hours * 3600 + minutes * 60 + seconds) / 60


# Convert every 'Chip Time' entry to minutes.
time_mins = [_chip_time_to_minutes(chip_time) for chip_time in time_list]


In [None]:
# Attach the converted times (minutes) as a new column, Runner_mins.
df7['Runner_mins'] = time_mins
df7.head()


In [None]:
# Summary statistics, restricted to the numeric columns of df7.
df7.describe(include=[np.number])


In [None]:
# Default figure size (width, height) for the plots drawn below.
# rcParams is imported from matplotlib itself: it is the same settings object
# that pylab re-exports, and the pylab interface is discouraged by the
# matplotlib documentation.
from matplotlib import rcParams
rcParams['figure.figsize'] = 15, 5


In [None]:
# Box plot of the run times in minutes, with horizontal grid lines, a y-axis
# label, and the single box labelled 'Runners' on the x-axis.
df7.boxplot(column='Runner_mins')
plt.grid(True, axis='y')
plt.ylabel('Chip Time')
plt.xticks([1], ['Runners'])


In [None]:
# Distribution of the run times in minutes: a 25-bin histogram with black bar
# edges plus a KDE curve, no rug, drawn with color 'm'.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot — confirm the installed version before migrating.
x = df7['Runner_mins']
ax = sns.distplot(x, hist=True, kde=True, rug=False, color='m', bins=25, hist_kws={'edgecolor':'black'})
plt.show()


In [None]:
# Split the run times (minutes) by gender. The ' Gender' label and the ' F' /
# ' M' values are written with a leading space, exactly as they are looked up
# in df7.
is_female = df7[' Gender'] == ' F'
is_male = df7[' Gender'] == ' M'
f_fuko = df7.loc[is_female, 'Runner_mins']
m_fuko = df7.loc[is_male, 'Runner_mins']

# Female runners: histogram plus KDE; male runners: KDE only (hist=False).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/kdeplot — confirm the installed version before migrating.
sns.distplot(
    f_fuko,
    hist=True,
    kde=True,
    rug=False,
    hist_kws={'edgecolor': 'black'},
    label='Female',
)
sns.distplot(
    m_fuko,
    hist=False,
    kde=True,
    rug=False,
    hist_kws={'edgecolor': 'black'},
    label='Male',
)
plt.legend()


In [None]:
# Per-gender summary statistics. The group key stays in the index, which is
# groupby's default behaviour.
by_gender = df7.groupby(" Gender")
g_stats = by_gender.describe()
print(g_stats)


In [None]:
# Side-by-side box plots of the run times in minutes, one box per gender.
df7.boxplot(column='Runner_mins', by=' Gender')
plt.ylabel('Chip Time')
# Blank out the figure-level title (presumably the automatic one that boxplot
# adds when `by` is used — confirm).
plt.suptitle("")
