In [None]:
# 4 Major Python Libraries for Web Crawling
# (1) Pandas - Parsing HTML Tables
# (2) Requests - Fetching HTML Pages
# (3) BeautifulSoup - Parsing HTML Code
# (4) Selenium - Automating Browser Activities

# All labs in these lessons are meant for demonstrating web crawling techniques only.
# Please research and make sure you understand in detail the ethics and best practices for web crawling.
# e.g. https://sunscrapers.com/blog/web-crawling-scraping-best-practices/

In [2]:
# Import the libraries (make sure they are installed in your environment first)
import pandas as pd

In [3]:
# Load a remote CSV directly into a DataFrame: pandas accepts a URL in place of a file path.
# NOTE(review): period1/period2 look like Unix timestamps bounding the date range - confirm.
nikkei_url = (
    "https://query1.finance.yahoo.com/v7/finance/download/%5EN225"
    "?period1=-157420800&period2=1636243200"
    "&interval=1d&events=history&includeAdjustedClose=true"
)
nikkei_df = pd.read_csv(filepath_or_buffer=nikkei_url)
nikkei_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1965-01-05,1257.719971,1257.719971,1257.719971,1257.719971,1257.719971,0.0
1,1965-01-06,1263.989990,1263.989990,1263.989990,1263.989990,1263.989990,0.0
2,1965-01-07,1274.270020,1274.270020,1274.270020,1274.270020,1274.270020,0.0
3,1965-01-08,1286.430054,1286.430054,1286.430054,1286.430054,1286.430054,0.0
4,1965-01-11,,,,,,
...,...,...,...,...,...,...,...
14600,2021-10-29,28819.160156,29000.650391,28475.060547,28892.689453,28892.689453,104500000.0
14601,2021-11-01,29330.679688,29666.830078,29267.630859,29647.080078,29647.080078,76700000.0
14602,2021-11-02,29462.400391,29599.570313,29457.179688,29520.900391,29520.900391,71100000.0
14603,2021-11-04,29859.740234,29880.810547,29718.779297,29794.369141,29794.369141,90400000.0


In [4]:
# Fetch the page and parse its HTML tables.
# pd.read_html returns a list of DataFrames, so `news_df` is a list, not a single frame.
news_url = "https://www.skysports.com/champions-league-table"
news_df = pd.read_html(io=news_url)
news_df

[   #               Team  Pl  W  D  L   F   A  GD  Pts  Last 6
 0  1      Bayern Munich   6  5  1  0  12   6   6   16     NaN
 1  2      FC Copenhagen   6  2  2  2   8   8   0    8     NaN
 2  3        Galatasaray   6  1  2  3  10  13  -3    5     NaN
 3  4  Manchester United   6  1  1  4  12  15  -3    4     NaN,
    #           Team  Pl  W  D  L   F   A  GD  Pts  Last 6
 0  1        Arsenal   6  4  1  1  16   4  12   13     NaN
 1  2  PSV Eindhoven   6  2  3  1   8  10  -2    9     NaN
 2  3        RC Lens   6  2  2  2   6  11  -5    8     NaN
 3  4        Sevilla   6  0  2  4   7  12  -5    2     NaN,
    #                Team  Pl  W  D  L   F   A  GD  Pts  Last 6
 0  1         Real Madrid   6  6  0  0  16   7   9   18     NaN
 1  2              Napoli   6  3  1  2  10   9   1   10     NaN
 2  3               Braga   6  1  1  4   6  12  -6    4     NaN
 3  4  1. FC Union Berlin   6  0  2  4   6  10  -4    2     NaN,
    #               Team  Pl  W  D  L  F   A  GD  Pts  Last 6
 0  1

In [5]:
# Count how many tables pd.read_html extracted from the page
len(news_df)

8

In [6]:
# Inspect the first table in the list returned by pd.read_html
news_df[0]

Unnamed: 0,#,Team,Pl,W,D,L,F,A,GD,Pts,Last 6
0,1,Bayern Munich,6,5,1,0,12,6,6,16,
1,2,FC Copenhagen,6,2,2,2,8,8,0,8,
2,3,Galatasaray,6,1,2,3,10,13,-3,5,
3,4,Manchester United,6,1,1,4,12,15,-3,4,


<h2>Technique - Data Cleansing</h2>

In [7]:
# Removing unnecessary columns - method 1: name the columns to discard.
# drop() returns a new DataFrame; the original table in news_df[0] is left untouched.
news_df_0_a = news_df[0].drop(columns=["#", "Last 6"])
news_df_0_a

Unnamed: 0,Team,Pl,W,D,L,F,A,GD,Pts
0,Bayern Munich,6,5,1,0,12,6,6,16
1,FC Copenhagen,6,2,2,2,8,8,0,8
2,Galatasaray,6,1,2,3,10,13,-3,5
3,Manchester United,6,1,1,4,12,15,-3,4


In [8]:
# Removing unnecessary columns - method 2: name the columns to keep.
# Selecting with a list of labels also fixes the column order of the result.
news_df_0_b = news_df[0].loc[
    :, ["Team", "Pl", "W", "D", "L", "F", "A", "GD", "Pts"]
]
news_df_0_b

Unnamed: 0,Team,Pl,W,D,L,F,A,GD,Pts
0,Bayern Munich,6,5,1,0,12,6,6,16
1,FC Copenhagen,6,2,2,2,8,8,0,8
2,Galatasaray,6,1,2,3,10,13,-3,5
3,Manchester United,6,1,1,4,12,15,-3,4
