## Data Collection

In [None]:
# Installations:
!pip install tabula-py

# Imports:
import json
import os
from getpass import getpass

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabula.io import read_pdf

# Google Drive mount authentication
# Mounts Google Drive at /content/drive; the CSV export at the end of this
# notebook writes to a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# 1. Fetch Data From API

## 1.1 Exchange Rate Demo

In [34]:
# Never commit API keys to a notebook: read the key from the environment and
# fall back to an interactive prompt when the variable is not set.
API_KEY = os.environ.get("EXCHANGERATESAPI_KEY") or getpass("exchangeratesapi.io access key: ")
URL = "http://api.exchangeratesapi.io/v1/latest"

# Pass the key through `params` so requests builds and encodes the query string.
# requests has no default timeout, so set one to keep a stalled connection from
# hanging the cell.
response = requests.get(URL, params={"access_key": API_KEY}, timeout=30)
print(response)
# Stop here with a clear HTTPError on a 4xx/5xx response rather than failing
# later on an unexpected payload.
response.raise_for_status()
text = response.text
json_data = response.json()

<Response [200]>


In [35]:
# Show the decoded payload, then its top-level keys, one per line.
print(json_data, json_data.keys(), sep="\n")

{'success': True, 'timestamp': 1624056544, 'base': 'EUR', 'date': '2021-06-18', 'rates': {'AED': 4.357587, 'AFN': 93.133209, 'ALL': 122.135209, 'AMD': 609.464054, 'ANG': 2.129861, 'AOA': 761.637124, 'ARS': 113.137592, 'AUD': 1.586454, 'AWG': 2.136023, 'AZN': 2.021505, 'BAM': 1.947581, 'BBD': 2.395752, 'BDT': 100.559612, 'BGN': 1.958052, 'BHD': 0.447168, 'BIF': 2351.345662, 'BMD': 1.18635, 'BND': 1.591645, 'BOB': 8.193196, 'BRL': 6.0387, 'BSD': 1.186574, 'BTC': 3.3447718e-05, 'BTN': 87.685054, 'BWP': 12.84837, 'BYN': 2.982567, 'BYR': 23252.459629, 'BZD': 2.391769, 'CAD': 1.479165, 'CDF': 2353.718792, 'CHF': 1.095061, 'CLF': 0.032186, 'CLP': 888.106237, 'CNY': 7.65564, 'COP': 4471.353079, 'CRC': 732.706477, 'CUC': 1.18635, 'CUP': 31.438274, 'CVE': 109.945033, 'CZK': 25.581313, 'DJF': 210.838586, 'DKK': 7.436522, 'DOP': 67.717323, 'DZD': 159.362859, 'EGP': 18.56267, 'ERN': 17.797621, 'ETB': 51.373553, 'EUR': 1, 'FJD': 2.44986, 'FKP': 0.840936, 'GBP': 0.858958, 'GEL': 3.754845, 'GGP': 0.84

In [5]:
# The "rates" mapping (currency code -> rate) becomes a single "rate" column
# indexed by currency code.
rates = json_data["rates"]
df = pd.DataFrame({"rate": rates})
df.head()

Unnamed: 0,rate
AED,4.405028
AFN,95.691111
ALL,122.959976
AMD,625.041028
ANG,2.175598


In [6]:
# Move the currency codes out of the index into a regular column named "index".
df = df.reset_index()
df.head()

Unnamed: 0,index,rate
0,AED,4.405028
1,AFN,95.691111
2,ALL,122.959976
3,AMD,625.041028
4,ANG,2.175598


In [7]:
# Give the former index column a meaningful name.
df = df.rename({"index": "currency"}, axis="columns")
df.head()

Unnamed: 0,currency,rate
0,AED,4.405028
1,AFN,95.691111
2,ALL,122.959976
3,AMD,625.041028
4,ANG,2.175598


In [8]:
# Attach the payload-level metadata to every row (scalars are broadcast).
df = df.assign(
    base_currency=json_data["base"],
    date_accessed=json_data["date"],
)
df.head()

Unnamed: 0,currency,rate,base_currency,date_accessed
0,AED,4.405028,EUR,2021-06-17
1,AFN,95.691111,EUR,2021-06-17
2,ALL,122.959976,EUR,2021-06-17
3,AMD,625.041028,EUR,2021-06-17
4,ANG,2.175598,EUR,2021-06-17


## 1.2 Exercise: How Many People Are In Space?

Let's use an open API so we do not need to worry about authentication. Check how many people are currently in space using this open API. Be sure to reshape
the resulting JSON into a proper DataFrame. For the URL, please use:
`"http://api.open-notify.org/astros.json"`

In [4]:
# YOUR CODE GOES HERE

In [21]:
URL = "http://api.open-notify.org/astros.json"
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(URL, timeout=30)
print(response)
# Raise an HTTPError on a 4xx/5xx response instead of parsing an error body later.
response.raise_for_status()

<Response [200]>


In [22]:
# Decode the response body into a Python dict; keep the raw body in `text`.
json_data = json.loads(response.text)
text = response.text

In [12]:
# Each entry of the "people" list becomes one row of the frame.
people = json_data["people"]
df = pd.DataFrame(people)

In [14]:
# Preview the first rows of the people frame.
df.head()

Unnamed: 0,name,craft
0,Mark Vande Hei,ISS
1,Oleg Novitskiy,ISS
2,Pyotr Dubrov,ISS
3,Thomas Pesquet,ISS
4,Megan McArthur,ISS


# 2. Fetch Data By Parsing HTML

## 2.1 Video Game Music Demo

We will parse a webpage full of links. This would be of interest if we were harvesting data stored in links from a web-based source. Here is some important background information on HTML:
- `<a>` is known as an anchor element in HTML; we usually search for these elements when parsing for links. (`https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a`)
- `href` is an attribute of `<a>` which creates the hyperlink

In [9]:
URL = 'https://www.vgmusic.com/music/console/nintendo/nes/'
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(URL, timeout=30)
# Raise an HTTPError on a 4xx/5xx response instead of parsing an error page.
response.raise_for_status()
text = response.text
# Parse the page with Python's built-in HTML parser.
soup = BeautifulSoup(text, 'html.parser')

In [10]:
# Look at the first ten anchor elements to see what the links look like.
anchors = soup.find_all("a")
anchors[:10]

[<a href="http://www.vgmusic.com/information/donate.php">Please contribute today</a>,
 <a href="/"><img alt="You are surfing the Videogame Music Archive" border="0" height="60" src="/images/mikey57a.gif" width="468"/></a>,
 <a name="10Yard_Fight">10-Yard Fight</a>,
 <a href="10-Yard_Fight-Kick_Off.mid">Kick Off</a>,
 <a href="/file/debcd7c61535f6aba8d4b88d8d0182db.html#disqus_thread">Comments</a>,
 <a name="1943">1943</a>,
 <a href="1943.mid">"Raid and Pacific Attack" Title Screen Song</a>,
 <a href="/file/c6d8c1b732822f614e3b5892b703c58f.html#disqus_thread">Comments</a>,
 <a href="1943sab.mid">Assault on Surface Forces B</a>,
 <a href="/file/07f297682c9731ad956da14180f2aa65.html#disqus_thread">Comments</a>]

In [11]:
# Total number of anchor elements on the page.
anchor_tags = soup.find_all("a")
len(anchor_tags)

8851

In [12]:
# Collect the `href` attribute of every anchor; `.get` returns None for
# anchors that have no href (e.g. `<a name=...>`).
anchors = soup.find_all("a")
href_list = [anchor.get("href") for anchor in anchors]

# The same thing written as an explicit loop instead of a list comprehension:
# href_list = []
# for anchor in anchors:
#   href_list.append(anchor.get("href"))

In [13]:
# One row per link; the scalar page URL is repeated on every row.
link_columns = {"song_list": href_list, "base_url": URL}
df = pd.DataFrame(link_columns)

In [14]:
# Keep only links that contain the literal text ".mid".
# - regex=False: by default `str.contains` treats the pattern as a regular
#   expression, where "." matches any character (so "amid" would match too).
# - na=False: rows whose href is missing are dropped instead of producing NaN.
# - .copy(): make an independent frame so adding columns later does not act on
#   a slice of the unfiltered frame.
df = df[df["song_list"].str.contains(".mid", regex=False, na=False)].copy()

In [15]:
# Number of links left after filtering.
len(df["song_list"])

4204

In [16]:
# Build the absolute URL of each file by prefixing the page URL.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning pandas
# raises when a column is added to a frame obtained by filtering another one.
df = df.assign(full_path=df["base_url"] + df["song_list"])
df.head()

Unnamed: 0,song_list,base_url,full_path
3,10-Yard_Fight-Kick_Off.mid,https://www.vgmusic.com/music/console/nintendo...,https://www.vgmusic.com/music/console/nintendo...
6,1943.mid,https://www.vgmusic.com/music/console/nintendo...,https://www.vgmusic.com/music/console/nintendo...
8,1943sab.mid,https://www.vgmusic.com/music/console/nintendo...,https://www.vgmusic.com/music/console/nintendo...
10,1943-lev1.mid,https://www.vgmusic.com/music/console/nintendo...,https://www.vgmusic.com/music/console/nintendo...
12,43pbos1.mid,https://www.vgmusic.com/music/console/nintendo...,https://www.vgmusic.com/music/console/nintendo...


## 2.2 Exercise: Google Scholar Link Extraction

Let's practice link extraction from html. For the following google scholar link,
extract every link that leads to an academic article. Be sure to put the results into a nice list or DataFrame showing just the necessary links.

Please use the following google scholar link: `"https://scholar.google.ca/scholar?hl=en&as_sdt=0%2C5&q=astroids&btnG="`

In [None]:
# YOUR CODE GOES HERE

In [24]:
URL = "https://scholar.google.ca/scholar?hl=en&as_sdt=0%2C5&q=astroids&btnG="
# requests has no default timeout; set one so a stalled connection cannot hang the cell.
response = requests.get(URL, timeout=30)
# Raise an HTTPError on a 4xx/5xx response instead of parsing an error page.
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, 'html.parser')

In [28]:
# Collect the `href` attribute of every anchor on the results page.
anchors = soup.find_all("a")
href_list = [anchor.get("href") for anchor in anchors]

In [32]:
# Keep links that contain "https" and do not mention "google".
# `href_list` holds None for anchors without an href; skip those first, because
# `"https" in None` raises TypeError.
filtered_list = [
    link
    for link in href_list
    if link is not None and "https" in link and "google" not in link
]

In [33]:
# Display the links that passed the filter.
filtered_list

['https://ui.adsabs.harvard.edu/abs/1989aste.conf..921B/abstract',
 'https://arxiv.org/pdf/1203.4336',
 'https://www.sciencedirect.com/science/article/pii/S0032063312000773',
 'https://ui.adsabs.harvard.edu/abs/1989aste.conf.....B/abstract',
 'https://iopscience.iop.org/article/10.1088/0004-6256/143/3/66/pdf',
 'https://iopscience.iop.org/article/10.1088/0004-6256/143/3/66/meta',
 'https://ui.adsabs.harvard.edu/abs/1989aste.conf..524B/abstract',
 'https://aip.scitation.org/doi/abs/10.1063/1.3636041',
 'https://agupubs.onlinelibrary.wiley.com/doi/abs/10.1029/JB074i010p02531',
 'https://arxiv.org/pdf/0907.2512',
 'https://www.sciencedirect.com/science/article/pii/S0019103509003029',
 'https://www.sciencedirect.com/science/article/pii/0019103589900158']

# 3. Fetch Data From PDF

In [18]:
URL = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
# Extract tables from every page of the PDF; the result is indexed as a list
# of tables in the next cell.
df_list = read_pdf(
    URL,
    pages="all",
    output_format="dataframe",
)

In [19]:
# Work with the first extracted table. `df` is the same object as df_list[0],
# so columns added to `df` later also appear in df_list[0].
df = df_list[0]
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Percent Fuel Savings,Unnamed: 5
0,Cycle,KI,Distance,,,
1,Name,(1/km),(mi),Improved,Decreased Eliminate,Decreased
2,,,,Speed,Accel Stops,Idle
3,2012_2,3.30,1.3,5.9%,9.5% 29.2%,17.4%
4,2145_1,0.68,11.2,2.4%,0.1% 9.5%,2.7%


In [20]:
# Inspect the column labels of the extracted table before renaming them.
df.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3',
       'Percent Fuel Savings', 'Unnamed: 5'],
      dtype='object')

In [21]:
# The "Percent Fuel Savings" column holds two space-separated values per row;
# split once and pull each piece into its own column.
fuel_parts = df["Percent Fuel Savings"].str.split(" ")
df["decrease_accel"] = fuel_parts.str[0]
df["eliminate_stops"] = fuel_parts.str[1]
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Percent Fuel Savings,Unnamed: 5,decrease_accel,eliminate_stops
0,Cycle,KI,Distance,,,,,
1,Name,(1/km),(mi),Improved,Decreased Eliminate,Decreased,Decreased,Eliminate
2,,,,Speed,Accel Stops,Idle,Accel,Stops
3,2012_2,3.30,1.3,5.9%,9.5% 29.2%,17.4%,9.5%,29.2%
4,2145_1,0.68,11.2,2.4%,0.1% 9.5%,2.7%,0.1%,9.5%


In [22]:
# Descriptive names for the unlabeled columns.
column_names = {
    "Unnamed: 0": "cycle_name",
    "Unnamed: 1": "KI",
    "Unnamed: 2": "distance",
    "Unnamed: 3": "improved_speed",
    "Unnamed: 5": "decrease_idle",
}
# Rows 0-2 contain header text that was read as data; keep rows 3-6 only.
df = df[3:7].rename(columns=column_names)

In [23]:
# Keep only the cleaned columns, in presentation order (this drops the original
# merged "Percent Fuel Savings" column).
ordered_columns = [
    "cycle_name",
    "KI",
    "distance",
    "improved_speed",
    "decrease_accel",
    "eliminate_stops",
    "decrease_idle",
]
df = df[ordered_columns]
df.head()

Unnamed: 0,cycle_name,KI,distance,improved_speed,decrease_accel,eliminate_stops,decrease_idle
3,2012_2,3.3,1.3,5.9%,9.5%,29.2%,17.4%
4,2145_1,0.68,11.2,2.4%,0.1%,9.5%,2.7%
5,4234_1,0.59,58.7,8.5%,1.3%,8.5%,3.3%
6,2032_2,0.17,57.8,21.7%,0.3%,2.7%,1.2%


# 4. Fetch Data From CSV / Save Data To CSV 

In [24]:
# Load the CSV from the given URL into a DataFrame.
URL = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
df = pd.read_csv(URL)

In [25]:
# Preview the first rows of the countries table.
df.head()

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


In [26]:
PATH = "/content/drive/MyDrive/00_temp/countries.csv"
# `to_csv` does not create missing folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
# index=False leaves the row index out of the file.
df.to_csv(PATH, index=False)