# Importing packages

In [1]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

# Functions to parse a policy page

In [2]:
def parse_website(url, timeout=30):
    """Download ``url`` and return its content parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on a stalled connection.

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or ``None`` when the request fails or the
        server answers with a status code other than 200. In both failure
        cases an error message is printed.
    """
    try:
        wsite = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A single unreachable page must not abort a long scraping run;
        # report it the same way as a bad status code.
        print('Error loading website '+url)
        return None

    if wsite.status_code != 200:
        print('Error loading website '+url)
        return None

    return BeautifulSoup(wsite.content, 'html.parser')

In [3]:
def set_title(soup):
    """Return the text of the page's hero ``<h1>`` title.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed policy page.

    Returns
    -------
    str
        The heading text, or an empty string when the page has no heading
        with the expected CSS classes.
    """
    heading = soup.find('h1', {'class':'o-hero-freepage__title f-title-3'})
    if heading is None:
        # find() returns None when nothing matches; fall back to the same
        # empty default the other set_* helpers use instead of crashing.
        return ''
    return heading.get_text()

In [4]:
def set_description(soup):
    """Return the first line of the policy description block.

    The text of the description ``<div>`` is stripped of surrounding
    whitespace and cut at the first newline.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed policy page.

    Returns
    -------
    str
        The first line of the description, or an empty string when the page
        has no description block.
    """
    section = soup.find('div', {'class': 'm-block__content f-rte f-rte--block'})
    if section is None:
        return ''

    # NOTE: an earlier version had a per-paragraph branch guarded by
    # `section.get_text() == None`; get_text() always returns a string, so
    # that branch could never run and has been removed.
    text = section.get_text().strip()

    # split() keeps the whole text when there is no newline. The previous
    # `text[0:text.find('\n')]` sliced with -1 in that case and silently
    # dropped the last character.
    return text.split('\n', 1)[0]

In [5]:
def set_source_and_last_updated(soup):
    """Extract the source and the last-updated text from the hero metadata.

    Every ``<span class="o-hero-freepage__meta">`` is inspected. A span whose
    text contains ``'Source:'`` contributes to the source (with
    ``'Source: '`` removed); otherwise a span whose text contains
    ``'pdated'`` contributes to the last-updated value (with
    ``'Last updated: '`` removed). Multiple matches are concatenated in
    document order.

    Returns
    -------
    tuple of (str, str)
        ``(source, last_updated)``; either is empty when nothing matched.
    """
    source_parts = []
    updated_parts = []

    for meta in soup.find_all('span', {'class':'o-hero-freepage__meta'}):
        text = meta.get_text()
        if 'Source:' in text:
            source_parts.append(text.replace('Source: ', ''))
        elif 'pdated' in text:
            updated_parts.append(text.replace('Last updated: ', ''))

    return ''.join(source_parts), ''.join(updated_parts)

In [6]:
def set_learn_more(soup):
    """Return the target of the page's accent link.

    Looks up the first ``<a class="a-link a-link--accent">`` element and
    returns its ``href`` attribute.

    Returns
    -------
    str
        The link target, or an empty string when the page has no such link.
    """
    link = soup.find('a', {'class': 'a-link a-link--accent'})
    if link is None:
        return ''
    return link.attrs['href']

In [7]:
def _join_tag_texts(tags):
    """Return the text of every element in ``tags`` joined with ``'|'``."""
    return '|'.join(tag.get_text() for tag in tags)


def set_main_content(soup):
    """Collect the tag lists from the main content area of a policy page.

    Each ``<div class="o-policy-content__list">`` block is identified by the
    text of its first ``<span>``. The recognised headings are ``'Topics'``,
    ``'Policy types'``, ``'Sectors'``, ``'Technologies'`` and
    ``'End uses covered'``. Values are read from the
    ``<span class="a-tag__label">`` elements of the block, except for
    ``'End uses covered'``, whose values come from its
    ``<li class="o-policy-content-list__item">`` elements.

    Blocks without a heading span or with an unrecognised heading are
    ignored. If the same heading occurs in several blocks, all of their
    values end up in one ``'|'``-separated string.

    Returns
    -------
    tuple of (str, str, str, str, str)
        ``(topics, policy_types, sectors, technologies, end_uses)``; each is
        a ``'|'``-separated string, empty when the category is absent.
    """
    # Heading text -> (tag name, attrs) locating that category's values.
    tag_label = ('span', {'class': 'a-tag__label'})
    selectors = {
        'Topics': tag_label,
        'Policy types': tag_label,
        'Sectors': tag_label,
        'Technologies': tag_label,
        'End uses covered': ('li', {'class': 'o-policy-content-list__item'}),
    }
    found = {category: [] for category in selectors}

    for content in soup.find_all('div', {'class': 'o-policy-content__list'}):
        heading = content.span
        if heading is None:
            # No heading means the block cannot be classified; skip it
            # rather than fail on heading.get_text().
            continue
        category = heading.get_text()
        if category not in selectors:
            continue
        tag_name, attrs = selectors[category]
        found[category].extend(content.find_all(tag_name, attrs))

    return (
        _join_tag_texts(found['Topics']),
        _join_tag_texts(found['Policy types']),
        _join_tag_texts(found['Sectors']),
        _join_tag_texts(found['Technologies']),
        _join_tag_texts(found['End uses covered']),
    )

In [8]:
def set_side_content(soup):
    """Read country, year, status and jurisdiction from the page sidebar.

    Each ``<li class="o-page__aside-item o-page__aside-item--policy">`` is
    identified by the text of its first ``<span>``; its values are the
    ``<span class="o-policy-aside-item__value">`` elements inside it. When a
    field has several values, the last one wins.

    Sidebar items without a label span are skipped. Year text that cannot be
    converted with ``int()`` is ignored, leaving the year at its previous
    value (``0`` by default).

    Returns
    -------
    tuple of (str, int, str, str)
        ``(country, year, status, jurisdiction)``; strings default to ``''``
        and the year to ``0`` when the field is absent.
    """
    country = ''
    year = 0
    status = ''
    jurisdiction = ''

    sidebar_list = soup.find_all('li', {'class': 'o-page__aside-item o-page__aside-item--policy'})

    for content in sidebar_list:
        label = content.span
        if label is None:
            continue
        category = label.get_text()
        values = [
            value.get_text()
            for value in content.find_all('span', {'class': 'o-policy-aside-item__value'})
        ]
        if not values:
            continue

        if category == 'Country':
            country = values[-1]
        elif category == 'Year':
            # Keep the last parsable year. The previous code added every
            # value to a running total, which produced a meaningless sum
            # whenever more than one year value was present.
            for text in values:
                try:
                    year = int(text)
                except ValueError:
                    continue
        elif category == 'Status':
            status = values[-1]
        elif category == 'Jurisdiction':
            jurisdiction = values[-1]

    return country, year, status, jurisdiction

# Creating a dictionary that maps each policy name to its URL

In [11]:
# Walk the paginated policy listing and record, for every policy link found,
# its relative URL keyed by its title. The page range is hard-coded: pages 1
# through 186 are visited.
policy_data = {}
wsite_root = "https://www.iea.org/policies?page="
last_page = 186

for i in range(1, last_page + 1):
    wsite_address = wsite_root + str(i)
    print(str(i/(last_page + 1)*100)+ r'%, current address: '+wsite_address)

    try:
        # The timeout keeps one stalled connection from hanging the crawl.
        wsite = requests.get(wsite_address, timeout=30)
    except requests.RequestException:
        print('Error loading website, page '+str(i))
        continue
    if wsite.status_code != 200:
        print('Error loading website, page '+str(i))
        continue

    soup = BeautifulSoup(wsite.content, 'html.parser')
    for policy in soup.find_all('a',{'class':'m-policy-listing-item__link'}):
        # get_text() works even when the anchor wraps nested markup, where
        # `.string` would be None and `.strip()` would fail.
        title = policy.get_text().strip()
        href = policy.attrs['href']

        # Different policies can share a title. Keying only by title would
        # overwrite the earlier entry and that policy would never be scraped,
        # so a colliding title is disambiguated with the link's path. The
        # query string is ignored when comparing, so the same policy seen on
        # two listing pages is still stored once.
        path = href.split('?', 1)[0]
        key = title
        if key in policy_data and policy_data[key].split('?', 1)[0] != path:
            key = title + ' [' + path + ']'
        policy_data[key] = href

0.53475935828877%, current address: https://www.iea.org/policies?page=1
1.06951871657754%, current address: https://www.iea.org/policies?page=2
1.6042780748663104%, current address: https://www.iea.org/policies?page=3
2.13903743315508%, current address: https://www.iea.org/policies?page=4
2.6737967914438503%, current address: https://www.iea.org/policies?page=5
3.2085561497326207%, current address: https://www.iea.org/policies?page=6
3.7433155080213902%, current address: https://www.iea.org/policies?page=7
4.27807486631016%, current address: https://www.iea.org/policies?page=8
4.81283422459893%, current address: https://www.iea.org/policies?page=9
5.347593582887701%, current address: https://www.iea.org/policies?page=10
5.88235294117647%, current address: https://www.iea.org/policies?page=11
6.417112299465241%, current address: https://www.iea.org/policies?page=12
6.951871657754011%, current address: https://www.iea.org/policies?page=13
7.4866310160427805%, current address: https://www

# Creating list of rows of policies

In [12]:
# Visit every policy page collected in policy_data and turn it into one row
# of values; pages that could not be loaded are skipped.
csv_rows = []
address = 'https://www.iea.org'

for policy, relative_url in policy_data.items():
    url = address + relative_url
    print(url)

    soup = parse_website(url)
    if soup is None:
        continue

    title = set_title(soup)
    description = set_description(soup)
    source, last_updtd = set_source_and_last_updated(soup)
    learn_more = set_learn_more(soup)
    topics, policy_types, sectors, technologies, end_uses = set_main_content(soup)
    country, year, status, jurisdiction = set_side_content(soup)

    row = [
        title, description, source, last_updtd, learn_more,
        topics, policy_types, sectors, technologies, end_uses,
        country, year, status, jurisdiction,
    ]
    csv_rows.append(row)

?page=178
https://www.iea.org/policies/5533-rural-electrification-policy?page=178
https://www.iea.org/policies/3051-the-biomass-agreement?page=178
https://www.iea.org/policies/1874-vehicle-scrapping?page=179
https://www.iea.org/policies/3931-wind-power-programme?page=179
https://www.iea.org/policies/3861-200-kw-rooftop-programme?page=179
https://www.iea.org/policies/4315-act-creating-the-department-of-energys-rationale-for-the-organization-and-functions-of-government-agencies-related-to-energy-and-other-related-purposes?page=179
https://www.iea.org/policies/245-building-codes?page=179
https://www.iea.org/policies/1075-building-energy-codes?page=179
https://www.iea.org/policies/3890-diane-energy2000-action?page=179
https://www.iea.org/policies/5312-declaration-on-environmental-protection?page=179
https://www.iea.org/policies/2197-energy-star-commercial-buildings?page=179
https://www.iea.org/policies/224-energy-star-for-industry?page=179
https://www.iea.org/policies/1083-energy-star-labe

# Creating CSV

In [13]:
# Build the final table: one row per entry of csv_rows, with named columns.
final_df = pd.DataFrame(
    data=csv_rows,
    columns=[
        'Policy',
        'Description',
        'Source',
        'Last updated',
        'Learn more',
        'Topics',
        'Policy types',
        'Sectors',
        'Technologies',
        'End uses',
        'Country',
        'Year',
        'Status',
        'Jurisdiction',
    ],
)

In [14]:
# Write the table to a comma-separated file, omitting the index column.
final_df.to_csv(path_or_buf='./policies_database_IEA.csv', index=False)

# Testing

In [None]:
#test='https://www.iea.org/policies/4681-feed-in-tariffs-for-renewable-energy?page=176'
#soup = parse_website(test)
#title=set_title(soup)
#description = set_description(soup)
#source,last_updtd = set_source_and_last_updated(soup)
#learn_more = set_learn_more(soup)
#topics, policy_types, sectors, technologies, end_uses = set_main_content(soup)
#country, year, status, jurisdiction = set_side_content(soup)