In [1]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

In [2]:
#different urls for different versions
#working_url = 'https://www.president.gov.ua/news/speeches'
working_url = 'https://www.president.gov.ua/en/news/speeches'

In [3]:
def get_speech_text_from_preview(preview):
    url = preview.find('a')['href']
    speech_page = BeautifulSoup(urllib.request.urlopen(url), 'html.parser')
    return speech_page.find('div', attrs={'itemprop':'articleBody'}).text.strip()

In [4]:
column_lambda_dictionary = {
    'headline': lambda preview: preview.find('h3').text,
    'url': lambda preview: preview.find('a')['href'],
    'date': lambda preview: preview.find('p', attrs={'class':'date'}).text.strip(),
    'speech_text': get_speech_text_from_preview
}

In [5]:
final_dict = { key: [] for key in column_lambda_dictionary.keys()}

In [6]:
final_dict

{'headline': [], 'url': [], 'date': [], 'speech_text': []}

In [7]:
is_last_page = False
disabled_class_name = 'disabled'

while not is_last_page:
    page = urllib.request.urlopen(working_url)
    
    page_soup = BeautifulSoup(page, 'html.parser')
    
    content = page_soup.find('div', attrs={'class': 'cat_list'})
    speech_previews = content.findAll('div', attrs={'class':'item_stat cat_stat'}) 
    
    for preview in speech_previews:
        for section in column_lambda_dictionary.keys():
            final_dict[section].append(column_lambda_dictionary[section](preview))
    
    next_page_object = page_soup.find('div', attrs={'class':'pagination'}).findAll('i')[-1].parent
    working_url = next_page_object['href']
    
    is_last_page = disabled_class_name in next_page_object.get("class")

In [8]:
data = pd.DataFrame.from_dict(final_dict)

In [9]:
data.head()

Unnamed: 0,headline,url,date,speech_text
0,\nStatement by President of Ukraine at the Un...,https://www.president.gov.ua/en/news/vistup-pr...,20 February 2019 - 18:18,"Madam President,\nExcellencies,\nLadies and Ge..."
1,\nPresident’s speech at a special session of t...,https://www.president.gov.ua/en/news/vistup-pr...,19 February 2019 - 14:04,"Dear great friend of Ukraine, Mr. Donald Tusk!..."
2,\nPresident's speech in the Verkhovna Rada on ...,https://www.president.gov.ua/en/news/vistup-pr...,7 February 2019 - 12:09,Honorable Chairman of the Verkhovna Rada of Uk...
3,\nAddress of President of Ukraine at the Tra...,https://www.president.gov.ua/en/news/vistup-pr...,16 January 2019 - 21:35,"Your Excellences,\nDear Mr. Speaker, Mr. Prime..."
4,\nPresident’s address on the occasion of grant...,https://www.president.gov.ua/en/news/promova-p...,6 January 2019 - 11:11,"A great historic event, in which the Lord was ..."


In [10]:
FILENAME = 'speeches_en.csv'
FOLDER_PATH = '../data/'

In [11]:
data.to_csv(f'{FOLDER_PATH}{FILENAME}', encoding='utf-8')