In [229]:
import os
import re

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [230]:
# encoding: UTF-8

## Variables

### Set starting URLs

In [231]:
# Entry pages handed to get_region_urls(). That function compares its argument
# against these two names to decide which link to take from every table row.
candidates_url = 'https://www.cvk.gov.ua/pls/vm2020/pvm008pt001f01=695pt00_t001f01=695.html'
elected_url = 'https://www.cvk.gov.ua/pls/vm2020/pvm002pt001f01=695pt00_t001f01=695.html'

### Set base URL for scraped links and the BVO pages to exclude

In [233]:
# Prefix that the scraping functions prepend to every href they collect.
url_councils = 'https://www.cvk.gov.ua/pls/vm2020/'

# Pages that are removed from the `elected` URL list before get_elected_df() reads it.
bvo_urls = ['https://www.cvk.gov.ua/pls/vm2020/pvm057pid112=30pid102=12259pf7691=64406pt001f01=695rej=0pt00_t001f01=695.html', 'https://www.cvk.gov.ua/pls/vm2020/pvm057pid112=30pid102=479pf7691=64420pt001f01=695rej=0pt00_t001f01=695.html', 'https://www.cvk.gov.ua/pls/vm2020/pvm057pid112=30pid102=4048pf7691=63909pt001f01=695rej=0pt00_t001f01=695.html']

### Select regions for parsing

In [234]:
# Which region pages to crawl, written like a Python slice ('start:stop').
# get_councils_url() turns this string into a slice over the list of region pages.
regions = '0:3'

## Functions

In [235]:
def get_region_urls(starting_page, timeout=30):
    '''
    Collect the URLs linked from the rows of a starting page.

    The page is downloaded, its second <table class="t2"> is located and one
    link is taken from every <tr> of that table's <tbody>:

    * starting_page == candidates_url: the first <a class="a1"> of the row;
    * starting_page == elected_url: the third <a> of the row.

    Rows without such a link are skipped. Every href is appended to
    url_councils to build the returned URL.

    Parameters:
        starting_page: URL to scrape. For any value other than candidates_url
            or elected_url the page is still fetched, but no row rule applies
            and an empty list is returned.
        timeout: seconds passed to requests.get so a stalled server cannot
            block the notebook forever (default 30).

    Returns:
        list of str, in page order.

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or an HTTP error status.
        IndexError: if the page has fewer than two "t2" tables.
        AttributeError: if the selected table has no <tbody>.
    '''
    response = requests.get(starting_page, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find_all('table', attrs={'class': 't2'})[1].find('tbody')
    rows = table.find_all('tr')

    councils = []
    if starting_page == candidates_url:
        for row in rows:
            link = row.find('a', class_='a1')
            if link is None:
                # Row carries no "a1" link (e.g. a header row) - nothing to collect.
                continue
            councils.append(url_councils + link['href'])
    elif starting_page == elected_url:
        for row in rows:
            links = row.find_all('a')
            if len(links) < 3:
                # The wanted link is the third anchor; shorter rows are skipped.
                continue
            councils.append(url_councils + links[2]['href'])
    return councils

In [236]:
def get_councils_url(council, timeout=30):
    '''
    Pick a slice of region pages and collect the "a1" links found on them.

    The module-level string `regions` ('start:stop' or 'start:stop:step') is
    parsed into a slice and applied to `council`. Parts may be left empty, as
    in Python's own slice syntax (':3', '5:', ':'), meaning "no bound".
    Every selected page is downloaded and the href of each <a class="a1">
    on it is appended to url_councils.

    Parameters:
        council: sequence of region page URLs, e.g. the result of
            get_region_urls().
        timeout: seconds passed to requests.get for every page (default 30).

    Returns:
        list of str with the collected URLs, in page order. Empty when the
        slice selects no pages.

    Raises:
        ValueError: if a non-empty part of `regions` is not an integer.
        requests.exceptions.RequestException: on timeout, connection failure
            or an HTTP error status.
    '''
    # An empty part ('' or whitespace) becomes None so open-ended slices work
    # instead of crashing in int('').
    bounds = [int(part) if part.strip() else None for part in regions.split(':')]
    selected_pages = council[slice(*bounds)]

    council_urls = []
    for oblast in selected_pages:
        response = requests.get(oblast, timeout=timeout)
        # Fail loudly on 4xx/5xx; an error page would silently yield no links.
        response.raise_for_status()
        obl_soup = BeautifulSoup(response.content, 'lxml')
        council_urls.extend(
            url_councils + link['href']
            for link in obl_soup.find_all('a', class_='a1')
        )
    return council_urls

In [237]:
def get_candidates_df(council_urls=None, timeout=30):
    '''
    Build one dataframe of candidates from a list of council pages.

    For every URL:

    * pandas.read_html parses all tables (first row as header); the table at
      index 3 supplies party names with their list sizes, the table at
      index 4 supplies the candidate rows;
    * candidate rows are kept only when the '№ ТВО, за яким закріплено' value
      consists solely of digits, spaces and the letters of 'Перший кандидат'
      (case-insensitive);
    * each party name is repeated as many times as its list size and the
      result is attached as the 'Партія' column;
    * the page is fetched again with requests and items 0 and 2 of the
      <p class="p2"> contents are stored in 'Область' and 'Міська рада'.

    Parameters:
        council_urls: iterable of page URLs. When omitted, the module-level
            list `candidates` is used, which is what the original
            zero-argument call did.
        timeout: seconds passed to requests.get (default 30). It does not
            apply to the download done inside pandas.read_html.

    Returns:
        pandas.DataFrame with all pages concatenated and a fresh 0..n-1 index.

    Raises:
        ValueError: if there are no URLs (pandas.concat of an empty list), or
            if the repeated party list and the filtered rows differ in length.
        requests.exceptions.RequestException: on timeout, connection failure
            or an HTTP error status.
    '''
    if council_urls is None:
        council_urls = candidates

    tvo_column = '№ ТВО, за яким закріплено'
    candidate_dfs = []
    for council_url in council_urls:
        dfs = pd.read_html(council_url, header=0)
        n_candidates = dfs[3]
        # One party label per candidate: repeat each name by its list size.
        party = np.repeat(
            n_candidates['Партія'],
            n_candidates['Кількість кандидатів у списку'],
        ).tolist()

        candidate_df = dfs[4]
        # True for rows holding any character outside the allowed set.
        has_foreign_char = candidate_df[tvo_column].str.contains(
            r'[^0-9 Перший кандидат]', regex=True, case=False
        )
        candidate_df = candidate_df[~has_foreign_char].reset_index(drop=True)

        # NOTE(review): this is a second download of the same page (read_html
        # already fetched it once); kept separate so table parsing is unchanged.
        response = requests.get(council_url, timeout=timeout)
        response.raise_for_status()
        page_soup = BeautifulSoup(response.content, 'lxml')
        location = page_soup.find('p', class_='p2').contents

        candidate_df['Партія'] = party
        candidate_df['Область'] = location[0]
        candidate_df['Міська рада'] = location[2]
        candidate_dfs.append(candidate_df)

    return pd.concat(candidate_dfs).reset_index(drop=True)

In [238]:
def get_elected_df(council_urls=None):
    '''
    Build one dataframe of elected candidates from a list of council pages.

    For every URL pandas.read_html parses all tables (first row as header)
    and the table at index 5 is taken. Rows are kept only when the
    '№ ТВО, за яким закріплено' value consists solely of digits, spaces and
    the letters of 'Перший кандидат' (case-insensitive).

    Parameters:
        council_urls: iterable of page URLs. When omitted, the module-level
            list `elected` is used, which is what the original zero-argument
            call did.

    Returns:
        pandas.DataFrame with all pages concatenated and a fresh 0..n-1 index.

    Raises:
        ValueError: if there are no URLs (pandas.concat of an empty list).
        IndexError: if a page yields fewer than six tables.
    '''
    if council_urls is None:
        council_urls = elected

    tvo_column = '№ ТВО, за яким закріплено'
    tvo_dfs = []
    for council_url in council_urls:
        elected_df = pd.read_html(council_url, header=0)[5]
        # True for rows holding any character outside the allowed set.
        has_foreign_char = elected_df[tvo_column].str.contains(
            r'[^0-9 Перший кандидат]', regex=True, case=False
        )
        tvo_dfs.append(elected_df[~has_foreign_char].reset_index(drop=True))

    return pd.concat(tvo_dfs).reset_index(drop=True)

## Starting parser

In [239]:
# Scrape each entry page once; every call returns the list of links found on it.
candidate_councils = get_region_urls(candidates_url)
elected_councils = get_region_urls(elected_url)

In [240]:
# Expand the selected region pages into the page URLs that the dataframe
# builders read from the `candidates` and `elected` globals.
candidates = get_councils_url(candidate_councils)
# Pages listed in bvo_urls are left out of the elected list.
elected = [
    council_page
    for council_page in get_councils_url(elected_councils)
    if council_page not in bvo_urls
]

## Saving dataframes to csv

In [241]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the (slow) scraping starts.
os.makedirs('./data', exist_ok=True)

# Both files are tab-separated despite the .csv extension.
get_candidates_df().to_csv('./data/candidates_df.csv', sep ='\t', index=False)
get_elected_df().to_csv('./data/tvo_df.csv', sep='\t', index=False)