## Import packages and define functions

In [6]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import trange, tqdm
import requests
import copy
import re
import time

import scraping_class
logfile = 'kmd_scrapelog2.txt' # name your log file.
connector = scraping_class.Connector(logfile)

In [7]:
def get_soup(url, project_name):
    #System sleep:
    time.sleep(0.2)
    
    #Get html:
    file = connector.get(url, project_name)[0] 
    file.encoding = "utf-8" # Make Danish letters Æ, Ø Å understandable
    html = file.text
    soup = BeautifulSoup(html, 'html')
    return soup

def collect_links(url_list, name_list, baselink, search_item = ('a'), re_pattern = 'F[0-9]+\.htm'):
    #Creating empty lists:
    refs, names, links, name_orig = ([] for i in range(4))
    
    #Converts url to a one-element list, if only one url was inputted:
    if isinstance(url_list, str):
        url_list = [url_list]
    if isinstance(name_list, str):
        name_list = [name_list]
    
    #Collects area-references, area-names and constructs links to each area:
    for i, url in enumerate(url_list):

        soup = get_soup(url, 'collect_links')
        items = soup.find_all(search_item)
        
        #Removes any tags not in the format "F{digits}.htm"
        pattern = re.compile(re_pattern)
        for item in items:
            if item.has_attr('href'):
                if pattern.search(item['href']):
                    refs.append(pattern.search(item['href']).group())
                    names.append(item.text.strip())
                    #links.append(baselink+pattern.search(item['href']).group())
                    name_orig.append(name_list[i])
          
    #Outputs results as dataframe:
    out_df = pd.DataFrame([names, refs, name_orig]).T.rename(columns={0:'names', 1:'refs', 2:'name_orig'})
    out_df['links'] = baselink + out_df['refs']
    return out_df

def get_data_voting_area(url):
    soup = get_soup(url, 'get_data_voting_area')

    general = soup.find_all('td', {'class':'text-right'})
    eligible_count = general[0].text
    time_stamp = general[2].text
    address = general[3].text
    blank = general[6].text
    invalid = general[8].text
    total_valid = general[10].text
    total_casted = general[12].text

    #Party votes:
    table = soup.find_all('div', {'class':'row table-like-row'})
    party, vote_count = [], []
    for row in range(1, len(table)):
        row_data = table[row]
        party_letter = row_data.find('span', {'class':'parti-letter'}).text
        if (len(party_letter)==1):
            party.append(row_data.find('a').text)
        else:
            party.append(row_data.find('div', {'class':'table-like-cell col-xs-7 col-sm-6 col-md-4'}).text[len(party_letter):])
        vote_count.append(row_data.find('div', {'class':'table-like-cell col-xs-3 col-sm-4 col-md-2 text-right roboto-bold'}).text)
    #Creates dataframe in long format:
    out_df = pd.DataFrame([party, vote_count]).T.rename(columns={0:'party', 1:'votes'})

    out_df['eligible_count'] = eligible_count
    out_df['blank'] = blank
    out_df['invalid'] = invalid
    out_df['total_valid'] = total_valid
    out_df['total_casted'] = total_casted

    out_df['time_stamp'] = time_stamp
    out_df['address'] = address

    return out_df

def get_data_election(year, get_data_method = get_data_voting_area):
    url = get_url(year)
    baselink = f'https://www.kmdvalg.dk/fv/{year}/'

    data1 = collect_links(url, 'FV',baselink = baselink)
    data2 = collect_links(list(data1['links']), list(data1['names']), baselink)
    data3 = collect_links(list(data2[data2['refs'].str.len()<13]['links']), list(data2[data2['refs'].str.len()<13]['name_orig']), baselink)
    data4 = pd.concat([data3, data2])
    data4 = data4[data4['refs'].str.len()>=13].reset_index(drop=True)

    df_list = []
    for i in trange(len(data4)):
        df = get_data_method(data4['links'][i])
        df['names'] = data4['names'][i]
        df['constituency'] = data4['name_orig'][i]
        df['refs'] = data4['refs'][i]
        df['links'] = data4['links'][i]
        df_list.append(df)
    out_df = pd.concat(df_list)
    out_df['year'] = year
    return out_df

def get_data_voting_area_2011(url):
    soup = get_soup(url, 'get_data_voting_area_2011')

    dfs = []
    data = soup.find_all('tr', {'class':['tableRowPrimary', 'tableRowSecondary']})
    data2 = soup.find_all('td', {'class':'statusText'})

    for data_sub in data[:-5]:
        party = data_sub.find_all('td')[1].text
        vote = data_sub.find_all('td')[2].text
        dfs.append([party, vote])

    eligible_count = data2[1].text
    time_stamp = data2[4].text
    address = data2[5].text
    municipality = soup.find_all('tr')[2].find_all('td')[0].find_all('td')[1].text

    general = data[-5:]
    blank = general[2].find_all('td')[2].text
    invalid = general[3].find_all('td')[2].text
    total_valid = general[1].find_all('td')[2].text
    total_casted = general[4].find_all('td')[2].text

    out_df = pd.DataFrame(dfs, columns=['party', 'votes'])

    out_df['eligible_count'] = eligible_count
    out_df['blank'] = blank
    out_df['invalid'] = invalid
    out_df['total_valid'] = total_valid
    out_df['total_casted'] = total_casted

    out_df['time_stamp'] = time_stamp
    out_df['address'] = address
    
    out_df['municipality'] = municipality

    return out_df

def get_url(year):
    if year==2011:
        url ='https://www.kmdvalg.dk/fv/2011/'
    else:
        url = f'https://www.kmdvalg.dk/fv/{year}/KMDValgFV.html'
    return url

def fix_count_columns(df_, columns = ['votes', 'eligible_count', 'blank', 'invalid', 'total_valid', 'total_casted']):
    for col in columns:
        #Removes "." from count variables and converts to integers:
        df_[col] = df_.apply(lambda x: x[col].replace(".", ""), axis=1).astype('int64')
    return df_

## Scraping data for election 2011, 2015 and 2019

In [8]:
#data_2019 = get_data_election(2019)
data_2015 = get_data_election(2015)
data_2011 = get_data_election(2011, get_data_method = get_data_voting_area_2011)

100%|██████████| 1384/1384 [21:10<00:00,  1.09it/s]
 53%|█████▎    | 740/1389 [10:50<09:30,  1.14it/s]


OSError: [Errno 22] Invalid argument

In [9]:
data_2019 = fix_count_columns(data_2019)
#data_2015 = fix_count_columns(data_2015)
#data_2011 = fix_count_columns(data_2011)

## Mapping municipalities

In [10]:
df1 = data_2011.copy()
df1 = df1[['refs', 'names', 'municipality']]
df1 = df1.rename(columns={'names':'voting_area'})

url = 'https://www.kmdvalg.dk/kv/2017/KMDValgKV.html'
basislink = 'https://www.kmdvalg.dk/kv/2017/'
links1 = collect_links(url_list=url, name_list = 'KV', baselink = basislink, search_item = ('a'), re_pattern = 'K[0-9]+\.htm')
df_list = []
for i in range(len(links1)):
    df2 = collect_links(url_list=links1['links'][i], name_list =links1['names'][i], baselink = basislink, search_item = ('a'), re_pattern = 'K[0-9]+\.htm')
    df2['voting_area'] = df2['names']
    df2 = df2[['refs', 'voting_area', 'name_orig']]
    df_list.append(df2)
df2 = pd.concat(df_list)
df2['refs'] = [string.replace('K', 'F') for string in df2['refs']]
df2 = df2.rename(columns={'name_orig':'municipality'})
df2['municipality'] = df2['municipality']+' Kommune'

mapping = pd.concat([df1, df2]).drop_duplicates(subset='refs')

NameError: name 'data_2011' is not defined

## Export data

In [49]:
data_2019 =data_2019.merge(mapping, on='refs', how='left')
data_2015 =data_2015.merge(mapping, on='refs', how='left')
data_2011 =data_2011.merge(mapping, on='refs', how='left')

In [51]:
data_2011.to_csv('data2011.csv', encoding= 'utf-8-sig')
data_2015.to_csv('data2015.csv', encoding= 'utf-8-sig')
data_2019.to_csv('data2019.csv', encoding= 'utf-8-sig')
# mapping.to_csv('mapping.csv')