In [383]:
import pandas as pd
import numpy as np
import os
from shapely.geometry import Point, Polygon

# Working directory of the kernel; later cells prepend it when building data file paths.
cwd =os.getcwd()

### Structure socioeconomic data by year

In [385]:
# Load the 'Samlet' sheet of the socioeconomic workbook.
socioeco_df = pd.read_excel(
    cwd+'/socioeconomic_data/socioeco_data.xlsx',
    sheet_name='Samlet',
)

# Column sets used for the postal-code level and the kommune level frames.
postnr_columns = ['POSTNR', 'YEAR', 'KOMKODE', 'SALGSPRIS']
kom_columns = [
    'KOMKODE', 'YEAR', 'DISINDK', 'KRIMINELITET',
    'LAVINDK', 'SKILSMISSER', 'ANDEL_INDV',
]

post_df = socioeco_df.loc[:, postnr_columns]
kom_df = socioeco_df.loc[:, kom_columns]

# One frame per year of interest; list positions follow the order of `years`.
years = [2011, 2015, 2019]
post_split = [post_df.loc[post_df['YEAR'].eq(year)] for year in years]
kom_split = [kom_df.loc[kom_df['YEAR'].eq(year)] for year in years]

In [387]:
# Inspect the third split, i.e. the postal-code rows for 2019 (years[2]).
post_split[2]

Unnamed: 0,POSTNR,YEAR,KOMKODE,SALGSPRIS
2841,1000,2019,101,37082.250000
2842,1050,2019,101,37082.250000
2843,1051,2019,101,37082.250000
2844,1052,2019,101,37082.250000
2845,1053,2019,101,37082.250000
...,...,...,...,...
4259,9970,2019,813,6252.666667
4260,9981,2019,813,8695.750000
4261,9982,2019,813,6225.000000
4262,9982,2019,860,6225.000000


### Merge school data and socioeconomic data by postal code and kommune

In [389]:
# Join each year's school data onto the postal-code level socioeconomic data
# and write the result to disk.
df_list = []
for soc_data, year in zip(post_split, [2011, 2015, 2019]):
    school_path = cwd+f'/school_data/school_clean_{year-1}-{year}.csv'
    school_data = pd.read_csv(school_path, index_col=0)

    # Default (inner) join: a school is kept only when both the kommune number
    # and the postal code match a socioeconomic row.
    merged = school_data.merge(
        soc_data,
        left_on=['beliggenhedskommunenr', 'postnr'],
        right_on=['KOMKODE', 'POSTNR'],
    )

    # Lower-case the column names, then drop columns whose values duplicate
    # an earlier column (done by de-duplicating the transposed frame).
    merged.columns = merged.columns.str.lower()
    merged = merged.T.drop_duplicates().T

    merged.to_csv(f'postnr_background_vars{year}.csv')
    df_list.append(merged)

# Kommune level: write one de-duplicated file per year.
for kom_data, year in zip(kom_split, [2011, 2015, 2019]):
    kom_data.drop_duplicates().to_csv(f'kom_background_vars{year}.csv')

In [391]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
import pyproj


In [393]:
# Environment note from the original author: getting geopandas and pyproj to
# work together required
#   conda remove pyproj
#   pip install pyproj
#   conda install geopandas

# Read the 2019 postal-code level background variables and convert them to UTM.
df = pd.read_csv('postnr_background_vars2019.csv', index_col=0)

# One point per row, built from the lon/lat columns.
c = [Point(x, y) for x, y in zip(df['lon'], df['lat'])]

# Declare the coordinates as WGS84 when constructing the frame. The authority
# string replaces the deprecated `gdf.crs = {'init': 'epsg:4326'}` form, which
# emits a FutureWarning with pyproj >= 2.
gdf = gpd.GeoDataFrame(df, geometry = c, crs = 'EPSG:4326')
gdf_utm = gdf.to_crs(epsg = 32632)

# Filter out "efterskoler": keep only rows typed 'Grundskoler' (private and free
# schools are kept to avoid too large a drop in the number of observations).
gdf_utm = gdf_utm[gdf_utm['institutionstype2'] == 'Grundskoler']

In [395]:
# Read the polling districts (afstemningsomraader) and add each district's centroid.
# The path is built with os.path.join instead of a backslash literal: the old
# non-raw string contained invalid escape sequences ('\s', '\g', '\d') and only
# resolved on Windows.
shapefile_path = os.path.join(
    cwd,
    'scrape_geodata',
    'geodata',
    'dagi_10m_nohist_l1.afstemningsomraade',
    'afstemningsomraade.shp',
)
valg_distrikt = gpd.read_file(shapefile_path, driver = 'ESRI Shapefile')
valg_distrikt['centroid']=valg_distrikt['geometry'].centroid

In [397]:
# Union of all school points as a single geometry.
schools_geo = gdf_utm.geometry.unary_union
# Copy of the polling districts with 'centroid' as the active geometry column.
valg_geo = valg_distrikt.set_geometry(col='centroid')

In [399]:
# For every polling-district centroid, find the nearest school and store that
# school's 'institutionsnummer' on the district.
pts3 = gdf_utm.geometry.unary_union


def near(point, pts = pts3):
    """Return the 'institutionsnummer' of the school in ``gdf_utm`` closest to ``point``.

    Parameters
    ----------
    point : shapely geometry
        Geometry to search from (here: a polling-district centroid).
    pts : shapely geometry
        Union of all candidate school points; defaults to ``pts3``.

    Returns
    -------
    The 'institutionsnummer' of the first row of ``gdf_utm`` whose geometry
    equals the nearest point found in ``pts``.
    """
    nearest_school_point = nearest_points(point, pts)[1]
    is_nearest = gdf_utm.geometry == nearest_school_point
    return gdf_utm.loc[is_nearest, 'institutionsnummer'].values[0]


# Apply to the 'centroid' column explicitly. In a row-wise apply, `row.geometry`
# resolves to the column literally named 'geometry' (the district polygon), not
# to the active geometry column, so the previous call searched from the polygon
# rather than from the centroid.
valg_geo['institutionsnummer'] = valg_geo['centroid'].apply(near)

In [401]:
"""test_valg = valg_geo.iloc[0].centroid

nearest = nearest_points(test_valg, schools_geo)[1]
print(test_valg)
print(nearest)
gdf_utm[gdf_utm['geometry'] == nearest]"""

"test_valg = valg_geo.iloc[0].centroid\n\nnearest = nearest_points(test_valg, schools_geo)[1]\nprint(test_valg)\nprint(nearest)\ngdf_utm[gdf_utm['geometry'] == nearest]"

In [403]:
# Join the polling districts to the school frame on 'institutionsnummer',
# then list the resulting columns.
merge1 = pd.merge(valg_geo, gdf_utm, on='institutionsnummer')
merge1.columns

Index(['objectid', 'id_lokalid', 'id_namespa', 'status', 'geometrist',
       'virkningfr', 'virkningti', 'virkningsa', 'forretning', 'registreri',
       'registre00', 'registre01', 'forretni00', 'forretni01', 'dagiid',
       'navn_x', 'redigering', 'dataspecif', 'landekode', 'skala',
       'afstemning', 'afstemni00', 'afstemni01', 'kommunekod', 'kommunelok',
       'opstilling', 'opstilli00', 'udtraeksda', 'gmlid', 'geometry_x',
       'centroid', 'institutionsnummer', 'karakter', 'soc_ref',
       'beliggenhedskommunenr', 'lat', 'lon', 'postnr', 'navn_y',
       'institutionstype2', 'institutionstype3', 'coordinates', 'year',
       'salgspris', 'geometry_y'],
      dtype='object')

In [405]:
# Attach the 'refs' key from the mapping file to merge1 via 'objectid'.
mapping = pd.read_csv(cwd+'/scrape_geodata/mapping_geodata_v3.csv', sep=';')
# Cast the join key to str (presumably to match the dtype of merge1['objectid'] - TODO confirm).
mapping['objectid'] = mapping['objectid'].astype(str)
objectid_to_refs = mapping[['objectid', 'refs']]
merge2 = pd.merge(merge1, objectid_to_refs, on='objectid')

In [407]:
# Join the 2019 election results onto the districts using the 'refs' key.
election_result = pd.read_csv(cwd+'/scrape_geodata/data2019.csv')
merge3 = pd.merge(merge2, election_result, on='refs')

# Normalise the kommune code to a plain integer string (e.g. '101').
merge3['kommunekod'] = merge3['kommunekod'].astype(int).astype(str)

In [409]:
# Join the kommune-level background variables for 2019.
background = pd.read_csv('kom_background_vars2019.csv', index_col=0)
# The join key on the left side ('kommunekod') is a string, so cast KOMKODE as well.
background['KOMKODE'] = background['KOMKODE'].astype(str)

all_2019 = pd.merge(merge3, background, left_on='kommunekod', right_on='KOMKODE')

# Optional checkpoint (the file is very big, so it is not written by default):
# all_2019.to_csv('all_2019.csv')


In [411]:
# To resume from the checkpoint instead of re-running the merges:
# all_2019 = pd.read_csv('all_2019.csv')
all_2019.columns = all_2019.columns.str.lower()

# Columns carried forward into the vote-count clean-up.
col_vars = [
    'objectid', 'navn_x', 'afstemni00', 'kommunekod',
    'institutionsnummer', 'karakter', 'soc_ref', 'postnr', 'navn_y',
    'institutionstype2', 'institutionstype3', 'year_x', 'salgspris',
    'party', 'votes', 'eligible_count', 'blank', 'invalid',
    'total_valid', 'total_casted', 'time_stamp', 'names',
    'constituency', 'voting_area', 'municipality',
    'disindk', 'kriminelitet', 'lavindk', 'skilsmisser', 'andel_indv',
]

# Spot check: party-level vote counts for one scraped result page.
is_sample_page = all_2019['refs'] == 'F809746006.htm'
all_2019.loc[is_sample_page, ['party', 'votes']]

Unnamed: 0,party,votes
13222,Socialdemokratiet,1.659
13223,Radikale Venstre,547.0
13224,Det Konservative Folkeparti,365.0
13225,Nye Borgerlige,137.0
13226,Klaus Riskær Pedersen,51.0
13227,SF - Socialistisk Folkeparti,534.0
13228,Liberal Alliance,150.0
13229,Kristendemokraterne,103.0
13230,Dansk Folkeparti,457.0
13231,Stram Kurs,85.0


In [413]:
# Work on an explicit copy of the selected columns. Later cells add columns to
# temp_2019; doing that on a plain slice of all_2019 raises SettingWithCopyWarning.
temp_2019 = all_2019[col_vars].copy()

In [415]:
# Spot check: party-level vote counts for polling district objectid 1462182.
# (The original cell also noted the id 1275543.)
is_sample_district = temp_2019['objectid'] == '1462182'
temp_2019.loc[is_sample_district, ['party', 'votes']]

Unnamed: 0,party,votes
11895,Socialdemokratiet,1.555
11896,Radikale Venstre,1.099
11897,Det Konservative Folkeparti,494.0
11898,Nye Borgerlige,109.0
11899,Klaus Riskær Pedersen,85.0
11900,SF - Socialistisk Folkeparti,734.0
11901,Liberal Alliance,359.0
11902,Kristendemokraterne,90.0
11903,Dansk Folkeparti,270.0
11904,Stram Kurs,130.0


## Fix the absolute shitshow that is the vote count formatting

In [417]:
# We are facing an issue with the formatting of the scraped data. Because of the thousands separator '.', when we load the data, pandas parses every number of 1000 or more as a decimal (e.g. 1.659 instead of 1659). We need to identify which numbers need to be multiplied by 1000. To do this, we compute each vote count modulo 1, and if the result is different from 0, we multiply the number by 1000. There is, however, one issue with this approach: if a party has received exactly 1000 (or 2000, 3000, etc.) votes at a polling place, our data reports the number of votes as 1.0 (or 2.0, 3.0, etc.). Before making the conversion, we need to check whether this is the case. At some smaller polling places, however, some parties may actually have received only 1 vote. To get around this, we first find, for each party, the maximum vote count among the counts whose value modulo 1 is different from 0. Then, if the maximum vote count is, say, 4792, we check the KMD website for each of the polling places that report a vote count of 1, 2, 3 or 4. If any of these vote counts are actually 1000 etc., they will be multiplied by 1000. We do this for every party that has received more than 1000 votes at a single polling place. Then we identify the observations whose value modulo 1 is different from zero and multiply these by 1000.
# There is a risk of missing some observations, in case the maximum voting count for a party is a multiple of 1000. In that case, we won't recognise the observation as one that should be multiplied by 1000. This is, however, unlikely enough for us to assume it won't be the case.
# For the vote counts that are constant within each polling place (such as total votes, number of eligible votes, etc.), we follow a slightly different procedure: first, we identify the lowest count among the polling places; any whole number below this must be multiplied by 1000. Luckily, all polling places have vote counts high enough for this to completely separate whole-number counts that should and should not be multiplied. Afterwards, we multiply all vote counts whose value modulo 1 is different from 0 by 1000.
# When following this procedure and calculating voting shares based on the resulting vote counts, all shares sum to 1. This serves to indicate that no mistakes have happened during the calculations.

# Ideally, when we scrape, we should store the data as strings, so we can delete the separators before converting them to ints. We should try and fix this, the next time we scrape. If not, this method should be our fall back.


# max number of votes for A: 4.792  (4792) objectid 1306109
# max number of votes for V: 4.556 (4556) objectid 1306030
# max number of votes for B: 2.448 (2448) objectid 1306375
# max number of votes for C: 3.218 (3218) objectid 1306109
# max number of votes for D: no problem
# max number of votes for E: no problem
# max number of votes for F: 1.775 (1775) objectid 1306109
# max number of votes for I: no problem
# max number of votes for K: 1.114 (1114) objectid 1298885
# max number of votes for O: 1.104 (1104) objectid 1289687
# max number of votes for P: no problem
# max number of votes for Ø: 2.608 (2608) objectid 1304839
# max number of votes for Å: 1.142 (1142)

# Check for problem parties (manually on kmd) if any af the parties have received exactly 1000, 2000 etc votes.
#  Limits:
# Venstre and Socialdemokratiet: 5
# Konservative: 4
# Radikale Venstre, Enhedslisten: 3
# SF, Kristendemokraterne, Dansk Folkeparti, Alternativet: 2

# 40 observations have to be checked manually.
# 3 observations are reported as 1.0 or 2.0 but should be multiplied by 1000.

# For other vote counts:
# blank: not relevant (max 170 blank votes at one polling place)
# invalid: not relevant (max 63 invalid votes at one polling place)
# 

In [419]:
# Template for checking the maximum number of votes for one party: rows whose
# count has a fractional part (i.e. was 1000 or more before parsing), sorted by votes.
# Nothing is displayed unless this expression is the last line of the cell.
# The conditions are combined into ONE boolean mask; chaining df[m1][m2] makes
# pandas realign the second mask to the filtered frame and emits a UserWarning.
temp_2019[(temp_2019['votes']%1 != 0) & (temp_2019['party'] == 'Socialdemokratiet')][['party','objectid','votes']].sort_values(by = 'votes')

# Finding observations to check with kmd.
# limitN lists the parties for which whole-number counts below N are ambiguous
# (a reported 1.0 could be 1 vote or 1000 votes).
limit5 = ['Venstre, Danmarks Liberale Parti', 'Socialdemokratiet']
limit4 = ['Det Konservative Folkeparti']
limit3 = ['Radikale Venstre', 'Enhedslisten - De Rød-Grønne']
limit2 = ['SF - Socialistisk Folkeparti', 'Kristendemokraterne', 'Dansk Folkeparti', 'Alternativet']

limits = {5: limit5, 4: limit4, 3: limit3, 2:limit2}

# One frame per (limit, party): whole-number vote counts below the party's limit.
obs_to_check = [
    temp_2019[
        (temp_2019['votes'] < limit)
        & (temp_2019['party'] == party)
        & (temp_2019['votes']%1 == 0)
    ]
    for limit, parties in limits.items()
    for party in parties
]

check_df = pd.concat(obs_to_check)

# To inspect and check with kmd (move to the last line of the cell to display it).
check_df.sort_values(by = 'constituency')[['municipality','constituency', 'navn_x','party','votes']]

# Row labels in temp_2019 that were entered after the manual check: these are
# reported as whole numbers but must be multiplied by 1000.
obs_votes_1000 = [8198, 12558, 873] 

In [421]:
# Build the multiplier as a standalone Series and attach it with .assign().
# Writing into temp_2019 directly can hit SettingWithCopyWarning when temp_2019
# is a slice of all_2019, which is why in-place updates appeared not to work.
vote_multiplier = pd.Series(
    np.where(temp_2019['votes']%1 != 0, 1000, 1),  # fractional part => count was >= 1000
    index=temp_2019.index,
)
# Manually verified rows that are exact multiples of 1000 (reported as 1.0, 2.0, ...).
vote_multiplier.loc[obs_votes_1000] = 1000

temp_2019 = temp_2019.assign(vote_multiplyer=vote_multiplier)

# Show the overridden rows; a single .loc call replaces the chained .loc[...][...].
temp_2019.loc[obs_votes_1000, 'vote_multiplyer']

8198     1000
12558    1000
873      1000
Name: vote_multiplyer, dtype: int32

In [425]:
# Corrected vote count = parsed count times its multiplier (1 or 1000).
temp_2019['votes_v2'] = temp_2019['votes'].mul(temp_2019['vote_multiplyer'])

# TODO (from the original author): eligible count, blank, invalid, total_valid
# and total_casted need the same procedure, maybe as a function. It is not clear
# that this can be done without some semi-dangerous assumptions. Eligible count
# may be fixed if time permits; otherwise rely on plan A.

# Sanity check: the largest corrected vote count for Enhedslisten.
is_enhedslisten = temp_2019['party'] == 'Enhedslisten - De Rød-Grønne'
temp_2019.loc[is_enhedslisten, 'votes_v2'].max()
# temp_2019.to_csv('test.csv')

2608.0

In [426]:
# Party lists defining the left and right blocs; any other party counts as independent.
left_parties = [
    'Socialdemokratiet',
    'Radikale Venstre',
    'SF - Socialistisk Folkeparti',
    'Enhedslisten - De Rød-Grønne',
    'Alternativet',
]
right_parties = [
    'Det Konservative Folkeparti',
    'Nye Borgerlige',
    'Klaus Riskær Pedersen',
    'Liberal Alliance',
    'Kristendemokraterne',
    'Dansk Folkeparti',
    'Stram Kurs',
    'Venstre, Danmarks Liberale Parti',
]

# 0/1 membership indicators, one entry per row of temp_2019 (kept as plain lists).
party_column = temp_2019['party']
left_dummy = party_column.isin(left_parties).astype('int64').tolist()
right_dummy = party_column.isin(right_parties).astype('int64').tolist()
independent_dummy = (~party_column.isin(left_parties+right_parties)).astype('int64').tolist()

bloc_dummies = {
    'left': left_dummy,
    'right': right_dummy,
    'independent': independent_dummy,
}

# Add the three indicator columns first, then the three bloc vote columns,
# so the column order in test.csv stays the same.
for bloc, dummy in bloc_dummies.items():
    temp_2019[f'{bloc}_dummy'] = dummy
for bloc, dummy in bloc_dummies.items():
    temp_2019[f'{bloc}_votes'] = dummy*temp_2019['votes_v2']

temp_2019.to_csv('test.csv')

# Collapse the party-level rows to one row of vote totals per polling district.
vote_columns = ['votes_v2', 'left_votes', 'right_votes', 'independent_votes']
votes_df = temp_2019.groupby('objectid')[vote_columns].sum()

In [427]:
# Drop the party-level columns, attach the per-district vote totals and keep
# a single row per polling district.
party_level_columns = [
    'votes', 'votes_v2', 'left_votes', 'right_votes', 'independent_votes',
    'left_dummy', 'right_dummy', 'independent_dummy', 'vote_multiplyer', 'party',
]
pre_merge = temp_2019.drop(columns=party_level_columns)

finalish_df = pd.merge(pre_merge, votes_df, on='objectid')
finalish_df = finalish_df.drop_duplicates(subset='objectid').reset_index(drop=True)

In [428]:
# Fix eligible count, total casted and valid votes.
# A limit of 4 works: every whole number at or below it has to be multiplied by
# 1000, together with every non-whole number.


def _thousands_multiplier(counts, max_truncated=4):
    """Return a Series holding 1000 where a count lost its thousands separator, else 1.

    A scraped value such as '1.659' was parsed as 1.659, so any count with a
    fractional part must be multiplied by 1000. Exact multiples of 1000 were
    parsed as 1.0, 2.0, ...; these are found as whole numbers below the
    smallest whole-number count that exceeds ``max_truncated``.

    Parameters
    ----------
    counts : pandas.Series
        Parsed counts for one variable (e.g. ``finalish_df['total_valid']``).
    max_truncated : int or float, default 4
        Largest whole number that can still be a truncated multiple of 1000.

    Returns
    -------
    pandas.Series
        Multiplier (1 or 1000) aligned to ``counts.index``.
    """
    is_whole = counts % 1 == 0
    # Smallest whole-number count that is certainly a real count.
    smallest_genuine = counts[is_whole & (counts > max_truncated)].min()
    truncated_thousands = is_whole & (counts < smallest_genuine)
    return pd.Series(
        np.where(~is_whole | truncated_thousands, 1000, 1),
        index=counts.index,
    )


# The same procedure for all three variables. The earlier copy-pasted version
# derived the lower limit for total_valid from total_casted > 4; each variable
# is now filtered on its own values.
for count_column, prefix in [('eligible_count', 'el'), ('total_casted', 'tot'), ('total_valid', 'val')]:
    finalish_df[f'{prefix}_multiplyer'] = _thousands_multiplier(finalish_df[count_column])
    finalish_df[f'{count_column}_v2'] = finalish_df[count_column]*finalish_df[f'{prefix}_multiplyer']

In [429]:
# Bloc vote shares: bloc votes divided by the corrected number of valid votes.
share_sources = {
    'left_share': 'left_votes',
    'right_share': 'right_votes',
    'independent_share': 'independent_votes',
}
for share_column, votes_column in share_sources.items():
    finalish_df[share_column] = finalish_df[votes_column]/finalish_df['total_valid_v2']

In [434]:
# Sanity check: the three bloc shares should sum to one for every polling district.
share_total = finalish_df['left_share'].add(finalish_df['right_share']).add(finalish_df['independent_share'])
share_total.describe()

count    1.383000e+03
mean     1.000000e+00
std      8.118549e-17
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
dtype: float64

In [431]:
# Columns kept in the final data set, in output order.
final_columns = [
    'objectid', 'afstemni00', 'navn_x', 'kommunekod', 'municipality',
    'constituency', 'postnr', 'time_stamp',
    'votes_v2', 'left_votes', 'right_votes', 'independent_votes',
    'blank', 'invalid',
    'left_share', 'right_share', 'independent_share',
    'institutionsnummer', 'navn_y', 'institutionstype2', 'institutionstype3',
    'karakter', 'soc_ref', 'salgspris',
    'disindk', 'kriminelitet', 'lavindk', 'skilsmisser', 'andel_indv',
]
final_df = finalish_df[final_columns]


In [432]:
# Give the columns their final names and write the finished data set to disk.
final_column_names = {
    'navn_x': 'afstem_navn2',
    'afstemni00': 'afstem_navn1',
    'soc_ref': 'karakter_soc_ref',
    'navn_y': 'skole_navn',
    'blank': 'blank_votes',
    'invalid': 'invalid_votes',
    'votes_v2': 'votes',
    'municipality': 'kommunenavn',
    'constituency': 'opst_kreds',
}
final_df = final_df.rename(columns=final_column_names)
final_df.to_csv('final_data2019.csv')

In [433]:
# Find centroids pr valgsted DONE
# Find for hver centroid den nærmeste skole DONE
# Lav column med institutionsnummer for nærmeste skole DONE
# Merge valgdistrikter med skoledata på institutionsnummer DONE
# Merge på mapping fil på objectid DONE
# merge på valgresultater fra 2019 på refs DONE
# split i kommune og postnnr DONE
# ryd op i data - kollaps valgresultater
# ryd op i kode
# tilføj huspriser for kbhk