In [17]:
import pandas as pd
import numpy as np
import os
from shapely.geometry import Point, Polygon

# Current working directory of the kernel; all data paths below are built from it.
cwd = os.getcwd()

### Structure socioeconomic data by year

In [18]:
# Load the combined socioeconomic sheet ('Samlet') from the project folder.
socioeco_path = cwd+'/socioeconomic_data/socioeco_data.xlsx'
socioeco_df = pd.read_excel(socioeco_path, sheet_name='Samlet')

# Column subsets: one set is later merged on postal code, the other on
# municipality code (KOMKODE).
postnr_columns = ['POSTNR', 'YEAR', 'KOMKODE', 'SALGSPRIS']
kom_columns = [
    'KOMKODE', 'YEAR', 'DISINDK', 'KRIMINELITET',
    'LAVINDK', 'SKILSMISSER', 'ANDEL_INDV',
]

post_df = socioeco_df[postnr_columns]
kom_df = socioeco_df[kom_columns]

# One frame per year of interest, in the same order as `years`.
years = [2011, 2015, 2019]
post_split = [post_df[post_df['YEAR'].eq(year)] for year in years]
kom_split = [kom_df[kom_df['YEAR'].eq(year)] for year in years]

In [19]:
# Inspect the 2019 postal-code slice (post_split follows the order of `years`).
post_split[years.index(2019)]

Unnamed: 0,POSTNR,YEAR,KOMKODE,SALGSPRIS
2841,1000,2019,101,37082.250000
2842,1050,2019,101,37082.250000
2843,1051,2019,101,37082.250000
2844,1052,2019,101,37082.250000
2845,1053,2019,101,37082.250000
...,...,...,...,...
4259,9970,2019,813,6252.666667
4260,9981,2019,813,8695.750000
4261,9982,2019,813,6225.000000
4262,9982,2019,860,6225.000000


### Merge school data and socioeconomic data by postal code and kommune

In [31]:
# Merge each year's school data with the postal-code-level socioeconomic data
# and write one CSV per year. The merged frames are also collected in df_list.
df_list = []
for soc_data, year in zip(post_split, years):
    # School files are named by school year, e.g. school_clean_2018-2019.csv for 2019.
    school_data = pd.read_csv(cwd+f'/school_data/school_clean_{year-1}-{year}.csv', index_col=0)

    # Inner join: keep schools whose (municipality, postal code) pair exists
    # in the socioeconomic data.
    merged = pd.merge(
        left=school_data,
        right=soc_data,
        left_on=['beliggenhedskommunenr', 'postnr'],
        right_on=['KOMKODE', 'POSTNR'],
    )

    # Normalise column names to lower case.
    merged.columns = merged.columns.str.lower()

    # Drop columns whose values duplicate an earlier column (e.g. the join keys
    # from the right-hand side). Selecting with a boolean mask keeps the original
    # dtypes; the previous `merged.T.drop_duplicates().T` round trip cast every
    # column to object. `.to_numpy()` makes the mask positional, which matters
    # because lower-casing can produce repeated column labels.
    is_duplicate_column = merged.T.duplicated().to_numpy()
    merged = merged.loc[:, ~is_duplicate_column]

    merged.to_csv(f'postnr_background_vars{year}.csv')

    df_list.append(merged)

# Write the de-duplicated municipality-level variables to one CSV per year.
for year, kom_year_df in zip([2011, 2015, 2019], kom_split):
    kom_unique = kom_year_df.drop_duplicates()
    kom_unique.to_csv(f'kom_background_vars{year}.csv')

In [32]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
import pyproj


In [33]:
# Environment note (getting pyproj and geopandas to work together was painful):
#   conda remove pyproj
#   pip install pyproj
#   conda install geopandas

# Read the 2019 background variables and convert them to UTM coordinates.
df = pd.read_csv('postnr_background_vars2019.csv', index_col=0)

# One point per school, built from its longitude/latitude columns.
c = [Point(x, y) for x, y in zip(df['lon'], df['lat'])]

# Declare the CRS with an authority string at construction time; the old
# `gdf.crs = {'init': 'epsg:4326'}` form is the deprecated proj4 "init" syntax.
gdf = gpd.GeoDataFrame(df, geometry = c, crs = 'EPSG:4326')

# Reproject from EPSG:4326 to EPSG:32632.
gdf_utm = gdf.to_crs(epsg = 32632)

# Filter out 'efterskoler' by keeping only rows typed 'Grundskoler' (private and
# free schools are kept to avoid too large a drop in the number of observations).
gdf_utm = gdf_utm[gdf_utm['institutionstype2'] == 'Grundskoler']

In [34]:
# Read the election districts and add each district's centroid.
# The path is assembled with os.path.join: the previous literal used backslash
# separators, which only resolves on Windows and relies on invalid string
# escapes ('\s', '\g', '\d').
valg_distrikt_path = os.path.join(
    cwd,
    'scrape_geodata',
    'geodata',
    'dagi_10m_nohist_l1.afstemningsomraade',
    'afstemningsomraade.shp',
)
valg_distrikt = gpd.read_file(valg_distrikt_path, driver = 'ESRI Shapefile')
valg_distrikt['centroid'] = valg_distrikt['geometry'].centroid

In [35]:
# Union of all school geometries in gdf_utm.
schools_geo = gdf_utm.geometry.unary_union

# Same districts, but with the 'centroid' column set as the active geometry.
valg_geo = valg_distrikt.set_geometry('centroid')

In [36]:
# For each election-district centroid, find the nearest school:
# for every valg_geo['centroid'], locate the closest gdf_utm['geometry'] and
# attach that school's 'institutionsnummer'.
pts3 = gdf_utm.geometry.unary_union


def near(point, pts = pts3):
    """Return the 'institutionsnummer' of the school in gdf_utm closest to `point`.

    Parameters
    ----------
    point : shapely geometry
        Geometry to search from (here: an election-district centroid).
    pts : shapely geometry, optional
        Union of candidate school geometries; defaults to the union of
        gdf_utm's geometries computed above.

    Returns
    -------
    The 'institutionsnummer' of the first row in gdf_utm whose geometry equals
    the nearest candidate.

    Raises
    ------
    IndexError
        If no row of gdf_utm has a geometry equal to the nearest candidate.
    """
    nearest_school = nearest_points(point, pts)[1]
    is_nearest = gdf_utm.geometry == nearest_school
    return gdf_utm[is_nearest]['institutionsnummer'].values[0]


# Search from the centroid column explicitly. In a row-wise apply,
# `row.geometry` resolves to the column literally named 'geometry' (the district
# polygon), not to the active 'centroid' geometry, so the previous
# `valg_geo.apply(lambda row: near(row.geometry), axis = 1)` measured from the
# polygon rather than from the centroid.
valg_geo['institutionsnummer'] = valg_geo['centroid'].apply(near)

In [26]:
"""test_valg = valg_geo.iloc[0].centroid

nearest = nearest_points(test_valg, schools_geo)[1]
print(test_valg)
print(nearest)
gdf_utm[gdf_utm['geometry'] == nearest]"""

"test_valg = valg_geo.iloc[0].centroid\n\nnearest = nearest_points(test_valg, schools_geo)[1]\nprint(test_valg)\nprint(nearest)\ngdf_utm[gdf_utm['geometry'] == nearest]"

In [37]:
# Attach school attributes to every election district via 'institutionsnummer'
# (inner join, pandas' default).
merge1 = pd.merge(valg_geo, gdf_utm, how = 'inner', on = 'institutionsnummer')
merge1.columns

Index(['objectid', 'id_lokalid', 'id_namespa', 'status', 'geometrist',
       'virkningfr', 'virkningti', 'virkningsa', 'forretning', 'registreri',
       'registre00', 'registre01', 'forretni00', 'forretni01', 'dagiid',
       'navn_x', 'redigering', 'dataspecif', 'landekode', 'skala',
       'afstemning', 'afstemni00', 'afstemni01', 'kommunekod', 'kommunelok',
       'opstilling', 'opstilli00', 'udtraeksda', 'gmlid', 'geometry_x',
       'centroid', 'institutionsnummer', 'karakter', 'soc_ref',
       'beliggenhedskommunenr', 'lat', 'lon', 'postnr', 'navn_y',
       'institutionstype2', 'institutionstype3', 'coordinates', 'year',
       'salgspris', 'geometry_y'],
      dtype='object')

In [38]:
# Add the 'refs' key from the mapping file to each district, joined on 'objectid'.
mapping = pd.read_csv(cwd+'/scrape_geodata/mapping_geodata_v3.csv', sep = ';')

# Cast the key to str before joining
# (presumably merge1['objectid'] holds strings - TODO confirm).
mapping['objectid'] = mapping['objectid'].astype(str)

objectid_refs = mapping[['objectid', 'refs']]
merge2 = pd.merge(merge1, objectid_refs, how = 'inner', on = 'objectid')

In [39]:
# Join the 2019 election results onto the districts through the shared 'refs' key.
election_result = pd.read_csv(cwd+'/scrape_geodata/data2019.csv')
merge3 = pd.merge(merge2, election_result, how = 'inner', on = 'refs')

# Municipality code as an integer-formatted string, ready for the string join
# against the municipality-level background variables.
merge3['kommunekod'] = merge3['kommunekod'].astype(int).astype(str)

In [40]:
# Add the municipality-level background variables for 2019.
background = pd.read_csv('kom_background_vars2019.csv', index_col = 0)

# Both join keys must be strings: 'kommunekod' on the left, 'KOMKODE' on the right.
background['KOMKODE'] = background['KOMKODE'].astype(str)

all_2019 = pd.merge(
    merge3,
    background,
    how = 'inner',
    left_on = 'kommunekod',
    right_on = 'KOMKODE',
)

# Optional checkpoint (not necessary, and the file is very big):
# all_2019.to_csv('all_2019.csv')


In [42]:
# Optional restart point from the checkpoint file:
# all_2019 = pd.read_csv('all_2019.csv')

# Lower-case all column names so the selection below is case-consistent.
all_2019.columns = all_2019.columns.str.lower()

# Columns carried forward into the final data set.
col_vars = [
    # election district
    'objectid', 'navn_x', 'afstemni00', 'kommunekod',
    # school
    'institutionsnummer', 'karakter', 'soc_ref', 'postnr', 'navn_y',
    'institutionstype2', 'institutionstype3', 'year_x', 'salgspris',
    # election results
    'party', 'votes', 'eligible_count', 'blank', 'invalid', 'total_valid',
    'total_casted', 'time_stamp', 'names', 'constituency', 'voting_area',
    'municipality',
    # municipality-level background variables
    'disindk', 'kriminelitet', 'lavindk', 'skilsmisser', 'andel_indv',
    # geometry of the election district
    'geometry_x',
]

# Debug helper: inspect the party votes of a single district
#all_2019[all_2019['refs'] == 'F809746006.htm'][['party', 'votes']]

In [43]:
# Keep only the columns listed in col_vars; the vote-bloc columns are derived
# from this frame in the following cell.
temp_2019 = all_2019[col_vars]

In [47]:
# Classify every party row as left, right or independent, then sum the votes
# per election district ('objectid').
left_parties = ['Socialdemokratiet', 'Radikale Venstre', 'SF - Socialistisk Folkeparti', 'Enhedslisten - De Rød-Grønne', 'Alternativet']
right_parties = ['Det Konservative Folkeparti', 'Nye Borgerlige', 'Klaus Riskær Pedersen', 'Liberal Alliance', 'Kristendemokraterne', 'Dansk Folkeparti', 'Stram Kurs', 'Venstre, Danmarks Liberale Parti']

# 0/1 indicators per row. Anything not listed in either bloc (including missing
# party values) counts as independent, matching the original `not in` test.
left_dummy = temp_2019['party'].isin(left_parties).astype(int)
right_dummy = temp_2019['party'].isin(right_parties).astype(int)
independent_dummy = 1 - (left_dummy + right_dummy)

# Build a new frame with `assign` rather than writing columns into temp_2019 in
# place: temp_2019 is a column selection of all_2019, so in-place assignment
# triggers pandas' SettingWithCopyWarning. `assign` also keeps this cell
# idempotent on re-run.
temp_2019 = temp_2019.assign(
    left_dummy = left_dummy,
    right_dummy = right_dummy,
    independent_dummy = independent_dummy,
    left_votes = left_dummy * temp_2019['votes'],
    right_votes = right_dummy * temp_2019['votes'],
    independent_votes = independent_dummy * temp_2019['votes'],
)

# Total and per-bloc votes for each election district.
votes_df = temp_2019[['objectid', 'votes', 'left_votes', 'right_votes', 'independent_votes']].groupby(by = 'objectid').sum()

In [49]:
# Remove the per-party columns, attach the per-district vote totals, and keep
# a single row per election district.
per_party_columns = [
    'votes', 'left_votes', 'right_votes', 'independent_votes',
    'left_dummy', 'right_dummy', 'independent_dummy', 'party',
]
pre_merge = temp_2019.drop(columns = per_party_columns)

with_totals = pd.merge(left = pre_merge, right = votes_df, on = 'objectid')
one_row_per_district = with_totals.drop_duplicates(subset = 'objectid')
finalish_df = one_row_per_district.reset_index(drop = True)

In [52]:
# Share of each bloc's votes in the district's total valid votes.
for bloc in ('left', 'right', 'independent'):
    finalish_df[f'{bloc}_share'] = finalish_df[f'{bloc}_votes']/finalish_df['total_valid']

In [56]:
# Select and order the columns of the final data set.
final_columns = [
    # election district
    'objectid', 'afstemni00', 'navn_x', 'kommunekod', 'municipality',
    'constituency', 'postnr', 'time_stamp',
    # votes and shares
    'votes', 'left_votes', 'right_votes', 'independent_votes',
    'blank', 'invalid', 'left_share', 'right_share', 'independent_share',
    # nearest school
    'institutionsnummer', 'navn_y', 'institutionstype2', 'institutionstype3',
    'karakter', 'soc_ref',
    # background variables
    'salgspris', 'disindk', 'kriminelitet', 'lavindk', 'skilsmisser',
    'andel_indv',
    # geometry of the election district
    'geometry_x',
]
final_df = finalish_df[final_columns]

In [57]:
# Give the columns their final names and write the result to disk.
final_names = {
    'navn_x': 'afstem_navn2',
    'afstemni00': 'afstem_navn1',
    'soc_ref': 'karakter_soc_ref',
    'navn_y': 'skole_navn',
    'blank': 'blank_votes',
    'invalid': 'invalid_votes',
    'votes': 'votes',
    'municipality': 'kommunenavn',
    'constituency': 'opst_kreds',
    'geometry_x': 'geometry',
}
final_df = final_df.rename(columns = final_names)

# Written with the 'utf-8-sig' encoding, i.e. UTF-8 with a byte-order mark.
final_df.to_csv('final_data2019.csv', encoding = 'utf-8-sig')