In [13]:
import pandas as pd
import numpy as np
import os
from shapely.geometry import Point, Polygon
import geopandas as gpd
from shapely.ops import nearest_points
import pyproj


# Current working directory; the data-loading cells below prefix their
# relative data paths with it.
cwd = os.getcwd()

### Structure socioeconomic data by year

In [14]:
# Read the 'Samlet' sheet of the socioeconomic workbook.
socioeco_df = pd.read_excel(
    cwd+'/socioeconomic_data/socioeco_data.xlsx',
    sheet_name='Samlet',
)

# Columns kept for the postal-code table and for the municipality table.
postnr_columns = ['POSTNR', 'YEAR', 'KOMKODE', 'SALGSPRIS']
kom_columns = [
    'KOMKODE', 'YEAR', 'DISINDK', 'KRIMINELITET',
    'LAVINDK', 'SKILSMISSER', 'ANDEL_INDV',
]

post_df = socioeco_df.loc[:, postnr_columns]
kom_df = socioeco_df.loc[:, kom_columns]

# One frame per year, stored in the same order as `years`.
years = [2011, 2015, 2019]
post_split = [post_df.loc[post_df['YEAR'].eq(year)] for year in years]
kom_split = [kom_df.loc[kom_df['YEAR'].eq(year)] for year in years]

### Merge school data and socioeconomic data by postal code and municipality

In [15]:
#school_data[school_data['beliggenhedskommunenr'] == 217]

NameError: name 'school_data' is not defined

In [21]:
# For every year: join the cleaned school file with that year's postal-code
# socioeconomic rows, write the result to CSV and keep it in `df_list`.
# `years` and `post_split`/`kom_split` come from the socioeconomic cell above
# and share the same ordering, so the year list is not repeated here.
df_list = []
for soc_data, year in zip(post_split, years):
    # School files are named after the school year, e.g. 2018-2019 for 2019.
    school_data = pd.read_csv(cwd+f'/school_data/school_clean_{year-1}-{year}.csv', index_col=0)

    # Inner join on (municipality code, postal code).
    merged = pd.merge(
        left=school_data,
        right=soc_data,
        left_on=['beliggenhedskommunenr', 'postnr'],
        right_on=['KOMKODE', 'POSTNR'],
    )

    merged.columns = merged.columns.str.lower()

    # Drop columns whose values repeat an earlier column (the right-hand join
    # keys KOMKODE/POSTNR duplicate the left-hand ones after the join).
    # Selecting with a positional boolean mask keeps each column's dtype;
    # the previous `.T.drop_duplicates().T` round trip cast every column to
    # object. `.to_numpy()` makes the mask positional, which is required
    # because lower-casing can produce repeated column labels.
    is_repeated_column = merged.T.duplicated().to_numpy()
    merged = merged.loc[:, ~is_repeated_column]

    merged.to_csv(f'postnr_background_vars{year}.csv')

    df_list.append(merged)

# Municipality-level background variables: one de-duplicated CSV per year.
for kom_data, year in zip(kom_split, years):
    kom_data.drop_duplicates()\
        .to_csv(f'kom_background_vars{year}.csv')

### Merge to closest school within same municipality (2019)

In [22]:
# Project geographical background variables from lat-lon to UTM
df = pd.read_csv('postnr_background_vars2019.csv', index_col=0)
c = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]

# Declare the source CRS (WGS84 lon/lat) when constructing the frame, using
# an authority string. The former `gdf.crs = {'init': 'epsg:4326'}` dict
# form is deprecated by pyproj.
gdf = gpd.GeoDataFrame(df, geometry=c, crs='EPSG:4326')

# EPSG:32632 is the target CRS used for the nearest-school matching below.
gdf_utm = gdf.to_crs(epsg=32632)

# Narrow down to primary schools only
gdf_utm = gdf_utm[gdf_utm['institutionstype2'] == 'Grundskoler']

In [24]:
# Load election data and find centroid for every district
# The path is assembled with os.path.join so it works on every OS; the
# previous literal used backslashes, which only resolve on Windows and
# contain invalid escape sequences ('\s', '\g', '\d').
shapefile_path = os.path.join(
    cwd,
    'scrape_geodata',
    'geodata',
    'dagi_10m_nohist_l1.afstemningsomraade',
    'afstemningsomraade.shp',
)
valg_distrikt = gpd.read_file(shapefile_path, driver='ESRI Shapefile')

# NOTE(review): the centroids are compared with school points in EPSG:32632
# further down; confirm the shapefile's CRS is compatible with that.
valg_distrikt['centroid'] = valg_distrikt['geometry'].centroid

# Make the centroid the active geometry; the district polygons stay
# available in the 'geometry' column.
valg_geo = valg_distrikt.set_geometry('centroid')

In [25]:
# Load municipality codes: first column of the G:H range, header on row 4.
kom_kode_table = pd.read_excel(
    cwd+'/school_data/data_download/kom_koder.xls',
    usecols='G:H',
    header=3,
)
kom_koder = kom_kode_table.iloc[:, 0].tolist()
kom_koder.remove(411)  # delete Christiansø

In [26]:
# Split into a list of [school frame, election-district frame] pairs, one
# pair per municipality code in `kom_koder`.
# The integer cast of the district municipality code does not depend on the
# municipality being selected, so it is computed once instead of per code.
valg_kom_kode = valg_geo['kommunekod'].astype(int)
kommune_split = [
    [
        # Explicit copies: the per-municipality frames get a new column
        # assigned later, which would otherwise write into a filtered slice
        # (SettingWithCopyWarning).
        gdf_utm[gdf_utm['beliggenhedskommunenr'] == kom_kode].copy(),
        valg_geo[valg_kom_kode == kom_kode].copy(),
    ]
    for kom_kode in kom_koder
]

# For each municipality, find closest school to each polling place.
def near(point, pts, schools=None):
    """
    Return the 'institutionsnummer' of the school located at the member of
    `pts` that is closest to `point`.

    Parameters
    ----------
    point : shapely geometry
        Location to measure from.
    pts : shapely geometry
        Candidate school locations (here: the unary union of school points).
    schools : GeoDataFrame, optional
        Frame in which the id of the nearest point is looked up. It needs a
        geometry column and an 'institutionsnummer' column. Defaults to the
        global `gdf_utm`, which is what this function always used before.

    Raises
    ------
    IndexError
        If no row of `schools` has a geometry equal to the nearest point.
    """
    if schools is None:
        schools = gdf_utm
    closest = nearest_points(point, pts)[1]
    hits = schools.loc[schools.geometry == closest, 'institutionsnummer']
    return hits.values[0]

# Loop variables are named so they do not overwrite the global `gdf` and
# `valg_geo` frames.
for kom_schools, kom_districts in kommune_split:
    # nearest_points cannot work with an empty geometry, and there is nothing
    # to assign when a municipality has no districts. Such municipalities are
    # skipped, so their districts end up without a school id.
    if kom_schools.empty or kom_districts.empty:
        continue

    pts = kom_schools.geometry.unary_union

    # Iterate over the ACTIVE geometry of the district frame (the centroid).
    # The earlier `apply(lambda row: near(row.geometry, pts), axis=1)` read
    # the 'geometry' column of each row, i.e. the district polygon, because
    # rows are plain Series and `.geometry` resolves to that column label.
    # The lookup is also restricted to this municipality's schools, so a
    # school elsewhere sharing the same coordinates cannot be returned.
    kom_districts['institutionsnummer'] = [
        near(centroid, pts, kom_schools) for centroid in kom_districts.geometry
    ]

# Stack the per-municipality school frames back into a single frame
gdf_utm = pd.concat([schools for schools, _ in kommune_split])

# Stack the per-municipality election-district frames the same way
valg_geo = pd.concat([districts for _, districts in kommune_split])

# Merge election districts to schools on 'institutionsnummer'
merge1 = pd.merge(
    left=valg_geo,
    right=gdf_utm,
    on='institutionsnummer',
)

In [7]:
# Attach the 'refs' key from the 2019 mapping workbook to merge1 via 'objectid'
mapping = pd.read_excel(cwd+'/scrape_geodata/mapping2019.xlsx')

# Cast the join key to str before merging
# (presumably merge1's objectid is stored as text; verify against the shapefile).
mapping['objectid'] = mapping['objectid'].astype(str)

merge2 = pd.merge(
    left=merge1,
    right=mapping.loc[:, ['objectid', 'refs']],
    on='objectid',
)

NameError: name 'merge1' is not defined

In [8]:
# Join the 2019 election results onto merge2 using the 'refs' key
election_result = pd.read_csv(cwd+'/scrape_geodata/data2019.csv')

merge3 = pd.merge(
    left=merge2,
    right=election_result,
    on='refs',
)

# Store the municipality code as the string form of its integer value
merge3['kommunekod'] = merge3['kommunekod'].astype(int).astype(str)

NameError: name 'merge2' is not defined

In [9]:
# Add the municipality-level background variables for 2019
background = pd.read_csv('kom_background_vars2019.csv', index_col=0)

# The left-hand key 'kommunekod' is a string, so cast KOMKODE to match
background['KOMKODE'] = background['KOMKODE'].astype(str)

all_2019 = pd.merge(
    left=merge3,
    right=background,
    left_on='kommunekod',
    right_on='KOMKODE',
)

# Optional checkpoint; not required and the resulting file is very large
# all_2019.to_csv('all_2019.csv')


NameError: name 'merge3' is not defined

In [10]:
# Lower-case every column label so the selection below can use lower-case
# names throughout.
all_2019.columns = all_2019.columns.str.lower()

# Columns carried forward into the vote aggregation.
col_vars = [
    'objectid', 'navn_x', 'afstemni00', 'kommunekod', 'institutionsnummer',
    'karakter', 'soc_ref', 'postnr', 'navn_y', 'institutionstype2',
    'institutionstype3', 'year_x', 'salgspris', 'party', 'votes',
    'eligible_count', 'blank', 'invalid', 'total_valid', 'total_casted',
    'time_stamp', 'names', 'constituency', 'voting_area', 'municipality',
    'disindk', 'kriminelitet', 'lavindk', 'skilsmisser', 'andel_indv',
    'geometry_x',
]

# Explicit copy: new columns are added to temp_2019 in the next section, and
# writing into a plain column selection of all_2019 raises
# SettingWithCopyWarning.
temp_2019 = all_2019[col_vars].copy()

NameError: name 'all_2019' is not defined

### Create variables for voting counts

In [11]:
# Party blocs; any party in neither list is counted as independent.
left_parties = ['Socialdemokratiet', 'Radikale Venstre', 'SF - Socialistisk Folkeparti', 'Enhedslisten - De Rød-Grønne', 'Alternativet']
right_parties = ['Det Konservative Folkeparti', 'Nye Borgerlige', 'Klaus Riskær Pedersen', 'Liberal Alliance', 'Kristendemokraterne', 'Dansk Folkeparti', 'Stram Kurs', 'Venstre, Danmarks Liberale Parti']

# 0/1 indicators per row, computed vectorised with `isin` instead of looping
# over the party column in Python.
party_col = temp_2019['party']
left_dummy = party_col.isin(left_parties).astype(int)
right_dummy = party_col.isin(right_parties).astype(int)
independent_dummy = (~party_col.isin(left_parties + right_parties)).astype(int)

# `assign` returns a new frame, so no columns are written into a slice of
# all_2019 (avoids SettingWithCopyWarning). The bloc vote columns are the
# row's votes where the indicator is 1 and 0 otherwise.
temp_2019 = temp_2019.assign(
    left_dummy=left_dummy,
    right_dummy=right_dummy,
    independent_dummy=independent_dummy,
    left_votes=left_dummy * temp_2019['votes'],
    right_votes=right_dummy * temp_2019['votes'],
    independent_votes=independent_dummy * temp_2019['votes'],
)

# Total and per-bloc votes summed per election district (objectid).
vote_columns = ['votes', 'left_votes', 'right_votes', 'independent_votes']
votes_df = temp_2019[['objectid'] + vote_columns].groupby(by='objectid').sum()

# District-level attributes without the per-row vote/party columns.
pre_merge = temp_2019.drop(
    vote_columns + ['left_dummy', 'right_dummy', 'independent_dummy', 'party'],
    axis=1,
)

# Keep the first row per district BEFORE joining the totals. This gives the
# same rows as joining first and de-duplicating afterwards (votes_df has one
# row per objectid and an inner merge keeps the left order), but avoids
# merging the full long frame.
finalish_df = pd.merge(
    left=pre_merge.drop_duplicates(subset='objectid'),
    right=votes_df,
    on='objectid',
).reset_index(drop=True)

# Bloc shares relative to the district's total valid votes.
finalish_df['left_share'] = finalish_df['left_votes']/finalish_df['total_valid']
finalish_df['right_share'] = finalish_df['right_votes']/finalish_df['total_valid']
finalish_df['independent_share'] = finalish_df['independent_votes']/finalish_df['total_valid']

NameError: name 'temp_2019' is not defined

In [12]:
# Columns of the final data set, in output order.
final_columns = [
    'objectid', 'afstemni00', 'navn_x', 'kommunekod', 'municipality',
    'constituency', 'postnr', 'time_stamp', 'votes', 'eligible_count',
    'total_casted', 'total_valid', 'left_votes', 'right_votes',
    'independent_votes', 'blank', 'invalid', 'left_share', 'right_share',
    'independent_share', 'institutionsnummer', 'navn_y', 'institutionstype2',
    'institutionstype3', 'karakter', 'soc_ref', 'salgspris', 'disindk',
    'kriminelitet', 'lavindk', 'skilsmisser', 'andel_indv', 'geometry_x',
]

# Descriptive output names; 'geometry_x' becomes the frame's geometry column.
final_column_names = {
    'navn_x': 'afstem_navn2',
    'afstemni00': 'afstem_navn1',
    'soc_ref': 'karakter_soc_ref',
    'navn_y': 'skole_navn',
    'blank': 'blank_votes',
    'invalid': 'invalid_votes',
    'municipality': 'kommunenavn',
    'constituency': 'opst_kreds',
    'geometry_x': 'geometry',
}

# Rename first and name the geometry column explicitly when building the
# GeoDataFrame. Previously the frame was created without any geometry column
# declared and `.geometry` only worked if geopandas fell back to a column
# called 'geometry' after the rename.
final_gdf = gpd.GeoDataFrame(
    finalish_df[final_columns].rename(columns=final_column_names),
    geometry='geometry',
)

# NOTE(review): the area is in the squared units of the geometry's CRS —
# confirm the CRS is a projected (metre-based) one.
final_gdf['area'] = final_gdf.geometry.area
final_gdf['pop_density'] = final_gdf['eligible_count']/final_gdf['area']

final_gdf.to_csv('final_data2019.csv', encoding = 'utf-8-sig')

NameError: name 'finalish_df' is not defined