In [None]:
import pandas as pd
import numpy as np
# Display settings: never truncate columns; previews show at most 10 rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.min_rows', 10),
    ('display.max_rows', 10),
):
    pd.set_option(option_name, option_value)

In [None]:
# Columns of the trip file that the rest of the notebook works with.
trip_columns = [
    'H_ID_Lok', 'P_ID', 'W_ID', 'W_GEW', 'W_DETAIL', 'hwzweck1', 'wegkm',
    'W_VM_B', 'W_VM_C', 'W_VM_H', 'hvm', 'weg_intermod', 'vm_kombi', 'H_CS',
    'alter_gr', 'HP_SEX', 'P_BIL', 'oek_status', 'hhgr_gr2', 'vpedrad',
    'P_VAUTO', 'P_STKFZ', 'P_FKARTE', 'saison', 'PRAEZISION', 'GITTER_1km',
    'GITTER_SO_500m', 'GITTER_SO_1km', 'PRAEZISION_SO', 'GITTER_ZO_500m',
    'GITTER_ZO_1km', 'PRAEZISION_ZO',
]
# The file is semicolon-separated; one row per trip.
d = pd.read_csv(
    'D:/B3_Lokal-Datensatzpaket/CSV/MiD2017_Lokal_Wege.csv',
    sep=';',
    usecols=trip_columns,
)
d

In [None]:
# Sample size before any filtering: households, persons and trips.
# A person is identified by the pair (household id, person id).
n_households = d['H_ID_Lok'].nunique()
n_persons = d[['H_ID_Lok', 'P_ID']].drop_duplicates().shape[0]
n_trips = len(d)
print("HH in the dataset: " + str(n_households))
print("Persons in the dataset:", n_persons)
print('Trips in the dataset: ', n_trips)

In [None]:
# Parse the trip length (decimal comma -> float) and keep trips of at most 100 km.
# The same filter also removes missing/implausible lengths; around 10% of trips are lost.
d['wegkm'] = d['wegkm'].str.replace(',', '.').astype(float)
# .copy() makes the filtered frame independent of the original, so adding columns
# below does not trigger pandas' SettingWithCopyWarning.
d = d[d['wegkm'] <= 100].copy()
# Natural log of the trip length, floored at 0 (trips shorter than 1 km get 0).
# clip() is the vectorised form of the former per-row "0 if x < 0 else x".
d['dist_ln'] = np.log(d['wegkm']).clip(lower=0)

In [None]:
# Mode choice variable built from "hvm" and "W_VM_B" so that bicycle types are distinguished.
# 1: foot, 21: c-bike, 22: e-bike, 3: car (passenger), 4: car (driver), 5: pt
def create_choice(row):
    """Return the mode-choice code for one trip record.

    Parameters
    ----------
    row : mapping with the keys 'hvm' and 'W_VM_B'
        One trip (e.g. a DataFrame row).

    Returns
    -------
    21 or 22 when ``hvm`` is 2 (bicycle), otherwise the ``hvm`` value unchanged.
    """
    main_mode = row['hvm']
    if main_mode != 2:
        return main_mode
    # NOTE(review): W_VM_B == 0 is mapped to the conventional bike (21) and
    # everything else to the e-bike (22) -- confirm this against the survey codebook.
    return 21 if row['W_VM_B'] == 0 else 22
# Evaluate create_choice once per trip (row-wise) and store the result.
d['choice'] = d.apply(create_choice, axis='columns')

In [None]:
# Weighted mode share before trips with missing data are removed; it is the
# reference used later to readjust the weights.
d['W_GEW'] = d['W_GEW'].str.replace(',', '.').astype(float)  # decimal comma -> float
total_weight_before = d['W_GEW'].sum()
avweight = total_weight_before / len(d)
print(avweight)
weight_per_mode_before = d.groupby('choice')['W_GEW'].sum()
modeshare_before_processing_weighted = weight_per_mode_before / total_weight_before
print(modeshare_before_processing_weighted)

In [None]:
# Keep only trips whose origin (SO) AND destination (ZO) carry a 1km grid cell name.
# na=False states explicitly that a missing cell name counts as "no cell", instead
# of leaving the outcome to how NaN propagates through the boolean mask.
has_origin_cell = d['GITTER_SO_1km'].str.contains('1km', na=False)
has_destination_cell = d['GITTER_ZO_1km'].str.contains('1km', na=False)
d = d[has_origin_cell & has_destination_cell]

In [None]:
# Drop trips where one of these attributes carries a "missing" code.
# column -> codes that mark a missing value
missing_codes = {
    'HP_SEX': [9],
    'alter_gr': [99],
    'P_BIL': [9],
    'hwzweck1': [99],
    'hvm': [9, 703],
    'vpedrad': [9, 200],
    'P_FKARTE': [99, 200],
    'P_VAUTO': [9, 200],
    'P_STKFZ': [9, 200],
}
for column, codes in missing_codes.items():
    d = d[~d[column].isin(codes)]

In [None]:
# Recode P_STKFZ: 1 = access to a car (at least sometimes), 0 = no access.
# The mappings have their own names so that the built-in `dict` is not
# overwritten for the rest of the notebook.
car_access_recode = {1: 1, 2: 1, 3: 0}
d = d.replace({"P_STKFZ": car_access_recode})

# Recode P_FKARTE: 1 = codes 3, 4 or 5; 0 = no fixed-rate ticket.
# The recoded column is renamed to "ticket".
ticket_recode = {1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 0, 7: 0}
d = d.replace({"P_FKARTE": ticket_recode})
d = d.rename(columns={"P_FKARTE": "ticket"})


In [None]:
# Drop inconsistent trips: no car available (P_STKFZ == 0) although the main
# mode is car as passenger (3) or car as driver (4).
no_car_available = d['P_STKFZ'] == 0
travelled_by_car = d['hvm'].isin([3, 4])
d = d[~(no_car_available & travelled_by_car)]

In [None]:
# Binary availability per bicycle type, derived from vpedrad:
# codes 1 and 3 -> conventional bike available, codes 2 and 3 -> e-bike available.
d['cbikeav'] = d['vpedrad'].isin([1, 3]).astype('int64')
d['ebikeav'] = d['vpedrad'].isin([2, 3]).astype('int64')

# Drop trips whose chosen bicycle type is flagged as unavailable.
cbike_trip_without_cbike = (d['choice'] == 21) & (d['cbikeav'] == 0)
d = d[~cbike_trip_without_cbike]
ebike_trip_without_ebike = (d['choice'] == 22) & (d['ebikeav'] == 0)
d = d[~ebike_trip_without_ebike]

In [None]:
# Attach spatial type (Raumtyp), slope, pt departures and cycling-infrastructure
# coverage of the origin (SO) and destination (ZO) 1km cell to every trip.

# Attributes of all 1km cells in Germany.
cells = pd.read_csv('C:/Users/arning.FUL/Desktop/11_Mode_Choice/1km_Raumtyp_Slope_ptqual_infra.csv', usecols=['cellname', 'Raumtyp', 'slope', 'departures', 'cycling_coverage'])
# Index the table by cell name once and reuse it for all eight lookups
# (it used to be re-indexed for every single column).
cells_by_name = cells.set_index('cellname')

# Prefix of the new trip column -> source column in the cell table.
cell_attributes = {
    'Raumtyp': 'Raumtyp',
    'slope': 'slope',
    'departures': 'departures',  # original note: counted within a 2km radius around the cell center
    'cyclinfra': 'cycling_coverage',
}
for trip_prefix, cell_column in cell_attributes.items():
    for end in ('SO', 'ZO'):
        d[f'{trip_prefix}_{end}'] = d[f'GITTER_{end}_1km'].map(cells_by_name[cell_column])

# Drop trips whose origin or destination has no valid Raumtyp (only 300 trips).
valid_raumtyp = [11.0, 12.0, 21.0, 22.0]
d = d[d['Raumtyp_SO'].isin(valid_raumtyp)]
d = d[d['Raumtyp_ZO'].isin(valid_raumtyp)]

# Drop trips without a slope value at either end (0 trips).
d = d.dropna(subset=['slope_SO', 'slope_ZO'])

# Drop trips without cycling-infrastructure coverage at either end (only 95 trips).
d = d.dropna(subset=['cyclinfra_SO', 'cyclinfra_ZO'])

# Cells without a departures value count as 0 departures.
d['departures_SO'] = d['departures_SO'].fillna(0)
d['departures_ZO'] = d['departures_ZO'].fillna(0)

In [None]:
# Post-processing of pt departures.
log_departures = {}
for end in ('SO', 'ZO'):
    column = f'departures_{end}'
    # 0 departures -> 1 so that the logarithm is defined (log(1) = 0)
    d[column] = d[column].replace({0: 1})
    # natural logarithm; negative results are set to 0
    log_departures[end] = np.log(d[column]).clip(lower=0)

# Testing showed the same parameter for O and D, therefore the two ends are averaged.
d['departures_ln'] = ((log_departures['SO'] + log_departures['ZO']) / 2).round(decimals=2)
d['departures'] = (d['departures_SO'] + d['departures_ZO']) / 2 # for plotting
# The per-end columns are no longer needed.
d = d.drop(columns=['departures_SO', 'departures_ZO'])

# Post-processing of the gradient: use the steeper of the origin and destination
# slopes, rounded to 2 decimals.
# A column-wise max replaces the former per-row DataFrame.apply (one Python call
# per trip). Both slope columns are free of NaN at this point because such trips
# were dropped when the cell attributes were attached, so the result is the same.
d['grad'] = d[['slope_SO', 'slope_ZO']].max(axis=1).round(2)
d = d.drop(columns=['slope_SO', 'slope_ZO'])

# Post-processing of cycling infrastructure: mean coverage of origin and
# destination cell, rounded to 3 decimals; the per-end columns are dropped.
origin_infra = d['cyclinfra_SO']
destination_infra = d['cyclinfra_ZO']
d['cyclinfra'] = ((origin_infra + destination_infra) / 2).round(3)
d = d.drop(columns=['cyclinfra_SO', 'cyclinfra_ZO'])

# Where the spatial typology of origin and destination differs, use the higher
# (more peripheral) value.
d['Raumtyp_SO'] = d['Raumtyp_SO'].astype(int)
d['Raumtyp_ZO'] = d['Raumtyp_ZO'].astype(int)
# Column-wise max instead of a per-row DataFrame.apply: same values, but no
# Python-level call for every trip.
d['spattyp'] = d[['Raumtyp_SO', 'Raumtyp_ZO']].max(axis=1)

In [None]:
# Weighted mode share after data processing, printed next to the share from
# before processing to show the distortion the filtering introduced.
weight_per_mode_after = d.groupby('choice')['W_GEW'].sum()
modeshare_after_processing_weighted = weight_per_mode_after / d['W_GEW'].sum()
for shares in (modeshare_before_processing_weighted, modeshare_after_processing_weighted):
    print(shares)

In [None]:
# Per-mode correction factor = weighted share before processing / weighted share
# after processing. It adjusts for the distortion introduced by the filtering.
modes = [1, 3, 4, 5, 21, 22]
reweighting = pd.DataFrame({
    'mode': modes,
    'reweight': [
        modeshare_before_processing_weighted[mode] / modeshare_after_processing_weighted[mode]
        for mode in modes
    ],
}).set_index('mode')
# mode code -> correction factor
reweight_dict = reweighting['reweight'].to_dict()
d['W_GEW_new'] = d['choice'].map(reweight_dict) * d['W_GEW']
# Rescale the new weights to an average of 1.
# Series.mean() replaces d[['W_GEW_new']].mean()[0]: the positional [0] lookup on
# a label-indexed Series is deprecated in recent pandas versions.
d['W_GEW_new'] = d['W_GEW_new'] / d['W_GEW_new'].mean()
#reweighting.to_csv('E:/PLUGIN/11_Mode_Choice/mode-reweighting.csv')

In [None]:
# Dummy coding of the categorical variables.
# (Mobility tool availabilities are already coded as three binary dummy variables.)

# (source column, prefix of the dummy columns): trip purpose, spatial typology,
# season, age, gender, education, household economic status, household size.
# pd.get_dummies already names the columns "<prefix>_<value>".
dummy_specs = [
    ('hwzweck1', 'purp'),
    ('spattyp', 'spattyp'),
    ('saison', 'season'),
    ('alter_gr', 'age'),
    ('HP_SEX', 'sex'),
    ('P_BIL', 'edu'),
    ('oek_status', 'eco'),
    ('hhgr_gr2', 'hhsize'),
]
for source_column, prefix in dummy_specs:
    d = d.join(pd.get_dummies(d[source_column], prefix=prefix, dtype=int))

# Age and trip purpose stay in the data under shorter names; they are used later
# for the analysis in model validation.
d = d.rename(columns={"alter_gr": "age", "hwzweck1": "purp"})

# Drop the remaining original categorical variables.
d = d.drop(columns=[
    'HP_SEX', 'P_BIL', 'hhgr_gr2', 'oek_status', 'saison',
    'Raumtyp_SO', 'Raumtyp_ZO', 'spattyp',
])

In [None]:
# Fixed seed: without it every run exports different random subsamples, so results
# based on these files could not be reproduced.
SAMPLE_SEED = 42

# FULL SAMPLE
d.to_csv('F:/Processed Data/ModeChoice_input.csv')

# 100k SAMPLE #
d.sample(100000, random_state=SAMPLE_SEED).to_csv('F:/Processed Data/ModeChoice_input_100000.csv')

# 10k SAMPLE #
d.sample(10000, random_state=SAMPLE_SEED).to_csv('F:/Processed Data/ModeChoice_input_10000.csv')

In [None]:
# SAMPLES of all trips of households from within/outside a specific city
# (GITTER_1km is the household location).

# Raumtyp.csv holds the bridge between municipality keys (ags) and 1km cells;
# it is used to list the cells that lie within a municipality.
bridge_columns = ['cellname', 'ags']
cells_ags = pd.read_csv('E:/PLUGIN/11_Mode_Choice/Raumtyp.csv', usecols=bridge_columns)
cells_ags

In [None]:
# Wuppertal (ags 5124000): split the trips by whether the household cell lies in the city.
cells_wuppertal = cells_ags[cells_ags['ags'] == 5124000]

in_wuppertal = d['GITTER_1km'].isin(cells_wuppertal['cellname'].tolist())
# .copy() gives two independent frames; casting columns on plain slices of `d`
# triggers pandas' SettingWithCopyWarning.
d_inWT = d[in_wuppertal].copy()
d_outWT = d[~in_wuppertal].copy()

# Dummy columns that are cast to int before export (also used by the Münster cell).
columns_to_convert = [
    "purp_1", "purp_2", "purp_3", "purp_4", "purp_5", "purp_6", "purp_7",
    "spattyp_11", "spattyp_12", "spattyp_21", "spattyp_22", "season_1", "season_2", "season_3", "season_4",
    "age_1", "age_2", "age_3", "age_4", "age_5", "age_6", "age_7", "age_8",
    "sex_1", "sex_2", "edu_1", "edu_2", "edu_3", "edu_4", "edu_5", "edu_6",
    "eco_1", "eco_2", "eco_3", "eco_4", "eco_5"
]

d_outWT[columns_to_convert] = d_outWT[columns_to_convert].astype(int)
d_inWT[columns_to_convert] = d_inWT[columns_to_convert].astype(int)

# Save samples: trips from inside the city -> "sim" file, all other trips -> "input" file.
d_inWT.to_csv('F:/Processed Data/cityvalidation/ModeChoice_sim_inWT.csv')
d_outWT.to_csv('F:/Processed Data/cityvalidation/ModeChoice_input_outWT.csv')

In [None]:
# Münster (ags 5515000): split the trips by whether the household cell lies in the city.
# The cell list has its own name; it used to overwrite `cells_wuppertal`.
cells_muenster = cells_ags[cells_ags['ags'] == 5515000]

in_muenster = d['GITTER_1km'].isin(cells_muenster['cellname'].tolist())
# .copy() gives two independent frames; casting columns on plain slices of `d`
# triggers pandas' SettingWithCopyWarning.
d_inMS = d[in_muenster].copy()
d_outMS = d[~in_muenster].copy()

# columns_to_convert is the dummy-column list defined in the Wuppertal cell.
d_inMS[columns_to_convert] = d_inMS[columns_to_convert].astype(int)
d_outMS[columns_to_convert] = d_outMS[columns_to_convert].astype(int)

# Save samples: trips from inside the city -> "sim" file, all other trips -> "input" file.
d_inMS.to_csv('F:/Processed Data/cityvalidation/ModeChoice_sim_inMS.csv')
d_outMS.to_csv('F:/Processed Data/cityvalidation/ModeChoice_input_outMS.csv')

In [None]:
# Samples for k-fold cross validation (k=5).
K_FOLDS = 5
# Fixed seed so that the folds are the same on every run.
d_shuffled = d.sample(frac=1, random_state=42)

# Split the row positions once (same fold sizes as np.array_split on the frame
# itself) instead of re-splitting the whole DataFrame for every file written.
fold_positions = np.array_split(np.arange(len(d_shuffled)), K_FOLDS)
folds = [d_shuffled.iloc[positions] for positions in fold_positions]

for held_out in range(K_FOLDS):
    # simulation sample: the held-out fold (one fifth of the full sample)
    folds[held_out].to_csv(f'F:/Processed Data/kfcv/ModeChoice_sim_k{held_out + 1}.csv')
    # estimation sample: the other four folds; the file name lists their numbers, e.g. k2345
    kept = [fold for fold in range(K_FOLDS) if fold != held_out]
    kept_tag = ''.join(str(fold + 1) for fold in kept)
    pd.concat([folds[fold] for fold in kept]).to_csv(f'F:/Processed Data/kfcv/ModeChoice_input_k{kept_tag}.csv')

In [None]:
#########################
### DESCRIPTIVE STATS ###
#########################

In [None]:
#categorical, unweighted
def unique_entry_share(df, columns):
    """Return the unweighted share of every distinct value per column.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of column names in ``df``

    Returns
    -------
    dict mapping each column name to the Series produced by
    ``value_counts(normalize=True)`` for that column.
    """
    return {name: df[name].value_counts(normalize=True) for name in columns}

# Categorical columns to describe: choice, availabilities and all dummy groups.
selected_columns = (
    ['choice', 'cbikeav', 'ebikeav', 'P_STKFZ', 'ticket']
    + [f'age_{i}' for i in range(1, 9)]
    + ['sex_1', 'sex_2']
    + [f'edu_{i}' for i in range(1, 7)]
    + [f'eco_{i}' for i in range(1, 6)]
    + [f'purp_{i}' for i in range(1, 8)]
    + [f'season_{i}' for i in range(1, 5)]
    + ['spattyp_11', 'spattyp_12', 'spattyp_21', 'spattyp_22']
)
unique_entry_shares = unique_entry_share(d, selected_columns)

# One table per column, separated by a blank line.
for column in unique_entry_shares:
    print("Unique entry share for column '{}':".format(column))
    print(unique_entry_shares[column])
    print()

In [None]:
#categorical, weighted
def weighted_unique_entry_share(df, columns, weight_column):
    """Return the weighted share of every distinct value per column.

    Parameters
    ----------
    df : pandas.DataFrame
    columns : iterable of column names in ``df``
    weight_column : name of the column holding the weights

    Returns
    -------
    dict mapping each column name to a Series indexed by the column's distinct
    values: summed weight of the value divided by the summed weight of all rows.
    """
    return {
        name: df.groupby(name)[weight_column].sum() / df[weight_column].sum()
        for name in columns
    }

# Same tables as the unweighted ones, weighted with the readjusted trip weights.
weight_column = 'W_GEW_new'
weighted_unique_entry_shares = weighted_unique_entry_share(d, selected_columns, weight_column)

# One table per column, separated by a blank line.
for column in weighted_unique_entry_shares:
    print("Weighted unique entry share for column '{}':".format(column))
    print(weighted_unique_entry_shares[column])
    print()

In [None]:
#continuous variables: boxplot plus unweighted and weighted mean
import matplotlib.pyplot as plt
medianprops = {'color': 'black'}


def _boxplot_with_means(column, heading, tick_label, filename):
    """Draw a horizontal boxplot of ``d[column]``, save it as SVG and print the means.

    Parameters
    ----------
    column : name of the column of ``d`` to describe
    heading : text printed before the figure
    tick_label : label shown next to the box
    filename : path of the SVG file to write

    The weighted mean uses the readjusted trip weights ``d['W_GEW_new']``.
    """
    print(heading)
    plt.figure(figsize=(5, 2), dpi=1000)
    plt.rc('font', family='Arial', size=10)
    plt.boxplot(d[column], vert=False, showfliers=True, medianprops=medianprops, widths=0.25)
    plt.yticks([1], [tick_label])
    # saved once, after the tick label is in place
    plt.savefig(filename, format='svg')
    plt.show()
    print("mean:"+ str(d[column].mean()))
    print("weighted mean:"+ str(np.average(d[column], weights=d['W_GEW_new'])))


# (column, printed heading, tick label, output file)
for boxplot_spec in [
    ('wegkm', "distance", 'distance\n[km]', "dist_boxplot.svg"),
    ('grad', "gradient", 'gradient\n[%]', "grad_boxplot.svg"),
    ('cyclinfra', "bic. infra.", 'bic. infra.', "infra_boxplot.svg"),
    ('departures', "departures", 'departures', "depart_boxplot.svg"),
]:
    _boxplot_with_means(*boxplot_spec)

In [None]:
# Share of trips undertaken by 60+ (age groups 6, 7, 8) for leisure purpose
# (purpose 6) among all e-bike trips (choice 22).
# The dummy columns are named purp_* and age_* (see the dummy-coding cell);
# 'hwzweck1_6' and 'alter_gr_6/7/8' do not exist in `d` and raised a KeyError.
ebike_trips = d[d['choice'] == 22]
is_leisure = ebike_trips['purp_6'] == 1
is_60_plus = (ebike_trips[['age_6', 'age_7', 'age_8']] == 1).any(axis=1)
len(ebike_trips[is_leisure & is_60_plus]) / len(ebike_trips)

In [None]:
# Share of e-bike trips (choice 22) made by persons without a conventional
# bike available (cbikeav == 0).
ebike_trips = d[d['choice'] == 22]
ebike_trips_without_cbike = ebike_trips[ebike_trips['cbikeav'] == 0]
len(ebike_trips_without_cbike) / len(ebike_trips)

In [None]:
# Sample size after data processing: households, persons and trips.
# A person is identified by the pair (household id, person id).
n_households = d['H_ID_Lok'].nunique()
n_persons = d[['H_ID_Lok', 'P_ID']].drop_duplicates().shape[0]
n_trips = len(d)
print("HH in the dataset: " + str(n_households))
print("Persons in the dataset:", n_persons)
print('Trips in the dataset: ', n_trips)

In [None]:
# Correlation matrix between variables: first dummy-code the mode choice.
# pd.get_dummies names the new columns "choice_<code>" (choice_1, choice_21, ...).
dummies_choice = pd.get_dummies(d['choice'], prefix='choice')
d = d.join(dummies_choice)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlated column -> tick label. Keeping both in one table guarantees one label
# per column; the former separate label list had 24 entries ('depart_O' and
# 'depart_D') for 23 columns, where only the averaged 'departures_ln' exists.
correlation_labels = {
    'choice_1': 'choice_foot',
    'choice_21': 'choice_cbike',
    'choice_22': 'choice_ebike',
    'choice_3': 'choice_carp',
    'choice_4': 'choice_card',
    'choice_5': 'choice_pt',
    'P_STKFZ': 'MT_car',
    'ticket': 'MT_ticket',
    'grad': 'grad',
    'cyclinfra': 'infra',
    'departures_ln': 'depart',
    'spattyp_11': 'spatyp_11',
    'spattyp_21': 'spatyp_21',
    'spattyp_22': 'spatyp_22',
    'purp_3': 'purp_3',
    'age_1': 'age_1',
    'age_2': 'age_2',
    'age_7': 'age_7',
    'edu_1': 'edu_1',
    'edu_4': 'edu_4',
    'edu_5': 'edu_5',
    'eco_3': 'eco_3',
    'eco_5': 'eco_5',
}
columns_to_correlate = list(correlation_labels)
custom_labels = list(correlation_labels.values())

# Calculate the correlation matrix (categorical variables are looked at as continuous here! Just supposed to give a first overview!)
correlation_matrix = d[columns_to_correlate].corr()

plt.figure(figsize=(25, 20))
ax = sns.heatmap(correlation_matrix, square = True, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"size": 14}, vmin=-1, vmax=1, xticklabels=custom_labels, yticklabels=custom_labels, cbar = False)
#cbar_ax = ax.figure.add_axes([0.1, -0.05, 0.8, 0.05])  # Adjust the position and size of the color bar as needed
#cbar = ax.figure.colorbar(ax.collections[0], cax=cbar_ax, orientation='horizontal')
# x tick labels go on top of the matrix instead of below it
ax.tick_params(axis='both', which='major', labelsize=14, bottom=False, labelbottom=False, top=True, labeltop=True)  # Adjust the text size as needed
plt.xticks(rotation=45)
plt.savefig("corr_matrix.svg", format='svg')
plt.show()

In [None]:
############
### MAPS ###
############
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import pandas as pd
import numpy as np

In [None]:
#spatyp
df = pd.read_csv('../1km_Raumtyp_Slope.csv', usecols = ['cellname','Raumtyp'])

# Grid coordinates are read from fixed positions of the cell name: characters
# 9-12 -> x, characters 4-7 -> y. Vectorised string slicing replaces the former
# iterrows() loop over every cell.
df['x'] = df['cellname'].str[9:13].astype('int64')
df['y'] = df['cellname'].str[4:8].astype('int64')

# Keep only cells with one of the four spatial types; .copy() so that the column
# assignment below works on an independent frame (no SettingWithCopyWarning).
df = df[df['Raumtyp'].isin([11.0,12.0,21.0,22.0])].copy()

# spatial type as a string label ('11', '12', '21', '22')
df['Raumtyp'] = df['Raumtyp'].astype(int).astype(str)
df = df.drop(['cellname'], axis=1)
#df
#df

In [None]:
# Map of the spatial typology: one small square per 1km cell, coloured by type.
# (The former df.pivot(...) result was never used and has been removed.)
category_colors = {
    11: 'lightcoral',
    12: 'pink',
    21: 'teal',
    22: 'palegreen'
}
# legend text per category, in the order of category_colors
category_names = ['urban metropolitan','urban regiopolitan','rural close to city', 'rural peripheral']

plt.figure(figsize=(20, 22))

plt.rc('font', family='Arial', size=38)
# one scatter layer per category; 'Raumtyp' holds the type as a string
for category, color in category_colors.items():
    subset = df[df['Raumtyp'] == str(category)]
    plt.scatter(subset['x'], subset['y'], c=color, marker='s', s=0.6, label=f'{category}')

#legend: proxy markers built from the same colour table, so legend and map cannot drift apart
legend_handles = [
    Line2D([0], [0], marker='s', markersize=np.sqrt(20), color=color, linestyle='None')
    for color in category_colors.values()
]
plt.legend(legend_handles, category_names, markerscale=5, scatterpoints=1, title="spatial typology")

#plt.xlabel('X')
#plt.ylabel('Y')
plt.xticks([])
plt.yticks([])

plt.gca().set_aspect('equal', adjustable='box')
plt.savefig("spatyp_map.jpg", format='jpg')
plt.show()

In [None]:
#departures
df = pd.read_csv('../1km_Raumtyp_Slope_ptqual.csv', usecols = ['cellname','departures'])

# Grid coordinates are read from fixed positions of the cell name: characters
# 9-12 -> x, characters 4-7 -> y. Vectorised string slicing replaces the former
# iterrows() loop over every cell.
df['x'] = df['cellname'].str[9:13].astype('int64')
df['y'] = df['cellname'].str[4:8].astype('int64')

# Only cells with a departures value are plotted.
df = df.drop(['cellname'], axis=1).dropna()

# Natural log of departures, floored at 0 (log(0) = -inf also becomes 0);
# clip() is the vectorised form of the former per-row max(x, 0).
df['departures_p'] = np.log(df['departures']).clip(lower=0)

df['departures'] = df['departures'].astype(int)

In [None]:
# Map of logarithmized pt departures: one small square per 1km cell.
log_departures = df['departures_p']
# colour scale spans the full range of the plotted values
norm = plt.Normalize(vmin=log_departures.min(), vmax=log_departures.max())

plt.figure(figsize=(20, 22))
plt.rc('font', family='Arial', size=38)

scatter = plt.scatter(
    df['x'], df['y'],
    c=log_departures, marker='s', s=0.6, cmap='viridis', norm=norm,
)

cbar = plt.colorbar(scatter)
cbar.set_label('logarithmized public transport departures')

# no axis ticks or labels; equal aspect keeps the grid cells square
#plt.xlabel('X')
#plt.ylabel('Y')
plt.xticks([])
plt.yticks([])
plt.gca().set_aspect('equal', adjustable='box')

plt.savefig("departures_map.jpg", format='jpg')
plt.show()

In [None]:
#gradient
df = pd.read_csv('../1km_Raumtyp_Slope.csv', usecols = ['cellname','slope'])

# Grid coordinates are read from fixed positions of the cell name: characters
# 9-12 -> x, characters 4-7 -> y. Vectorised string slicing replaces the former
# iterrows() loop over every cell.
df['x'] = df['cellname'].str[9:13].astype('int64')
df['y'] = df['cellname'].str[4:8].astype('int64')

# Only cells with a slope value are plotted.
df = df.drop(['cellname'], axis=1).dropna()

In [None]:
# Map of the terrain gradient: one small square per 1km cell.
slope = df['slope']
# colour scale runs from the smallest slope up to a fixed upper bound of 20
norm = plt.Normalize(vmin=slope.min(), vmax=20)

plt.figure(figsize=(20, 22))
plt.rc('font', family='Arial', size=38)

scatter = plt.scatter(
    df['x'], df['y'],
    c=slope, marker='s', s=0.6, cmap='Greys', norm=norm,
)

cbar = plt.colorbar(scatter)
cbar.set_label('terrain gradient')

# no axis ticks or labels; equal aspect keeps the grid cells square
#plt.xlabel('X')
#plt.ylabel('Y')
plt.xticks([])
plt.yticks([])
plt.gca().set_aspect('equal', adjustable='box')

plt.savefig("gradient_map.jpg", format='jpg')

plt.show()

In [None]:
#infrastructure
df = pd.read_csv('../1km_Raumtyp_Slope_ptqual_infra.csv', usecols = ['cellname','cycling_coverage'])

# Grid coordinates are read from fixed positions of the cell name: characters
# 9-12 -> x, characters 4-7 -> y. Vectorised string slicing replaces the former
# iterrows() loop over every cell.
df['x'] = df['cellname'].str[9:13].astype('int64')
df['y'] = df['cellname'].str[4:8].astype('int64')

# Only cells with a coverage value are plotted.
df = df.drop(['cellname'], axis=1).dropna()

In [None]:
# Map of bicycle infrastructure coverage: one small square per 1km cell.
coverage = df['cycling_coverage']
# colour scale spans the full range of the plotted values
norm = plt.Normalize(vmin=coverage.min(), vmax=coverage.max())

plt.figure(figsize=(20, 22))
plt.rc('font', family='Arial', size=38)

scatter = plt.scatter(
    df['x'], df['y'],
    c=coverage, marker='s', s=0.6, cmap='viridis', norm=norm,
)

cbar = plt.colorbar(scatter)
cbar.set_label('bicycle infrastructure coverage')

# no axis ticks or labels; equal aspect keeps the grid cells square
#plt.xlabel('X')
#plt.ylabel('Y')
plt.xticks([])
plt.yticks([])
plt.gca().set_aspect('equal', adjustable='box')

plt.savefig("infra_map.jpg", format='jpg')
plt.show()