In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

In [2]:
# Read the datasets by defining the path
commuter_path = 'raw_data/commuter.csv'
reachability_path = 'raw_data/reachability.csv'
tourism_path = 'raw_data/tourism.csv'

commuter_data = pd.read_csv('raw_data/commuter.csv', encoding='latin_1')
reachability_data = pd.read_csv('raw_data/reachability.csv', encoding='latin_1')
tourism_data = pd.read_csv('raw_data/tourism.csv', encoding='latin_1')

## Commuter Data Fixing

The date columns can be dropped towards the end. 

In [3]:
# Columns in the data
commuter_data.columns

Index(['ags2', 'bundesland', 'ags5', 'kreis', 'kr_ep', 'kr_ap', 'kr_sp',
       'kr_ep_svb', 'kr_ap_svb', 'kr_sp_svb', 'kr_pen_50', 'kr_pen_150',
       'kr_pen_300', 'kr_pen_202001', 'kr_pen_202002', 'kr_pen_202003',
       'kr_pen_202004', 'kr_pen_202005', 'kr_pen_202006', 'kr_pen_202007',
       'kr_pen_202008', 'kr_pen_202009', 'kr_pen_202010', 'kr_pen_202011'],
      dtype='object')

In [4]:
commuter_data.head()

Unnamed: 0,ags2,bundesland,ags5,kreis,kr_ep,kr_ap,kr_sp,kr_ep_svb,kr_ap_svb,kr_sp_svb,...,kr_pen_202002,kr_pen_202003,kr_pen_202004,kr_pen_202005,kr_pen_202006,kr_pen_202007,kr_pen_202008,kr_pen_202009,kr_pen_202010,kr_pen_202011
0,1,Schleswig-Holstein,1001,"Flensburg, Stadt",23363,11096,12267,52.4,34.2,27.7,...,9,-16,-42,-34,8,9,18,13,-3,-8
1,1,Schleswig-Holstein,1002,"Kiel, Landeshauptstadt",61487,27950,33537,49.1,30.3,27.1,...,2,-17,-44,-35,-7,-11,0,3,-13,-10
2,1,Schleswig-Holstein,1003,"LÃ¼beck, Hansestadt",43805,24823,18982,43.6,31.3,17.9,...,6,-12,-36,-28,3,-5,5,6,-9,-7
3,1,Schleswig-Holstein,1004,"NeumÃ¼nster, Stadt",22314,11699,10615,54.5,38.6,25.9,...,6,-10,-33,-28,1,-8,2,2,-8,2
4,1,Schleswig-Holstein,1051,Dithmarschen,6825,12991,6166,70.8,73.5,-15.8,...,-4,-7,-31,-19,27,19,38,39,20,16


In [5]:
commuter_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ags2           401 non-null    int64  
 1   bundesland     401 non-null    object 
 2   ags5           401 non-null    int64  
 3   kreis          401 non-null    object 
 4   kr_ep          401 non-null    int64  
 5   kr_ap          401 non-null    int64  
 6   kr_sp          401 non-null    int64  
 7   kr_ep_svb      401 non-null    float64
 8   kr_ap_svb      401 non-null    float64
 9   kr_sp_svb      401 non-null    float64
 10  kr_pen_50      401 non-null    float64
 11  kr_pen_150     401 non-null    float64
 12  kr_pen_300     401 non-null    float64
 13  kr_pen_202001  401 non-null    int64  
 14  kr_pen_202002  401 non-null    int64  
 15  kr_pen_202003  401 non-null    int64  
 16  kr_pen_202004  401 non-null    int64  
 17  kr_pen_202005  401 non-null    int64  
 18  kr_pen_202

In [6]:
# Check for missing values
commuter_data.isnull().sum()

ags2             0
bundesland       0
ags5             0
kreis            0
kr_ep            0
kr_ap            0
kr_sp            0
kr_ep_svb        0
kr_ap_svb        0
kr_sp_svb        0
kr_pen_50        0
kr_pen_150       0
kr_pen_300       0
kr_pen_202001    0
kr_pen_202002    0
kr_pen_202003    0
kr_pen_202004    0
kr_pen_202005    0
kr_pen_202006    0
kr_pen_202007    0
kr_pen_202008    0
kr_pen_202009    0
kr_pen_202010    0
kr_pen_202011    0
dtype: int64

In [7]:
# List of columns to be dropped
to_be_dropped = ['bundesland', 'kr_pen_202001', 'kr_pen_202002', 'kr_pen_202003',
       'kr_pen_202004', 'kr_pen_202005', 'kr_pen_202006', 'kr_pen_202007',
       'kr_pen_202008', 'kr_pen_202009', 'kr_pen_202010', 'kr_pen_202011']
# Drop columns 
commuter_data.drop(to_be_dropped, axis=1, inplace=True)

In [16]:
# Column Name Replacement
commuter_data.rename(columns={
    "kr_ep":"Number_of_Commuters_place_of_work", 
    "kr_ap":"Number_of_Commuters_on_place_of_residence",
    "kr_sp":"Commuter_Balance",
    "kr_ep_svb": "Proportion_of_in_commuters",
    "kr_ap_svb": "Share_of_out_commuters",
    "kr_sp_svb": "Relative Commuter Balance", 
    "kr_pen_50": "Commute_within_50km", 
    "kr_pen_150": "Commute_within_150km", 
    "kr_pen_300": "Commute_within_300km", 
}, inplace=True)

In [17]:
commuter_data.to_csv('commuter_data_modified.csv', index=False, encoding='latin_1')

## Reachibility Data

In [18]:
reachability_data.head()

Unnamed: 0,id,ags2,ags5,kreis,supermarkets_population,supermarkets_average_distance,public_transport availability,average_distance_bus_stop,average_distance_train_station,average_distance_public_transport
0,1,1,1001,"Flensburg, Stadt",92,500,35,240,2901,240
1,2,1,1002,"Kiel, Landeshauptstadt",92,460,37,268,2037,265
2,3,1,1003,"LÃ¼beck, Hansestadt",90,532,37,297,1927,294
3,4,1,1004,"NeumÃ¼nster, Stadt",85,588,37,316,1648,313
4,5,1,1051,Dithmarschen,51,1864,35,448,3517,443


In [19]:
reachability_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   id                                 401 non-null    int64 
 1   ags2                               401 non-null    int64 
 2   ags5                               401 non-null    int64 
 3   kreis                              401 non-null    object
 4   supermarkets_population            401 non-null    int64 
 5   supermarkets_average_distance      401 non-null    int64 
 6   public_transport availability      401 non-null    int64 
 7   average_distance_bus_stop          401 non-null    int64 
 8   average_distance_train_station     401 non-null    int64 
 9   average_distance_public_transport  401 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 31.5+ KB


In [20]:
# Check null values 
reachability_data.isnull().sum()

id                                   0
ags2                                 0
ags5                                 0
kreis                                0
supermarkets_population              0
supermarkets_average_distance        0
public_transport availability        0
average_distance_bus_stop            0
average_distance_train_station       0
average_distance_public_transport    0
dtype: int64

In [21]:
to_be_dropped = ["bundesland", "kr_opnv_d", "kr_opnv_b", "kr_dist_ustrab"]
reachability_data.drop(to_be_dropped, axis=1, inplace=True)

KeyError: "['bundesland' 'kr_opnv_d' 'kr_opnv_b' 'kr_dist_ustrab'] not found in axis"

In [14]:
# Rename columns 
reachability_data.rename(columns={
#     "kr_opnv_d":"public_transport_average_distance", 
#     "kr_opnv_b":"public_transport_population", 
    "kr_sm_d": "supermarkets_average_distance", 
    "kr_sm_b": "supermarkets_population", 
    "kr_opnv_idx": "public_transport availability",
    "kr_dist_bush": "average_distance_bus_stop",
    "kr_dist_bhf": "average_distance_train_station",
    "kr_dist_opnv": "average_distance_public_transport",
    # "kr_dist_ustrab": "average_distance_subway",
}, inplace=True)

In [15]:
reachability_data.to_csv('reachability_data_modified.csv', index=False, encoding='latin_1')

## Tourism Data

Weird thing is that kr_to_gue is less than kr_to_ga in every column. Guests staying overnight more than total guests staying? 

In [22]:
tourism_data.head()

Unnamed: 0,_id,ags2,bundesland,ags5,kreis,kr_to_be,kr_to_bett,kr_to_ga,kr_to_gue
0,1,1,Schleswig-Holstein,1001,"Flensburg, Stadt",21,1761,188390,322191
1,2,1,Schleswig-Holstein,1002,"Kiel, Landeshauptstadt",47,4763,385648,805038
2,3,1,Schleswig-Holstein,1003,"LÃ¼beck, Hansestadt",103,10626,754190,1825115
3,4,1,Schleswig-Holstein,1004,"NeumÃ¼nster, Stadt",12,1097,93659,187823
4,5,1,Schleswig-Holstein,1051,Dithmarschen,326,11221,350821,1685871


In [23]:
tourism_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   _id         401 non-null    int64 
 1   ags2        401 non-null    int64 
 2   bundesland  401 non-null    object
 3   ags5        401 non-null    int64 
 4   kreis       401 non-null    object
 5   kr_to_be    401 non-null    int64 
 6   kr_to_bett  401 non-null    int64 
 7   kr_to_ga    401 non-null    int64 
 8   kr_to_gue   401 non-null    int64 
dtypes: int64(7), object(2)
memory usage: 28.3+ KB


In [24]:
tourism_data.isnull().sum()

_id           0
ags2          0
bundesland    0
ags5          0
kreis         0
kr_to_be      0
kr_to_bett    0
kr_to_ga      0
kr_to_gue     0
dtype: int64

In [25]:
tourism_data.drop(['bundesland'], axis=1, inplace=True)

In [26]:
# Rename the columns
tourism_data.rename(columns={
    "kr_to_be": "number_accomodation_establishments", 
    "kr_to_bett": "number_of_beds",
    "kr_to_ga": "number_of_guests",
    "kr_to_gue": "guests_staying_overnight"
}, inplace=True)

In [27]:
tourism_data.to_csv('tourism_data_modified.csv', encoding='latin_1', index=False)