In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
# Load the accident table produced by the previous processing step.
# The file is comma-separated, which is the read_csv default delimiter.
df = pd.read_csv("../data/output/berlin_accidents_maxspeed_03.csv")

In [3]:
# Number of missing values per column.
# Original note: "osm base: na = 6" -- presumably the NA count obtained with the
# OSM base data; TODO confirm, since it may not match the current input file.
df.isnull().sum()

OBJECTID        0
UMONAT          0
USTUNDE         0
UWOCHENTAG      0
UKATEGORIE      0
UART            0
UTYP1           0
ULICHTVERH      0
STRZUSTAND      0
IstPKW          0
IstKrad         0
IstGkfz         0
IstFuss         0
IstSonstige     0
maxspeed       33
fclass         33
oneway         33
osm_id         33
District        0
Street          0
PostalCode      0
bridge         33
tunnel         33
lat             0
lon             0
geometry       33
dtype: int64

In [4]:
# Build a street key as "<District> <Street> <PostalCode>" so that streets sharing
# a name are kept apart by district and postal code.
# NOTE(review): Street appears to contain the literal placeholder "na" for some
# rows, so those keys do not identify a real street -- confirm downstream handling.
df['street_unique'] = (
    df['District']
    + " "
    + df['Street']
    + " "
    + df['PostalCode'].astype(str)
)

In [5]:
# Inspect the generated street keys.
df.loc[:, 'street_unique']

0           Niederschönhausen Dietzgenstraße 13156
1        Prenzlauer Berg Greifswalder Straße 10405
2           Friedrichshain Frankfurter Allee 10247
3                     Mitte Leipziger Straße 10117
4          Niederschöneweide Schnellerstraße 12439
                           ...                    
5187                             Neukölln na 12053
5188    Friedrichsfelde Rummelsburger Straße 10315
5189                       Charlottenburg na 10587
5190                 Weißensee Roelckestraße 13088
5191              Fennpfuhl Storkower Straße 10369
Name: street_unique, Length: 5192, dtype: object

In [6]:
# Derive the "alone" indicator from the road-user involvement flags.
def alone_crash(row):
    """Return 1 when none of the road-user flags on ``row`` is set, else 0.

    ``row`` must support item access for the keys 'IstPKW', 'IstKrad',
    'IstGkfz', 'IstFuss' and 'IstSonstige'. Any flag different from 0
    means another road user was involved, so the result is 0.
    """
    road_user_flags = ('IstPKW', 'IstKrad', 'IstGkfz', 'IstFuss', 'IstSonstige')
    for flag in road_user_flags:
        # The first non-zero flag settles the answer; later flags are not read.
        if row[flag] != 0:
            return 0
    return 1
# Evaluate alone_crash once per accident row and store the result as a column.
df['alone'] = df.apply(alone_crash, axis='columns')

In [7]:
# Derive the "not_alone" indicator from the road-user involvement flags.
def not_alone_crash(row):
    """Return 1 when at least one road-user flag on ``row`` is set, else 0.

    ``row`` must support item access for the keys 'IstPKW', 'IstKrad',
    'IstGkfz', 'IstFuss' and 'IstSonstige'. A flag counts as set when it
    is different from 0.
    """
    road_user_flags = ['IstPKW', 'IstKrad', 'IstGkfz', 'IstFuss', 'IstSonstige']
    # any() stops at the first set flag, checking the columns in the listed order.
    involved = any(row[flag] != 0 for flag in road_user_flags)
    return 1 if involved else 0
# Evaluate not_alone_crash once per accident row and store the result as a column.
df['not_alone'] = df.apply(not_alone_crash, axis='columns')

In [8]:
# Data inconsistency: some accidents are flagged as alone (alone == 1) although
# their type is UTYP1 == 6 (accident with another road user).
# These rows are treated as accidents involving a car.
solo_type6 = (df['alone'] == 1) & (df['UTYP1'] == 6)
df.loc[solo_type6, 'IstPKW'] = 1
df.loc[solo_type6, 'alone'] = 0
# not_alone was derived before this correction; re-derive it as the complement of
# alone so the corrected rows do not end up with alone == 0 and not_alone == 0.
df['not_alone'] = 1 - df['alone']

In [9]:
# Distribution of the alone flag.
df.alone.value_counts()

0    4706
1     486
Name: alone, dtype: int64

In [10]:
# Distribution of the IstPKW flag.
df.IstPKW.value_counts()

1    3726
0    1466
Name: IstPKW, dtype: int64

In [11]:
# Number of accidents per accident type code (UTYP1).
df.UTYP1.value_counts()

2    1598
3    1395
5     649
6     603
7     392
1     311
4     244
Name: UTYP1, dtype: int64

In [12]:
# UTYP1 codes 2 and 3 are very similar: per the original analysis note, both
# represent accidents at crossings. UTYP2 therefore merges them, with code 2
# also covering the rows that had code 3; every other code is carried over.
df['UTYP2'] = df['UTYP1'].mask(df['UTYP1'] == 3, 2)

In [13]:
# Number of accidents per merged accident type code (UTYP2).
df.UTYP2.value_counts()

2    2993
5     649
6     603
7     392
1     311
4     244
Name: UTYP2, dtype: int64

In [14]:
# Derive a weekend indicator from the day-of-week code.
def weekend_weekday(row):
    """Return 1 when ``row['UWOCHENTAG']`` equals 1 or 7, otherwise 0.

    NOTE(review): this presumes codes 1 and 7 are the two weekend days in the
    source data's day-of-week encoding -- confirm against the dataset docs.
    """
    weekend_codes = (1, 7)
    return 1 if row['UWOCHENTAG'] in weekend_codes else 0
# Evaluate weekend_weekday once per accident row and store the result as a column.
df['weekend'] = df.apply(weekend_weekday, axis='columns')

In [15]:
# Persist the enriched table for the next processing step; the row index is not written.
df.to_csv(
    path_or_buf="../data/output/berlin_accidents_maxspeed_04.csv",
    index=False,
)