In [1]:
import pandas as pd
import geopandas as gpd

In [33]:
def conduct_reverse_geocoding(df: pd.DataFrame, gdf_shape):
    """
    enriches the dataframe with the federal states, counties and municipalities the long-lat data points are located in
    :param df: the germany dataframe with the long-lat data
    :param gdf_shape: shapefile with the federal state, county and municipality borders for Germany
    :return: df: original dataframe with additional country column
    """
    # convert df to GeoDataFrame
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.long, df.lat))

    # Merge DataFrames
    pointInPolys = gpd.sjoin(gdf, gdf_shape, how='left')

    # Drop columns we don't need
    # pointInPolys = pointInPolys.drop(
    #     columns=['french_shor', 'status', 'index_right', 'color_code', 'status'])
    return pointInPolys

In [41]:
# Read shapefiles as geodataframe
sf_federal_states = gpd.GeoDataFrame.from_file("./data/shapefiles/germany/federal_states/B-2020-AI001-2-5--AI0106--2023-01-03.shp")

sf_counties = gpd.GeoDataFrame.from_file("./data/shapefiles/germany/counties/G-2020-AI001-2-5--AI0109--2023-01-03.shp")

sf_municipalities = gpd.GeoDataFrame.from_file("./data/shapefiles/germany/municipalities/G-2020-AI001-2-5--AI0106--2023-01-03.shp")

# load germany data as df
df_germany = pd.read_csv("./data/final_data/germany_final.csv", sep=";")


In [42]:
# convert CRS of shapefiles to fit CRS of CRS in Germany file (EPSG)
sf_federal_states = sf_federal_states.to_crs(4326)
sf_counties = sf_counties.to_crs(4326)
sf_municipalities = sf_municipalities.to_crs(4326)

In [43]:
# Perform geocoding
# shapefiles = [sf_federal_states, sf_counties, sf_municipalities]

# for shapefile in shapefiles:
#     df_germany = conduct_reverse_geocoding(df_germany, shapefile)
#     df_germany.drop(columns=["Unnamed","index_right"], inplace=True)

df_germany = conduct_reverse_geocoding(df_germany, sf_federal_states)


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  pointInPolys = gpd.sjoin(gdf, gdf_shape, how='left')


In [44]:
# Skip columns that are not needed
df_germany.drop(columns=["id", "schluessel", "jahr", "ai0106", "Shape_Leng", "Shape_Area", "long", "lat", "geometry", "index_right"], inplace=True)

df_germany.rename(columns={"gen":"federal_state"}, inplace=True)

In [45]:
df_germany.drop(columns=["Unnamed: 0"], inplace=True)

In [46]:
df_germany.head()

Unnamed: 0,avg_lat_ms,tests,devices,quarter,category,country,avg_d_mbps,avg_u_mbps,federal_state
0,43,4,2,2019-01-01,fixed,Germany,8,1,Schleswig-Holstein
1,34,1,1,2019-01-01,fixed,Germany,33,9,Schleswig-Holstein
2,29,1,1,2019-01-01,fixed,Germany,24,11,Schleswig-Holstein
3,40,2,1,2019-01-01,fixed,Germany,35,7,Schleswig-Holstein
4,33,4,3,2019-01-01,fixed,Germany,7,2,Schleswig-Holstein


In [47]:
df_germany.shape

(5347564, 9)

In [48]:
# save as csv
df_germany.to_csv("./data/final_data/germany_federal_states.csv", sep=";")