In [2]:
import pandas as pd
import numpy as np


# STEP 1: LOAD AND INITIAL INSPECTION

In [3]:
df = pd.read_csv("data/berlin_crime_final.csv")

In [24]:
df.shape

(21879, 10)

In [25]:
df.dtypes

LOR_ID           float64
Bezirk            object
Bezirksregion     object
CrimeType         object
Fallzahl         float64
year               int64
HZ               float64
geometry          object
lon              float64
lat              float64
dtype: object

In [26]:
df.head()

Unnamed: 0,LOR_ID,Bezirk,Bezirksregion,CrimeType,Fallzahl,year,HZ,geometry,lon,lat
0,11001.0,Mitte,Tiergarten Süd,Straftaten insgesamt,5475.0,2015,38575.0,MULTIPOLYGON (((13.373297493618068 52.50373208...,13.358181,52.510452
1,11002.0,Mitte,Regierungsviertel,Straftaten insgesamt,10313.0,2015,102068.0,MULTIPOLYGON (((13.389973675173831 52.50757271...,13.390625,52.514159
2,11003.0,Mitte,Alexanderplatz,Straftaten insgesamt,23524.0,2015,45938.0,MULTIPOLYGON (((13.423027234898033 52.51212031...,13.402246,52.52085
3,11004.0,Mitte,Brunnenstraße Süd,Straftaten insgesamt,4583.0,2015,16175.0,MULTIPOLYGON (((13.387830945640253 52.53339054...,13.393361,52.533828
4,12005.0,Mitte,Moabit West,Straftaten insgesamt,7428.0,2015,16612.0,MULTIPOLYGON (((13.343436370833128 52.52601880...,13.332072,52.530752


# STEP 2: DATA QUALITY CHECK

## Missing Values

In [9]:
df.isna().sum().sort_values(ascending=False)


LOR_ID           0
Bezirk           0
Bezirksregion    0
CrimeType        0
Fallzahl         0
year             0
HZ               0
geometry         0
lon              0
lat              0
dtype: int64

## Duplicates

In [10]:
df.duplicated().sum()


np.int64(0)

In [11]:
df["year"].nunique()


9

In [12]:
df["Bezirksregion"].nunique()


142

# STEP 3: UNIVARIATE ANALYSIS - CATEGORICAL

In [13]:
print("\n--- Unique Values per Column ---")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

print("\n--- Bezirk (District) Distribution ---")
print(df['Bezirk'].value_counts())

print("\n--- Crime Type Distribution ---")
print(df['CrimeType'].value_counts())

print("\n--- Year Distribution ---")
print(df['year'].value_counts().sort_index())

print("\n--- Top 10 Bezirksregion by Record Count ---")
print(df['Bezirksregion'].value_counts().head(10))


--- Unique Values per Column ---
LOR_ID: 143 unique values
Bezirk: 12 unique values
Bezirksregion: 142 unique values
CrimeType: 17 unique values
Fallzahl: 2548 unique values
year: 9 unique values
HZ: 8849 unique values
geometry: 143 unique values
lon: 143 unique values
lat: 143 unique values

--- Bezirk (District) Distribution ---
Bezirk
Treptow-Köpenick              3060
Pankow                        2448
Tempelhof-Schöneberg          2295
Lichtenberg                   1989
Neukölln                      1836
Reinickendorf                 1836
Charlottenburg-Wilmersdorf    1683
Mitte                         1530
Spandau                       1377
Marzahn-Hellersdorf           1377
Friedrichshain-Kreuzberg      1224
Steglitz-Zehlendorf           1224
Name: count, dtype: int64

--- Crime Type Distribution ---
CrimeType
Straftaten insgesamt                                      1287
Fahrrad diebstahl                                         1287
Rauschgift delikte                          

# STEP 4: UNIVARIATE ANALYSIS - NUMERICAL

In [15]:
print("\n--- Fallzahl (Case Count) Statistics ---")
print(f"Mean: {df['Fallzahl'].mean():.2f}")
print(f"Median: {df['Fallzahl'].median():.2f}")
print(f"Std Dev: {df['Fallzahl'].std():.2f}")
print(f"Min: {df['Fallzahl'].min()}")
print(f"Max: {df['Fallzahl'].max()}")

print("\n--- HZ (Crime Rate) Statistics ---")
if 'HZ' in df.columns:
    print(f"Mean: {df['HZ'].mean():.2f}")
    print(f"Median: {df['HZ'].median():.2f}")
    print(f"Std Dev: {df['HZ'].std():.2f}")
    print(f"Min: {df['HZ'].min()}")
    print(f"Max: {df['HZ'].max()}")


--- Fallzahl (Case Count) Statistics ---
Mean: 408.59
Median: 75.00
Std Dev: 1104.71
Min: 0.0
Max: 23524.0

--- HZ (Crime Rate) Statistics ---
Mean: 1506.51
Median: 314.00
Std Dev: 3634.47
Min: 0.0
Max: 106538.0


# STEP 5: BIVARIATE ANALYSIS

In [19]:
df_total = df[df["CrimeType"] == "Straftaten insgesamt"].copy()

print("\n--- Total Crimes by District (Straftaten insgesamt) ---")

district_crimes = (
    df_total
    .groupby("Bezirk")["Fallzahl"]
    .agg(
        total_crimes="sum",
        avg_annual_crimes="mean",
        years_observed="count"
    )
    .sort_values("total_crimes", ascending=False)
)

print(district_crimes)


--- Total Crimes by District (Straftaten insgesamt) ---
                            total_crimes  avg_annual_crimes  years_observed
Bezirk                                                                     
Mitte                           751656.0        8351.733333              90
Friedrichshain-Kreuzberg        506251.0        7031.263889              72
Charlottenburg-Wilmersdorf      455784.0        4603.878788              99
Neukölln                        384609.0        3561.194444             108
Tempelhof-Schöneberg            352452.0        2610.755556             135
Pankow                          344132.0        2389.805556             144
Lichtenberg                     251223.0        2147.205128             117
Reinickendorf                   243980.0        2259.074074             108
Spandau                         225430.0        2783.086420              81
Treptow-Köpenick                219293.0        1218.294444             180
Marzahn-Hellersdorf            

In [20]:
print("\n--- Total Crimes by Year (Straftaten insgesamt) ---")

year_crimes = (
    df_total
    .groupby("year")["Fallzahl"]
    .agg(
        total_crimes="sum",
        avg_district_crimes="mean",
        districts_count="count"
    )
)

print(year_crimes)



--- Total Crimes by Year (Straftaten insgesamt) ---
      total_crimes  avg_district_crimes  districts_count
year                                                    
2015      506633.0          3542.888112              143
2016      511850.0          3579.370629              143
2018      450588.0          3150.965035              143
2019      446775.0          3124.300699              143
2020      440566.0          3080.881119              143
2021      420853.0          2943.027972              143
2022      452689.0          3165.657343              143
2023      467657.0          3270.328671              143
2024      461407.0          3226.622378              143


In [21]:
print("\n--- Average Crime Rate (HZ) by District ---")

if "HZ" in df_total.columns:
    avg_hz_district = (
        df_total
        .groupby("Bezirk")["HZ"]
        .mean()
        .sort_values(ascending=False)
    )
    print(avg_hz_district)



--- Average Crime Rate (HZ) by District ---
Bezirk
Mitte                         25184.566654
Friedrichshain-Kreuzberg      20416.399580
Charlottenburg-Wilmersdorf    14240.875305
Neukölln                      12592.258701
Tempelhof-Schöneberg          11097.616288
Reinickendorf                 10761.378884
Lichtenberg                    9994.411508
Spandau                        9908.041914
Pankow                         9271.833695
Treptow-Köpenick               8802.263894
Steglitz-Zehlendorf            8296.765635
Marzahn-Hellersdorf            8277.117376
Name: HZ, dtype: float64


All aggregate analyses are based exclusively on the variable Straftaten insgesamt, which represents total recorded crime per district and year.
This avoids double-counting caused by summing hierarchical offense categories and ensures that reported totals are interpretable.

## Separate aggregated columns from others

In [7]:
aggregate_categories = [
    'Straftaten insgesamt',
    'Diebstahl insgesamt',
    'Körper verletzungen insgesamt',
    'Branddelikte insgesamt',
    'Sach beschädigung insgesamt'
]
# For detailed analysis
df_leaf_crimes = df[~df['CrimeType'].isin(aggregate_categories)]

# For high-level dashboards
df_aggregates = df[df['CrimeType'].isin(aggregate_categories)]

In [8]:
df_leaf_crimes.CrimeType.value_counts()

CrimeType
Raub                                                      1287
Straßenraub, Handtaschen raub                             1287
Gefährl. und schwere Körper verletzung                    1287
Freiheits beraubung, Nötigung, Bedrohung, Nachstellung    1287
Diebstahl von Kraftwagen                                  1287
Diebstahl an/aus Kfz                                      1287
Fahrrad diebstahl                                         1287
Wohnraum einbruch                                         1287
Brand stiftung                                            1287
Sach beschädigung durch Graffiti                          1287
Rauschgift delikte                                        1287
Kieztaten                                                 1287
Name: count, dtype: int64

# STEP 7: OUTLIER DETECTION (IQR Method)

In [None]:
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

print("\n--- Outliers in Fallzahl ---")
outliers_fallzahl, lb, ub = detect_outliers_iqr(df, 'Fallzahl')
print(f"Number of outliers: {len(outliers_fallzahl)}")
print(f"Lower bound: {lb:.2f}, Upper bound: {ub:.2f}")
if len(outliers_fallzahl) > 0:
    print("\nTop 10 outliers:")
    print(outliers_fallzahl.nlargest(10, 'Fallzahl')[['Bezirk', 'Bezirksregion', 'CrimeType', 'Fallzahl']])



--- Outliers in Fallzahl ---
Number of outliers: 3004
Lower bound: -364.00, Upper bound: 668.00

Top 5 outliers:
      Bezirk   Bezirksregion             CrimeType  Fallzahl
2      Mitte  Alexanderplatz  Straftaten insgesamt   23524.0
2433   Mitte  Alexanderplatz  Straftaten insgesamt   22803.0
7295   Mitte  Alexanderplatz  Straftaten insgesamt   21462.0
19450  Mitte  Alexanderplatz  Straftaten insgesamt   20197.0
17019  Mitte  Alexanderplatz  Straftaten insgesamt   19673.0
9726   Mitte  Alexanderplatz  Straftaten insgesamt   19287.0
2436   Mitte      Moabit Ost  Straftaten insgesamt   19267.0
4864   Mitte  Alexanderplatz  Straftaten insgesamt   19246.0
14588  Mitte  Alexanderplatz  Straftaten insgesamt   18999.0
12157  Mitte  Alexanderplatz  Straftaten insgesamt   15765.0
