# Crime correlation

#### Imports & df reading

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import plotly.express as px

In [None]:
# Load the crime records; later cells read its 'Month', 'Longitude',
# 'Latitude' and 'Crime type' columns.
crimes = pd.read_csv('data/all_crimes_2022-2025.csv')

### LSOA

#### Crime correlation matrix for full database

In [None]:
# Parse 'Month' so the .dt accessors used in later cells work
crimes['Month'] = pd.to_datetime(crimes['Month'])

# One point per crime record, built from its longitude/latitude (EPSG:4326)
crime_points = gpd.points_from_xy(crimes['Longitude'], crimes['Latitude'])
gdf_crimes = gpd.GeoDataFrame(crimes, geometry=crime_points, crs="EPSG:4326")

# LSOA polygons, reprojected to the same CRS as the crime points
LSOAs = gpd.read_file('data/LSOAs.geojson')
LSOAs = LSOAs.to_crs(epsg=4326)

# Left join keeps every crime; records inside no LSOA get a missing 'LSOA11NM'
lsoa_shapes = LSOAs[['geometry', 'LSOA11NM']]
gdf_joined_lsoa = gpd.sjoin(gdf_crimes, lsoa_shapes, how='left', predicate='within')

# Count crimes per LSOA and crime type (unmatched records are dropped first)
crimes_in_lsoa = gdf_joined_lsoa.dropna(subset=['LSOA11NM'])
lsoa_counts = crimes_in_lsoa.groupby(['LSOA11NM', 'Crime type']).size().reset_index(name='count')

# Create grid: one row per LSOA, one column per crime type, 0 for absent pairs
lsoa_crime_grid = lsoa_counts.pivot(index='LSOA11NM', columns='Crime type', values='count')
lsoa_crime_grid = lsoa_crime_grid.fillna(0).astype(int)

lsoa_crime_grid.head()

In [None]:
# Pearson correlation between the crime-type columns of the LSOA grid
lsoa_corr = lsoa_crime_grid.corr(method='pearson')

# Annotated heatmap drawn on an explicitly created 12x8 axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(lsoa_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation of Crime Types (LSOA)')
plt.show()

#### Crime correlation matrix yearly & monthly

In [None]:
# Add calendar-year and 'YYYY-MM' label columns derived from 'Month'
# ('Month' must already be datetime; it is parsed with pd.to_datetime earlier)
month_stamps = crimes['Month']
crimes['Year'] = month_stamps.dt.year
crimes['YearMonth'] = month_stamps.dt.to_period('M').astype(str)

crimes.head()

In [None]:
# Function for yearly / monthly correlation matrix
def plot_lsoa_crime_corr(df, title_suffix=''):
    """Draw a Pearson correlation heatmap of per-LSOA crime-type counts.

    Each row of ``df`` becomes a point built from its 'Longitude' and
    'Latitude' columns, is matched to the polygon of the notebook-level
    ``LSOAs`` frame that contains it, and is counted per 'LSOA11NM' and
    'Crime type'. Rows matching no polygon are discarded.

    Args:
        df: Frame with 'Longitude', 'Latitude' and 'Crime type' columns.
        title_suffix: Text appended to the plot title.
    """
    points = gpd.points_from_xy(df['Longitude'], df['Latitude'])
    located = gpd.GeoDataFrame(df, geometry=points, crs="EPSG:4326")

    lsoa_shapes = LSOAs[['geometry', 'LSOA11NM']]
    matched = gpd.sjoin(located, lsoa_shapes, how='left', predicate='within')
    matched = matched.dropna(subset=['LSOA11NM'])

    # Long table of counts, then one row per LSOA and one column per crime type
    tally = matched.groupby(['LSOA11NM', 'Crime type']).size().reset_index(name='count')
    wide = tally.pivot(index='LSOA11NM', columns='Crime type', values='count')
    wide = wide.fillna(0).astype(int)

    type_corr = wide.corr(method='pearson')

    plt.figure(figsize=(12, 8))
    sns.heatmap(type_corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation of Crime Types (LSOA) {title_suffix}')
    plt.show()

In [None]:
# Example: one full year, then one single month
in_2023 = crimes['Year'] == 2023
plot_lsoa_crime_corr(crimes[in_2023], title_suffix='(Year 2023)')

in_jan_2024 = crimes['YearMonth'] == '2024-01'
plot_lsoa_crime_corr(crimes[in_jan_2024], title_suffix='(January 2024)')

#### Animated crime correlation matrix monthly

In [None]:
# Grouping keys taken from the joined records' 'Month' column
lsoa_month_key = gdf_joined_lsoa['Month'].dt.to_period('M')
lsoa_year_key = gdf_joined_lsoa['Month'].dt.year

# Monthly correlation matrices, keyed by the period's string form
monthly_corrs_lsoa = {
    str(period): (
        chunk.groupby(['LSOA11NM', 'Crime type'])
        .size()
        .unstack(fill_value=0)
        .astype(int)
        .corr(method='pearson')
    )
    for period, chunk in gdf_joined_lsoa.groupby(lsoa_month_key)
}

# Yearly correlation matrices, keyed by the year's string form
yearly_corrs_lsoa = {
    str(year): (
        chunk.groupby(['LSOA11NM', 'Crime type'])
        .size()
        .unstack(fill_value=0)
        .astype(int)
        .corr(method='pearson')
    )
    for year, chunk in gdf_joined_lsoa.groupby(lsoa_year_key)
}


In [None]:
# Reshape every monthly matrix to long form (one row per crime-type pair),
# tagged with its month, so the month can drive the animation slider.
frames = []

for period, corr in monthly_corrs_lsoa.items():
    # rename_axis returns a renamed copy, so the stored matrix is not modified
    melted = (
        corr.rename_axis(index='Crime A')
        .reset_index()
        .melt(id_vars='Crime A', var_name='Crime B', value_name='Correlation')
    )
    melted['Month'] = period
    frames.append(melted)

animated_df = pd.concat(frames)

fig = px.density_heatmap(
    animated_df,
    x='Crime A',
    y='Crime B',
    z='Correlation',
    animation_frame='Month',
    color_continuous_scale='RdBu_r',
    # Pearson r lies in [-1, 1]: a fixed range keeps 0 at the midpoint of the
    # diverging scale and makes colours comparable from frame to frame.
    range_color=[-1, 1],
)

fig.update_layout(
    title='Monthly Crime Type Correlation (LSOA)',
    height=600,
    yaxis=dict(categoryorder='category descending')
)

fig.show()

#### Line plots of correlations for each crime type

In [None]:
# Plotting correlation for all crime type combinations
# Collect every crime type that appears in at least one monthly matrix
all_crimes = set()
for month_corr in monthly_corrs_lsoa.values():
    all_crimes.update(month_corr.columns)

for base_crime in sorted(all_crimes):
    # One record per (month, other crime type); months whose matrix lacks
    # base_crime contribute nothing.
    correlation_over_time = [
        {
            'Month': month,
            'Crime Type': crime,
            'Correlation': month_corr.at[base_crime, crime],
        }
        for month, month_corr in monthly_corrs_lsoa.items()
        if base_crime in month_corr.columns
        for crime in month_corr.columns
        if crime != base_crime
    ]

    # With no records the frame would have no 'Month' column to parse
    if not correlation_over_time:
        continue

    # Make df for the crime
    corr_df = pd.DataFrame(correlation_over_time)
    corr_df['Month'] = pd.to_datetime(corr_df['Month'])
    corr_df = corr_df.sort_values('Month')

    fig = px.line(
        corr_df,
        x='Month',
        y='Correlation',
        color='Crime Type',
        markers=True,
        title=f'{base_crime} Correlation with Other Crime Types Over Time (LSOA)',
        # Pearson r can be negative; [0, 1] would cut those points off
        range_y=[-1, 1]
    )

    fig.update_layout(height=600)
    fig.show()

In [None]:
# # Define the crime type you want to visualize

# crime_types_dict = {
#     0: 'Anti-social behaviour',
#     1: 'Bicycle theft',
#     2: 'Burglary',
#     3: 'Criminal damage and arson',
#     4: 'Drugs',
#     5: 'Other crime',
#     6: 'Other theft',
#     7: 'Possession of weapons',
#     8: 'Public order',
#     9: 'Robbery',
#     10: 'Shoplifting',
#     11: 'Theft from the person',
#     12: 'Vehicle crime',
#     13: 'Violence and sexual offences'
# }

# # Change only the index below
# crime_type_to_display = crime_types_dict[2]

In [None]:
# gdf_crime = crimes[crimes['Crime type'] == crime_type_to_display]

# gdf_crime = gdf_crime.dropna(subset=['Latitude', 'Longitude'])

# gdf_crime = gpd.GeoDataFrame(
#     gdf_crime,
#     geometry=gpd.points_from_xy(gdf_crime['Longitude'], gdf_crime['Latitude']),
#     crs="EPSG:4326"
# )

# LSOAs = gpd.read_file('data/LSOAs.geojson').to_crs(epsg=4326)

# gdf_joined = gpd.sjoin(
#     gdf_crime,
#     LSOAs[['geometry', 'LSOA11NM']],
#     how='left',
#     predicate='within'
# )

# crime_counts = (
#     gdf_joined
#     .dropna(subset=['LSOA11NM'])
#     .groupby('LSOA11NM')
#     .size()
#     .reset_index(name='Count')
# )

# fig = px.choropleth_map(
#     crime_counts,
#     geojson=json.loads(LSOAs.to_json()),
#     locations='LSOA11NM',
#     featureidkey="properties.LSOA11NM",
#     color='Count',
#     color_continuous_scale="OrRd",
#     title=f'{crime_type_to_display} Heatmap by London LSOA',
#     hover_name="LSOA11NM",
#     map_style="open-street-map",
#     zoom=9,
#     center={"lat": 51.5072, "lon": -0.1276},
#     opacity=0.6,
#     height=600
# )

# fig.show()


### Ward

#### Crime correlation matrix for full database

In [None]:
# Make sure 'Month' is datetime before it is used below
crimes['Month'] = pd.to_datetime(crimes['Month'])

# One point per crime record, built from its longitude/latitude (EPSG:4326)
crime_points = gpd.points_from_xy(crimes['Longitude'], crimes['Latitude'])
gdf_crimes = gpd.GeoDataFrame(crimes, geometry=crime_points, crs="EPSG:4326")

# Ward polygons, reprojected to the same CRS as the crime points
wards = gpd.read_file('data/wards.geojson')
wards = wards.to_crs(epsg=4326)

# Left join keeps every crime; records inside no ward get a missing 'Name'
# NOTE(review): wards are identified by 'Name' alone; if two wards share a
# name their counts are merged below. Confirm names are unique.
ward_shapes = wards[['geometry', 'Name']]
gdf_joined_ward = gpd.sjoin(gdf_crimes, ward_shapes, how='left', predicate='within')

# Count crimes per ward and crime type (unmatched records are dropped first)
crimes_in_ward = gdf_joined_ward.dropna(subset=['Name'])
ward_counts = crimes_in_ward.groupby(['Name', 'Crime type']).size().reset_index(name='count')

# Create grid: one row per ward, one column per crime type, 0 for absent pairs
ward_crime_grid = ward_counts.pivot(index='Name', columns='Crime type', values='count')
ward_crime_grid = ward_crime_grid.fillna(0).astype(int)

ward_crime_grid.head()

In [None]:
# Pearson correlation between the crime-type columns of the ward grid
ward_corr = ward_crime_grid.corr(method='pearson')

# Annotated heatmap drawn on an explicitly created 12x8 axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(ward_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation of Crime Types (Ward)')
plt.show()

#### Crime correlation matrix yearly & monthly

In [None]:
# Add calendar-year and 'YYYY-MM' label columns derived from 'Month'
# ('Month' must already be datetime; it is parsed with pd.to_datetime earlier)
month_stamps = crimes['Month']
crimes['Year'] = month_stamps.dt.year
crimes['YearMonth'] = month_stamps.dt.to_period('M').astype(str)

crimes.head()

In [None]:
# Function for yearly / monthly correlation matrix
def plot_ward_crime_corr(df, title_suffix=''):
    """Draw a Pearson correlation heatmap of per-ward crime-type counts.

    Each row of ``df`` becomes a point built from its 'Longitude' and
    'Latitude' columns, is matched to the polygon of the notebook-level
    ``wards`` frame that contains it, and is counted per ward 'Name' and
    'Crime type'. Rows matching no polygon are discarded.

    Args:
        df: Frame with 'Longitude', 'Latitude' and 'Crime type' columns.
        title_suffix: Text appended to the plot title.
    """
    points = gpd.points_from_xy(df['Longitude'], df['Latitude'])
    located = gpd.GeoDataFrame(df, geometry=points, crs="EPSG:4326")

    ward_shapes = wards[['geometry', 'Name']]
    matched = gpd.sjoin(located, ward_shapes, how='left', predicate='within')
    matched = matched.dropna(subset=['Name'])

    # Long table of counts, then one row per ward and one column per crime type
    tally = matched.groupby(['Name', 'Crime type']).size().reset_index(name='count')
    wide = tally.pivot(index='Name', columns='Crime type', values='count')
    wide = wide.fillna(0).astype(int)

    type_corr = wide.corr(method='pearson')

    plt.figure(figsize=(12, 8))
    sns.heatmap(type_corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation of Crime Types (Ward) {title_suffix}')
    plt.show()

In [None]:
# Example: one full year, then one single month
in_2023 = crimes['Year'] == 2023
plot_ward_crime_corr(crimes[in_2023], title_suffix='(Year 2023)')

in_jan_2024 = crimes['YearMonth'] == '2024-01'
plot_ward_crime_corr(crimes[in_jan_2024], title_suffix='(January 2024)')

#### Animated crime correlation matrix monthly

In [None]:
# Grouping keys taken from the joined records' 'Month' column
ward_month_key = gdf_joined_ward['Month'].dt.to_period('M')
ward_year_key = gdf_joined_ward['Month'].dt.year

# Monthly correlation matrices, keyed by the period's string form
monthly_corrs_ward = {
    str(period): (
        chunk.groupby(['Name', 'Crime type'])
        .size()
        .unstack(fill_value=0)
        .astype(int)
        .corr(method='pearson')
    )
    for period, chunk in gdf_joined_ward.groupby(ward_month_key)
}

# Yearly correlation matrices, keyed by the year's string form
yearly_corrs_ward = {
    str(year): (
        chunk.groupby(['Name', 'Crime type'])
        .size()
        .unstack(fill_value=0)
        .astype(int)
        .corr(method='pearson')
    )
    for year, chunk in gdf_joined_ward.groupby(ward_year_key)
}


In [None]:
# Reshape every monthly matrix to long form (one row per crime-type pair),
# tagged with its month, so the month can drive the animation slider.
frames = []

for period, corr in monthly_corrs_ward.items():
    # rename_axis returns a renamed copy, so the stored matrix is not modified
    melted = (
        corr.rename_axis(index='Crime A')
        .reset_index()
        .melt(id_vars='Crime A', var_name='Crime B', value_name='Correlation')
    )
    melted['Month'] = period
    frames.append(melted)

animated_df = pd.concat(frames)

fig = px.density_heatmap(
    animated_df,
    x='Crime A',
    y='Crime B',
    z='Correlation',
    animation_frame='Month',
    color_continuous_scale='RdBu_r',
    # Pearson r lies in [-1, 1]: a fixed range keeps 0 at the midpoint of the
    # diverging scale and makes colours comparable from frame to frame.
    range_color=[-1, 1],
)

fig.update_layout(
    title='Monthly Crime Type Correlation (Ward)',
    height=600,
    yaxis=dict(categoryorder='category descending')
)

fig.show()

#### Line plots of correlations for each crime type

In [None]:
# Plotting correlation for all crime type combinations
# Collect every crime type that appears in at least one monthly matrix
all_crimes = set()
for month_corr in monthly_corrs_ward.values():
    all_crimes.update(month_corr.columns)

for base_crime in sorted(all_crimes):
    # One record per (month, other crime type); months whose matrix lacks
    # base_crime contribute nothing.
    correlation_over_time = [
        {
            'Month': month,
            'Crime Type': crime,
            'Correlation': month_corr.at[base_crime, crime],
        }
        for month, month_corr in monthly_corrs_ward.items()
        if base_crime in month_corr.columns
        for crime in month_corr.columns
        if crime != base_crime
    ]

    # With no records the frame would have no 'Month' column to parse
    if not correlation_over_time:
        continue

    # Make df for the crime
    corr_df = pd.DataFrame(correlation_over_time)
    corr_df['Month'] = pd.to_datetime(corr_df['Month'])
    corr_df = corr_df.sort_values('Month')

    fig = px.line(
        corr_df,
        x='Month',
        y='Correlation',
        color='Crime Type',
        markers=True,
        title=f'{base_crime} Correlation with Other Crime Types Over Time (Ward)',
        # Pearson r can be negative; [0, 1] would cut those points off
        range_y=[-1, 1]
    )

    fig.update_layout(height=600)
    fig.show()

### Borough

#### Crime correlation matrix for full database

In [None]:
# Make sure 'Month' is datetime before it is used below
crimes['Month'] = pd.to_datetime(crimes['Month'])

# One point per crime record, built from its longitude/latitude (EPSG:4326)
crime_points = gpd.points_from_xy(crimes['Longitude'], crimes['Latitude'])
gdf_crimes = gpd.GeoDataFrame(crimes, geometry=crime_points, crs="EPSG:4326")

# Borough polygons, reprojected to the same CRS as the crime points
boroughs = gpd.read_file('data/boroughs.geojson')
boroughs = boroughs.to_crs(epsg=4326)

# Left join keeps every crime; records inside no borough get a missing 'Name'
borough_shapes = boroughs[['geometry', 'Name']]
gdf_joined_borough = gpd.sjoin(gdf_crimes, borough_shapes, how='left', predicate='within')

# Count crimes per borough and crime type (unmatched records are dropped first)
crimes_in_borough = gdf_joined_borough.dropna(subset=['Name'])
borough_counts = crimes_in_borough.groupby(['Name', 'Crime type']).size().reset_index(name='count')

# Create grid: one row per borough, one column per crime type, 0 for absent pairs
borough_crime_grid = borough_counts.pivot(index='Name', columns='Crime type', values='count')
borough_crime_grid = borough_crime_grid.fillna(0).astype(int)

borough_crime_grid.head()

In [None]:
# Pearson correlation between the crime-type columns of the borough grid
borough_corr = borough_crime_grid.corr(method='pearson')

# Annotated heatmap drawn on an explicitly created 12x8 axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(borough_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation of Crime Types (Borough)')
plt.show()

#### Crime correlation matrix yearly & monthly

In [None]:
# Add calendar-year and 'YYYY-MM' label columns derived from 'Month'
# ('Month' must already be datetime; it is parsed with pd.to_datetime earlier)
month_stamps = crimes['Month']
crimes['Year'] = month_stamps.dt.year
crimes['YearMonth'] = month_stamps.dt.to_period('M').astype(str)

crimes.head()

In [None]:
# Function for yearly / monthly correlation matrix
def plot_borough_crime_corr(df, title_suffix=''):
    """Draw a Pearson correlation heatmap of per-borough crime-type counts.

    Each row of ``df`` becomes a point built from its 'Longitude' and
    'Latitude' columns, is matched to the polygon of the notebook-level
    ``boroughs`` frame that contains it, and is counted per borough 'Name'
    and 'Crime type'. Rows matching no polygon are discarded.

    Args:
        df: Frame with 'Longitude', 'Latitude' and 'Crime type' columns.
        title_suffix: Text appended to the plot title.
    """
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
        crs="EPSG:4326"
    )

    # Join against the borough polygons. This previously used `wards`, which
    # produced a ward-level matrix under a "Borough" title.
    gdf = gpd.sjoin(
        gdf, boroughs[['geometry', 'Name']], how='left', predicate='within'
    ).dropna(subset=['Name'])

    counts = (
        gdf
        .groupby(['Name', 'Crime type'])
        .size()
        .reset_index(name='count')
    )

    # One row per borough, one column per crime type, 0 for absent pairs
    grid = counts.pivot(index='Name', columns='Crime type', values='count').fillna(0).astype(int)

    corr = grid.corr(method='pearson')

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation of Crime Types (Borough) {title_suffix}')
    plt.show()

In [None]:
# Example: one full year, then one single month
in_2023 = crimes['Year'] == 2023
plot_borough_crime_corr(crimes[in_2023], title_suffix='(Year 2023)')

in_jan_2024 = crimes['YearMonth'] == '2024-01'
plot_borough_crime_corr(crimes[in_jan_2024], title_suffix='(January 2024)')

#### Animated crime correlation matrix monthly

In [None]:
# Grouping keys taken from the joined records' 'Month' column
borough_month_key = gdf_joined_borough['Month'].dt.to_period('M')
borough_year_key = gdf_joined_borough['Month'].dt.year

# Monthly correlation matrices, keyed by the period's string form
monthly_corrs_borough = {
    str(period): (
        chunk.groupby(['Name', 'Crime type'])
        .size()
        .unstack(fill_value=0)
        .astype(int)
        .corr(method='pearson')
    )
    for period, chunk in gdf_joined_borough.groupby(borough_month_key)
}

# Yearly correlation matrices, keyed by the year's string form
yearly_corrs_borough = {
    str(year): (
        chunk.groupby(['Name', 'Crime type'])
        .size()
        .unstack(fill_value=0)
        .astype(int)
        .corr(method='pearson')
    )
    for year, chunk in gdf_joined_borough.groupby(borough_year_key)
}


In [None]:
# Reshape every monthly matrix to long form (one row per crime-type pair),
# tagged with its month, so the month can drive the animation slider.
frames = []

for period, corr in monthly_corrs_borough.items():
    # rename_axis returns a renamed copy, so the stored matrix is not modified
    melted = (
        corr.rename_axis(index='Crime A')
        .reset_index()
        .melt(id_vars='Crime A', var_name='Crime B', value_name='Correlation')
    )
    melted['Month'] = period
    frames.append(melted)

animated_df = pd.concat(frames)

fig = px.density_heatmap(
    animated_df,
    x='Crime A',
    y='Crime B',
    z='Correlation',
    animation_frame='Month',
    color_continuous_scale='RdBu_r',
    # Pearson r lies in [-1, 1]: a fixed range keeps 0 at the midpoint of the
    # diverging scale and makes colours comparable from frame to frame.
    range_color=[-1, 1],
)

fig.update_layout(
    title='Monthly Crime Type Correlation (Borough)',
    height=600,
    yaxis=dict(categoryorder='category descending')
)

fig.show()

#### Line plots of correlations for each crime type

In [None]:
# Plotting correlation for all crime type combinations
# Collect every crime type that appears in at least one monthly matrix
all_crimes = set()
for month_corr in monthly_corrs_borough.values():
    all_crimes.update(month_corr.columns)

for base_crime in sorted(all_crimes):
    # One record per (month, other crime type); months whose matrix lacks
    # base_crime contribute nothing.
    correlation_over_time = [
        {
            'Month': month,
            'Crime Type': crime,
            'Correlation': month_corr.at[base_crime, crime],
        }
        for month, month_corr in monthly_corrs_borough.items()
        if base_crime in month_corr.columns
        for crime in month_corr.columns
        if crime != base_crime
    ]

    # With no records the frame would have no 'Month' column to parse
    if not correlation_over_time:
        continue

    # Make df for the crime
    corr_df = pd.DataFrame(correlation_over_time)
    corr_df['Month'] = pd.to_datetime(corr_df['Month'])
    corr_df = corr_df.sort_values('Month')

    fig = px.line(
        corr_df,
        x='Month',
        y='Correlation',
        color='Crime Type',
        markers=True,
        title=f'{base_crime} Correlation with Other Crime Types Over Time (Borough)',
        # Pearson r can be negative; [0, 1] would cut those points off
        range_y=[-1, 1]
    )

    fig.update_layout(height=600)
    fig.show()

### Correlated crime pairs

In [None]:
def get_top_correlated_pairs(corr_matrix, top_n=5):
    """Return the ``top_n`` crime-type pairs with the largest |correlation|.

    Every unordered pair of distinct columns is listed once (upper triangle
    of the matrix) and ranked by absolute correlation, strongest first.

    NOTE: a later cell defines another function with this same name that
    ranks by signed correlation; once that cell has run, it replaces this one.

    Args:
        corr_matrix: Square correlation frame with crime types as both index
            and columns.
        top_n: Maximum number of pairs to return.

    Returns:
        DataFrame with columns 'Crime A', 'Crime B' and 'Correlation'.
    """
    matrix = corr_matrix.copy()
    labels = list(matrix.columns)

    ranked = [
        (first, second, matrix.loc[first, second])
        for pos, first in enumerate(labels)
        for second in labels[pos + 1:]
    ]
    ranked.sort(key=lambda entry: abs(entry[2]), reverse=True)

    return pd.DataFrame(ranked[:top_n], columns=['Crime A', 'Crime B', 'Correlation'])


In [None]:
# Yearly correlation matrices per spatial granularity: display label ->
# {year string: correlation DataFrame}, as built in the sections above.
yearly_granularities = {
    "LSOA": yearly_corrs_lsoa,
    "Ward": yearly_corrs_ward,
    "Borough": yearly_corrs_borough
}

def get_top_correlated_pairs(corr_matrix, top_n=5):
    """Return the ``top_n`` most positively correlated crime-type pairs.

    Every unordered pair of distinct columns is listed once (upper triangle
    of the matrix) and ranked by signed correlation, highest first.

    Args:
        corr_matrix: Square correlation frame with crime types as both index
            and columns, or None.
        top_n: Maximum number of pairs to return.

    Returns:
        DataFrame with columns 'Crime A', 'Crime B' and 'Correlation'. A
        column-less empty frame is returned when ``corr_matrix`` is None, and
        an empty frame with the three columns when the matrix has fewer than
        two crime types.
    """
    if corr_matrix is None:
        return pd.DataFrame()

    labels = list(corr_matrix.columns)
    pairs = [
        {
            'Crime A': labels[i],
            'Crime B': labels[j],
            'Correlation': corr_matrix.iloc[i, j],
        }
        for i in range(len(labels))
        for j in range(i + 1, len(labels))
    ]

    # Naming the columns up front keeps 'Correlation' present even when there
    # are no pairs, so sort_values no longer raises KeyError on tiny matrices.
    ranked = pd.DataFrame(pairs, columns=['Crime A', 'Crime B', 'Correlation'])
    return ranked.sort_values(by='Correlation', ascending=False).head(top_n)

# Show the top five pairs for every year at each granularity
for granularity, yearly_corr_dict in yearly_granularities.items():
    print(f"\n== Top correlated crime pairs by {granularity} ==")

    for year in yearly_corr_dict:
        print(f"\nYear: {year}")
        display(get_top_correlated_pairs(yearly_corr_dict[year], top_n=5))


### Spatial hotspots Burglary correlations

In [None]:
def compute_spatial_correlation(df, area_col, crimes_to_include, focus_crime='Burglary'):
    """Correlate per-area counts of ``focus_crime`` with other crime types.

    Records are counted per ``area_col`` value and 'Crime type'; the count
    column of ``focus_crime`` is then correlated with the count column of
    each requested crime type.

    Args:
        df: Frame with an ``area_col`` column and a 'Crime type' column.
        area_col: Name of the column identifying the area of each record.
        crimes_to_include: Crime types to correlate with ``focus_crime``.
            ``focus_crime`` itself and types absent from ``df`` are skipped.
        focus_crime: Crime type every other type is compared against.

    Returns:
        Float Series named 'Correlation', indexed by crime type. It is empty
        when ``focus_crime`` does not occur in ``df``.
    """
    pivot = df.groupby([area_col, 'Crime type']).size().unstack(fill_value=0)

    # Without a focus column there is nothing to correlate against; the
    # lookup below would otherwise raise KeyError.
    if focus_crime not in pivot.columns:
        return pd.Series(name='Correlation', dtype='float64')

    focus_counts = pivot[focus_crime]
    correlations = {
        crime: focus_counts.corr(pivot[crime])
        for crime in crimes_to_include
        if crime != focus_crime and crime in pivot.columns
    }

    return pd.Series(correlations, name='Correlation', dtype='float64')


In [None]:
def plot_spatial_correlation_map(df, gdf, area_col, area, focus_crime='Burglary'):
    """Show one choropleth per crime type of its joint intensity with ``focus_crime``.

    For every crime type in ``df`` other than ``focus_crime``, the records of
    the two types are counted per area and averaged into a 'Joint Intensity'
    value, which is drawn on a map of the ``gdf`` polygons. The value is the
    mean of two raw counts, not a correlation coefficient.

    Args:
        df: Crime records with an ``area_col`` column and a 'Crime type' column.
        gdf: GeoDataFrame of area polygons that also has an ``area_col`` column.
        area_col: Column naming the area, present in both ``df`` and ``gdf``.
        area: Granularity label shown in the figure title.
        focus_crime: Crime type paired with each of the other types.
    """
    # Named crime_types so it does not shadow the notebook-level `crimes` frame
    crime_types = df['Crime type'].unique()

    for other_crime in crime_types:
        if other_crime == focus_crime:
            continue

        # Filter only two crime types
        subset = df[df['Crime type'].isin([focus_crime, other_crime])]

        # Count crimes per area
        counts = subset.groupby([area_col, 'Crime type']).size().unstack(fill_value=0)

        # Skip if insufficient data
        if focus_crime not in counts.columns or other_crime not in counts.columns:
            continue

        # Calculate average intensity
        counts['Joint Intensity'] = (counts[focus_crime] + counts[other_crime]) / 2

        # Left merge keeps every polygon; areas without either crime type end
        # up with a missing 'Joint Intensity'.
        merged = gdf.merge(counts, left_on=area_col, right_index=True, how='left')

        fig = px.choropleth_map(
            merged,
            geojson=merged.geometry,
            locations=merged.index,
            color='Joint Intensity',
            color_continuous_scale='Viridis',
            map_style='carto-positron',
            zoom=9,
            center={"lat": 51.5072, "lon": -0.1276},
            opacity=0.7,
            # Use the requested focus crime; the title used to say "Burglary"
            # regardless of focus_crime.
            title=f"{focus_crime} & {other_crime} Intensity ({area})",
            height=600
        )
        fig.show()


In [None]:
# Burglary joint-intensity maps at LSOA level
plot_spatial_correlation_map(
    df=gdf_joined_lsoa,
    gdf=LSOAs,
    area_col='LSOA11NM',
    area="LSOA",
    focus_crime='Burglary',
)

In [None]:
# Burglary joint-intensity maps at ward level
plot_spatial_correlation_map(
    df=gdf_joined_ward,
    gdf=wards,
    area_col='Name',
    area="Ward",
    focus_crime='Burglary',
)

In [None]:
# Burglary joint-intensity maps at borough level
plot_spatial_correlation_map(
    df=gdf_joined_borough,
    gdf=boroughs,
    area_col='Name',
    area="Borough",
    focus_crime='Burglary',
)

### Conclusions

I examined the correlation between the different crime types. To do so, I generated and analyzed correlation heatmaps for the full dataset, for yearly data, and for monthly data at three granularity levels: LSOA, Ward, and Borough (for each level I also created a heatmap with a monthly slider). Furthermore, for each crime type I generated a line graph showing its monthly correlation with every other crime type, again at all three granularity levels. Finally, I generated spatial correlation maps visualizing the combined intensity of Burglary and each other crime type at all three granularity levels.

Based on the above described generated visuals, I conclude the following:
- The correlation between 'Other theft', 'Theft from the person', and 'Robbery' is very high and stable across all granularities and timeframes
- 'Other crime' has a very low correlation with all other crime types
- The correlation between 'Vehicle crime' and 'Bicycle theft' is very low and stable across all granularities and timeframes
- Crime correlation analysis on Borough level is rather difficult since larger area means higher frequency for all crimes, thus resulting in high correlation values
- There do not seem to be any crime pairs whose correlation shows a notable temporal pattern

For Burglary, the spatial correlation maps do not seem very useful. High-intensity areas mostly arise because one of the two crime types is very frequent there, which gives a false impression of correlation. This is likely because the analysis uses raw correlation on crime counts, so areas with more crime in general (Central London, for example) can also show a high level of correlation. The correlation may therefore mostly reflect general crime intensity rather than a meaningful relationship between crime types. In conclusion, there does not seem to be any significant correlation between Burglary and any other crime type (when absolute crime counts are used to compute the correlation).