# London Crime CNN Classifier
Predicting **Crime type** using TensorFlow/Keras Conv1D Neural Network


In [1]:
# Libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Conv1D, GlobalMaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import seaborn as sns

2025-12-15 15:43:38.575972: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-15 15:43:39.239522: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-15 15:43:41.390267: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# GPU configuration
# Reports the TensorFlow version and visible GPUs, enables per-GPU memory
# growth, and selects the global Keras dtype policy.
from tensorflow.keras import mixed_precision

print("TensorFlow version:", tf.__version__)

# Query the device list once and reuse it for both the report and the setup.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

# Enable memory growth to prevent TensorFlow from allocating all GPU memory at once
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU(s) configured: {[gpu.name for gpu in gpus]}")
    except RuntimeError as e:
        # Report the configuration problem and carry on with TensorFlow's
        # default allocation behaviour.
        print(e)
else:
    print("No GPU found, running on CPU")

# Mixed precision only pays off on an accelerator; on a CPU-only machine
# float16 compute is slower than float32, so fall back to the default policy.
policy_name = 'mixed_float16' if gpus else 'float32'
policy = mixed_precision.Policy(policy_name)
mixed_precision.set_global_policy(policy)
print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)

TensorFlow version: 2.20.0
Num GPUs Available:  1
GPU(s) configured: ['/physical_device:GPU:0']
Compute dtype: float16
Variable dtype: float32


## Load Dataset

In [None]:
# Read the merged crime dataset (tab-separated despite the .csv extension).
# NOTE(review): on_bad_lines="skip" silently discards malformed rows, so the
# frame may hold fewer records than the file has lines.
dataset_path = "Dataset/mergedCRDataset.csv"
df = pd.read_csv(
    dataset_path,
    sep="\t",
    engine="python",
    on_bad_lines="skip",
)

# Preview the first rows as the cell output.
df.head()

## Data Preprocessing

In [None]:
# Normalise header whitespace first: the "Crime ID" lookup below would miss a
# header that was read with stray leading/trailing spaces.
df.columns = df.columns.str.strip()
print(df.columns)

# Drop duplicate rows (reassigned rather than mutated in place so the cell
# can be re-run safely).
df = df.drop_duplicates()

# Drop the Crime ID: it is a record identifier, not a predictive feature.
# errors="ignore" keeps the cell idempotent once the column is already gone.
df = df.drop(columns=["Crime ID"], errors="ignore")

In [5]:
# Trim leading/trailing whitespace from every column name.
# WHY: headers read from CSV files can carry stray spaces, and a padded name
# will not match the exact labels used for column access later in the notebook.
df.columns = df.columns.map(str.strip)

In [6]:
# Fill missing coordinates in two passes:
#   1. the median coordinate of the row's geographic group,
#   2. the dataset-wide median for anything still missing.
numeric_cols = ['Longitude', 'Latitude']
group_col = 'LSOA name'   # adjust this to the most relevant geographic field

# One row per group, one column per coordinate.
group_medians = df.groupby(group_col)[numeric_cols].median()

# Look up each row's group median by its group label and use it only where
# the coordinate is missing. Rows whose group label is missing, or whose
# group has no coordinates at all, stay NaN at this stage.
for coord in numeric_cols:
    median_for_row = df[group_col].map(group_medians[coord])
    df[coord] = df[coord].fillna(median_for_row)

# Fallback to the global median for whatever the group pass could not fill.
global_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(global_medians)


# Map each crime-type label to an integer class id and record how many
# distinct classes the encoded column holds.
target_col = 'Crime type'
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(df[target_col])
df['target_encoded'] = encoded_target
num_classes = df['target_encoded'].nunique()


In [None]:
# Assemble the model inputs (X) and one-hot targets (y).
numeric_cols = ['Longitude', 'Latitude']
categorical_cols = ['Reported by', 'Falls within']   # safe one-hot features

# One-hot encode selected categorical features
df_encoded = pd.get_dummies(df[categorical_cols])

# Concatenate numeric + categorical (numeric columns first, so later cells
# can address them as the leading len(numeric_cols) columns).
# Request float64 explicitly: float coordinates mixed with boolean dummy
# columns otherwise come back from `.values` as an object-dtype array, which
# stores every cell as a Python object and is far heavier than a numeric array.
X = pd.concat([df[numeric_cols], df_encoded], axis=1).to_numpy(dtype="float64")

# Target already label-encoded earlier
y = to_categorical(df['target_encoded'], num_classes=num_classes)

print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")


In [None]:
# Scale the coordinate columns to zero mean / unit variance. They occupy the
# leading columns of X; the one-hot columns after them are left untouched.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split in the next section, so test-set statistics influence the scaling.
n_numeric = len(numeric_cols)
scaler = StandardScaler()
X[:, :n_numeric] = scaler.fit_transform(X[:, :n_numeric])

# Cast to float32 for the network.
X = X.astype("float32")

# Conv1D expects (samples, timesteps, channels): treat each feature as one
# timestep carrying a single channel.
X = X[:, :, np.newaxis]
print(f"Reshaped X for Conv1D: {X.shape}")

## Train/Test Split

In [9]:
# Hold out 20% of the samples for testing, keeping class proportions equal
# in both parts (stratified on the one-hot target) with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

## Model Definition

In [None]:
# Conv1D classifier: three convolutional blocks, global max pooling, then a
# two-layer dense head ending in a softmax over the crime types.
model = Sequential()
model.add(Input(shape=X_train.shape[1:3]))

# Convolutional blocks as (filters, dropout rate); the last block has no
# dropout because it feeds straight into the pooling layer.
conv_blocks = [(64, 0.3), (128, 0.3), (256, None)]
for n_filters, drop_rate in conv_blocks:
    model.add(Conv1D(filters=n_filters, kernel_size=3, activation='relu', padding='same'))
    model.add(BatchNormalization())
    if drop_rate is not None:
        model.add(Dropout(drop_rate))

# Collapse the timestep axis: keep the maximum activation of each filter.
model.add(GlobalMaxPooling1D())

# Dense head
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

# One output unit per class; kept in float32 for numerical stability of the
# softmax regardless of the global dtype policy.
model.add(Dense(y.shape[1], activation='softmax', dtype='float32'))

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Make sure the output directory for the saved model exists.
save_path = "Model"
os.makedirs(save_path, exist_ok=True)

## Training

In [None]:
# Train the network, then persist it to disk.
#
# Validation uses a slice held out of the TRAINING data instead of the test
# set. EarlyStopping and ReduceLROnPlateau both steer training from val_loss;
# feeding them the test set would tune the model on the very data used for
# the final evaluation and inflate the reported scores.
training_callbacks = [
    # Stop after 10 epochs without val_loss improvement and roll back to the
    # best weights seen.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    ),
    # Halve the learning rate after 5 stagnant epochs, down to 1e-7.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7
    ),
]

history = model.fit(
    X_train, y_train,
    # Keras takes this fraction from the end of the arrays; the rows were
    # already shuffled by train_test_split, so the slice is a random sample.
    validation_split=0.1,
    epochs=50,
    batch_size=512,  # Larger batch size for better GPU utilization
    verbose=1,
    callbacks=training_callbacks,
)

# Save the trained model and report the path that was actually written.
model_path = os.path.join(save_path, "network.h5")
model.save(model_path)
print(f"Model saved to {model_path}")

In [None]:
# Quick sanity check of the trained model on the held-out test set.

# Each row of pred_hot is a probability distribution over the classes.
pred_hot = model.predict(X_test)

# Turn probabilities / one-hot targets into class indices.
pred = pred_hot.argmax(axis=1)
y_compare = y_test.argmax(axis=1)

# Fraction of test samples whose predicted class matches the true class.
score = metrics.accuracy_score(y_compare, pred)
print("Accuracy score: {}".format(score))

# Show a few raw probability rows and the predicted class indices.
print(pred_hot[:5])
print(pred)

## Evaluation

In [None]:
# Per-class precision / recall / F1 on the test set, labelled with the
# original crime-type names.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

report = classification_report(
    y_true_classes,
    y_pred_classes,
    target_names=encoder.classes_,
)
print(report)

In [None]:
# Comprehensive evaluation of the Keras model on the test set.
# Produces all_probs / all_preds / all_true plus the summary scores
# (macro_f1, weighted_f1, macro_recall, per_class_recall, cm, accuracy)
# that the plotting cells below read.
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score

# Class probabilities from the softmax output, then hard class indices.
all_probs = model.predict(X_test)
all_preds = np.argmax(all_probs, axis=1)
all_true = np.argmax(y_test, axis=1)  # y_test is one-hot encoded

print("=" * 80)
print("COMPREHENSIVE MODEL EVALUATION METRICS")
print("=" * 80)
print("\n Classification Report:")
print(classification_report(all_true, all_preds))

# 1. F1 scores
macro_f1 = f1_score(all_true, all_preds, average='macro', zero_division=0)
weighted_f1 = f1_score(all_true, all_preds, average='weighted', zero_division=0)
print("\n 1. F1 SCORES:")
print(f"   Macro F1 (treats all classes equally): {macro_f1:.4f}")
print(f"   Weighted F1 (accounts for class imbalance): {weighted_f1:.4f}")

# 2. Recall (sensitivity), overall and per class
macro_recall = recall_score(all_true, all_preds, average='macro', zero_division=0)
weighted_recall = recall_score(all_true, all_preds, average='weighted', zero_division=0)
per_class_recall = recall_score(all_true, all_preds, average=None, zero_division=0)
print("\n 2. RECALL SCORES (Detection Rate):")
print(f"   Macro Recall: {macro_recall:.4f}")
print(f"   Weighted Recall: {weighted_recall:.4f}")
print("   Per-class Recall:")
for i, recall in enumerate(per_class_recall):
    print(f"      Class {i}: {recall:.4f}")

# 3. Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(all_true, all_preds)
print("\n 3. Confusion Matrix")
print(cm)

# 4. AUROC / ROC-AUC
# roc_auc_score raises ValueError when the score cannot be defined (for
# example a class missing from the test labels); report that and continue.
try:
    if num_classes == 2:
        macro_auc = roc_auc_score(all_true, all_probs[:, 1])
        auc_lines = [f"   Binary AUC: {macro_auc:.4f}"]
    else:
        macro_auc = roc_auc_score(all_true, all_probs, multi_class='ovr', average='macro')
        weighted_auc = roc_auc_score(all_true, all_probs, multi_class='ovr', average='weighted')
        auc_lines = [
            f"   Macro AUC (One-vs-Rest): {macro_auc:.4f}",
            f"   Weighted AUC: {weighted_auc:.4f}",
        ]

    print("\n 4. AUROC / ROC-AUC Score:")
    for auc_line in auc_lines:
        print(auc_line)
except ValueError as e:
    print("\n 4. AUROC / ROC-AUC:")
    print(f"   Could not compute AUC: {str(e)}")

# Overall accuracy
accuracy = accuracy_score(all_true, all_preds)
print(f"\n Overall Accuracy: {accuracy:.4f}")
print("=" * 80)

## Data Analysis / Visualization

In [None]:
# 2x2 evaluation overview: raw and normalised confusion matrices, the
# training curves, and per-class recall. Reads cm, per_class_recall and
# macro_recall from the metrics cell and the Keras `history` from training.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Confusion Matrix Heatmap
ax1 = axes[0, 0]
# Normalise each row by its true-class count. A class that is predicted but
# never occurs as a true label has a zero row sum; leave that row at 0
# instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar_kws={'label': 'Count'})
ax1.set_title('Confusion Matrix (Raw Counts)', fontsize=14, fontweight='bold')
ax1.set_ylabel('True Label', fontsize=12)
ax1.set_xlabel('Predicted Label', fontsize=12)

# 2. Normalized Confusion Matrix
ax2 = axes[0, 1]
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='RdYlGn', ax=ax2, cbar_kws={'label': 'Recall %'})
ax2.set_title('Confusion Matrix (Normalized by True Label)', fontsize=14, fontweight='bold')
ax2.set_ylabel('True Label', fontsize=12)
ax2.set_xlabel('Predicted Label', fontsize=12)

# 3. Training Loss and Validation Accuracy
# Per-epoch curves come from the History object returned by model.fit();
# 'val_accuracy' exists because the model was compiled with
# metrics=['accuracy'] and trained with validation data.
loss_across_epochs = history.history['loss']
val_accuracy_across_epochs = history.history['val_accuracy']

ax3 = axes[1, 0]
epochs_range = range(1, len(loss_across_epochs) + 1)
ax3_twin = ax3.twinx()  # second y-axis: loss and accuracy have different scales
line1 = ax3.plot(epochs_range, loss_across_epochs, 'b-o', label='Training Loss', linewidth=2, markersize=6)
line2 = ax3_twin.plot(epochs_range, val_accuracy_across_epochs, 'g-s', label='Validation Accuracy', linewidth=2, markersize=6)
ax3.set_xlabel('Epoch', fontsize=12)
ax3.set_ylabel('Loss', fontsize=12, color='b')
ax3_twin.set_ylabel('Accuracy', fontsize=12, color='g')
ax3.tick_params(axis='y', labelcolor='b')
ax3_twin.tick_params(axis='y', labelcolor='g')
ax3.grid(True, alpha=0.3)
ax3.set_title('Training Loss vs Validation Accuracy', fontsize=14, fontweight='bold')

# Combine legends from both y-axes into one box
lines = line1 + line2
labels = [l.get_label() for l in lines]
ax3.legend(lines, labels, loc='upper left', fontsize=10)

# 4. Per-Class Recall Bar Chart
ax4 = axes[1, 1]
class_names = [f'Class {i}' for i in range(len(per_class_recall))]
# Traffic-light colouring: > 0.5 green, > 0.3 orange, otherwise red.
colors = ['green' if r > 0.5 else 'orange' if r > 0.3 else 'red' for r in per_class_recall]
bars = ax4.bar(class_names, per_class_recall, color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax4.axhline(y=macro_recall, color='r', linestyle='--', linewidth=2, label=f'Macro Recall: {macro_recall:.4f}')
ax4.set_ylabel('Recall', fontsize=12)
ax4.set_title('Per-Class Recall (Detection Rate)', fontsize=14, fontweight='bold')
ax4.set_ylim([0, 1])
ax4.legend(fontsize=10)
ax4.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, recall in zip(bars, per_class_recall):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height,
             f'{recall:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig('model_evaluation_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:

# Additional metrics visualization: F1 comparison and macro-averaged summary.
# Reads macro_f1, weighted_f1, macro_recall, accuracy, all_true and all_preds
# from the metrics cell.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 1. F1 Scores Comparison
ax1 = axes[0]
# Named `metric_names`, not `metrics`: `metrics` is the sklearn module
# imported at the top of the notebook, and rebinding it to a list would break
# the `metrics.accuracy_score(...)` call in the prediction cell on a re-run.
metric_names = ['Macro F1', 'Weighted F1']
scores = [macro_f1, weighted_f1]
colors_metrics = ['#FF6B6B', '#4ECDC4']
bars = ax1.bar(metric_names, scores, color=colors_metrics, alpha=0.8, edgecolor='black', linewidth=2)
ax1.set_ylabel('F1 Score', fontsize=12, fontweight='bold')
ax1.set_title('F1 Scores Comparison', fontsize=14, fontweight='bold')
ax1.set_ylim([0, 1])
ax1.grid(True, alpha=0.3, axis='y')

# Value label above each bar
for bar, score in zip(bars, scores):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
             f'{score:.4f}', ha='center', va='bottom', fontsize=12, fontweight='bold')

# 2. Macro Metrics Summary
ax2 = axes[1]
macro_precision = precision_score(all_true, all_preds, average='macro', zero_division=0)
summary_metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
summary_values = [accuracy, macro_precision, macro_recall, macro_f1]
colors_summary = ['#95E1D3', '#F38181', '#AA96DA', '#FCBAD3']

bars = ax2.barh(summary_metrics, summary_values, color=colors_summary, alpha=0.8, edgecolor='black', linewidth=2)
ax2.set_xlabel('Score', fontsize=12, fontweight='bold')
ax2.set_title('Macro-Averaged Metrics Summary', fontsize=14, fontweight='bold')
ax2.set_xlim([0, 1])
ax2.grid(True, alpha=0.3, axis='x')

# Value label at the end of each horizontal bar
for bar, value in zip(bars, summary_values):
    width = bar.get_width()
    ax2.text(width, bar.get_y() + bar.get_height()/2.,
             f'{value:.4f}', ha='left', va='center', fontsize=11, fontweight='bold',
             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

plt.tight_layout()
plt.savefig('metrics_summary.png', dpi=300, bbox_inches='tight')
plt.show()



In [None]:

# ROC curves (one-vs-rest for multi-class) and a per-class metrics heatmap.
# Reads all_true, all_probs, all_preds and num_classes from earlier cells.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# 1. ROC Curve (One-vs-Rest)
ax1 = axes[0]
if num_classes > 2:
    # Multi-class: binarize the labels so each class gets its own
    # "this class vs the rest" curve.
    all_true_bin = label_binarize(all_true, classes=np.arange(num_classes))

    # Only the first few classes (by encoded index) are drawn, for clarity.
    n_roc_classes = min(num_classes, 5)
    colors = plt.cm.Set3(np.linspace(0, 1, num_classes))
    auc_scores = []

    for i in range(n_roc_classes):
        fpr, tpr, _ = roc_curve(all_true_bin[:, i], all_probs[:, i])
        roc_auc = auc(fpr, tpr)
        auc_scores.append(roc_auc)
        ax1.plot(fpr, tpr, color=colors[i], lw=2, label=f'Class {i} (AUC = {roc_auc:.3f})')

    # Micro-average: pool every (sample, class) decision into one curve.
    fpr, tpr, _ = roc_curve(all_true_bin.ravel(), all_probs.ravel())
    roc_auc_micro = auc(fpr, tpr)
    ax1.plot(fpr, tpr, color='deeppink', lw=3, linestyle=':', label=f'Micro-average (AUC = {roc_auc_micro:.3f})')

    # The plotted classes are the first ones by index, not the best-scoring
    # ones, so the title says "First" rather than "Top".
    roc_title = f'ROC Curves (One-vs-Rest) - First {n_roc_classes} Classes'
    roc_legend_fontsize = 9
else:
    # Binary classification: score with the probability of class 1.
    fpr, tpr, _ = roc_curve(all_true, all_probs[:, 1])
    roc_auc = auc(fpr, tpr)
    ax1.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')

    roc_title = 'ROC Curve'
    roc_legend_fontsize = 10

# Axis formatting shared by both branches.
ax1.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.05])
ax1.set_xlabel('False Positive Rate', fontsize=11, fontweight='bold')
ax1.set_ylabel('True Positive Rate', fontsize=11, fontweight='bold')
ax1.set_title(roc_title, fontsize=13, fontweight='bold')
ax1.legend(loc="lower right", fontsize=roc_legend_fontsize)
ax1.grid(True, alpha=0.3)

# 2. Per-Class Metrics Heatmap
ax2 = axes[1]
precision, recall, f1, support = precision_recall_fscore_support(all_true, all_preds,
                                                                   average=None, zero_division=0)

# One row per class, columns = precision / recall / F1.
metrics_data = np.array([precision, recall, f1]).T

# Draw only the rows that are also labelled and annotated; plotting every
# class while labelling the first ten would leave unlabelled rows at the
# bottom of the heatmap.
n_heatmap_classes = min(len(precision), 10)
im = ax2.imshow(metrics_data[:n_heatmap_classes], cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

# Set ticks and labels
ax2.set_xticks([0, 1, 2])
ax2.set_xticklabels(['Precision', 'Recall', 'F1'], fontsize=11, fontweight='bold')
ax2.set_yticks(range(n_heatmap_classes))
ax2.set_yticklabels([f'Class {i}' for i in range(n_heatmap_classes)], fontsize=10)
ax2.set_title(f'Per-Class Metrics Heatmap (First {n_heatmap_classes} Classes)', fontsize=13, fontweight='bold')

# Add text annotations
for i in range(n_heatmap_classes):
    for j in range(3):
        ax2.text(j, i, f'{metrics_data[i, j]:.2f}',
                 ha="center", va="center", color="black", fontsize=9, fontweight='bold')

cbar = plt.colorbar(im, ax=ax2)
cbar.set_label('Score', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('roc_and_metrics_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()