1) Mount Google Drive in Colab.

In [None]:
# Mount Google Drive at /content/drive; every dataset and result path used in
# the later cells lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


2) Inspect the thoroughly cleaned train dataset.

In [None]:
import pandas as pd

# Read the cleaned training split from Drive.
train_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Datasets/crime_train_final.csv"
df = pd.read_csv(train_dir)

# Show the dtype pandas inferred for every column.
print(df.dtypes)

# Partition the columns by dtype family: 64-bit numerics versus everything
# stored as text, category or boolean.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'category', 'bool']
num_cols = df.select_dtypes(include=numeric_dtypes).columns
cat_cols = df.select_dtypes(include=categorical_dtypes).columns

print(f"Numerical columns ({len(num_cols)}): {list(num_cols)}")
print(f"Categorical columns ({len(cat_cols)}): {list(cat_cols)}")

highest_offense_description     object
highest_offense_code             int64
family_violence                 object
occurred_date_time_year          int64
sine_odt_month                 float64
cosine_odt_month               float64
sine_odt_week_of_year          float64
cosine_odt_week_of_year        float64
sine_odt_day                   float64
cosine_odt_day                 float64
sine_odt_day_of_week           float64
cosine_odt_day_of_week         float64
sine_odt_hour                  float64
cosine_odt_hour                float64
sine_odt_minute                float64
cosine_odt_minute              float64
sine_occurred_time             float64
cosine_occurred_time           float64
sine_report_time               float64
cosine_report_time             float64
report_date_time_year            int64
sine_rdt_week_of_year          float64
cosine_rdt_week_of_year        float64
sine_rdt_day                   float64
cosine_rdt_day                 float64
sine_rdt_hour            

3) Convert only the apd_district and highest_offense_code columns to categorical.

In [None]:
# Columns whose values are labels rather than quantities.
cat_convert = ['apd_district', 'highest_offense_code']

# Cast column by column so every other dtype in the frame is left alone.
for col in cat_convert:
    as_category = df[col].astype('category')
    df[col] = as_category

# Confirm the new dtypes of just the converted columns.
print(df.dtypes[cat_convert])

apd_district            category
highest_offense_code    category
dtype: object


4) Load and inspect the test dataset.

In [None]:
import pandas as pd

# Read the cleaned test split from Drive.
test_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Datasets/crime_test_cleaned.csv"
dft = pd.read_csv(test_dir)

# Show the dtype pandas inferred for every column.
print(dft.dtypes)

# Partition the columns by dtype family: 64-bit numerics versus everything
# stored as text, category or boolean. This overwrites the num_cols / cat_cols
# computed for the training frame.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'category', 'bool']
num_cols = dft.select_dtypes(include=numeric_dtypes).columns
cat_cols = dft.select_dtypes(include=categorical_dtypes).columns

print(f"Numerical columns ({len(num_cols)}): {list(num_cols)}")
print(f"Categorical columns ({len(cat_cols)}): {list(cat_cols)}")

highest_offense_description     object
highest_offense_code             int64
family_violence                 object
occurred_date_time_year          int64
sine_odt_month                 float64
cosine_odt_month               float64
sine_odt_week_of_year          float64
cosine_odt_week_of_year        float64
sine_odt_day                   float64
cosine_odt_day                 float64
sine_odt_day_of_week           float64
cosine_odt_day_of_week         float64
sine_odt_hour                  float64
cosine_odt_hour                float64
sine_odt_minute                float64
cosine_odt_minute              float64
sine_occurred_time             float64
cosine_occurred_time           float64
sine_report_time               float64
cosine_report_time             float64
report_date_time_year            int64
sine_rdt_week_of_year          float64
cosine_rdt_week_of_year        float64
sine_rdt_day                   float64
cosine_rdt_day                 float64
sine_rdt_hour            

5) Convert the same two columns to categorical in the test dataset, since it has the same dtype problem as the train dataset.

In [None]:
# Columns whose values are labels rather than quantities.
cat_convert = ['apd_district', 'highest_offense_code']

# Cast column by column so every other dtype in the test frame is left alone.
for col in cat_convert:
    as_category = dft[col].astype('category')
    dft[col] = as_category

# Confirm the new dtypes of just the converted columns.
print(dft.dtypes[cat_convert])

apd_district            category
highest_offense_code    category
dtype: object


6) Check for missing values (NAs) in both the train and test datasets.

In [None]:
import pandas as pd

# Load train
train_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Datasets/crime_train_final.csv"
df_train = pd.read_csv(train_dir)

# Load test
test_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Datasets/crime_test_cleaned.csv"
df_test = pd.read_csv(test_dir)

# A frame has missing data as soon as a single cell is flagged by isna().
train_has_na = df_train.isna().to_numpy().any()
test_has_na = df_test.isna().to_numpy().any()

# Check for NAs in train
print("There are missing values in train dataset." if train_has_na else "No NAs in train dataset.")

# Check for NAs in test
print("There are missing values in test dataset." if test_has_na else "No NAs in test dataset.")

No NAs in train dataset.
No NAs in test dataset.


7) Modelling - HistGradientBoosting

Note: Balanced accuracy and recall were not considered useful metrics for this task, so although they are computed below, they were not used in the report.

In [None]:
# Print the installed scikit-learn version so it is recorded in the cell output
# next to the results produced below.
import sklearn
print(sklearn.__version__)
from sklearn.ensemble import HistGradientBoostingClassifier

1.6.1


In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    balanced_accuracy_score, confusion_matrix
)
from sklearn.ensemble import HistGradientBoostingClassifier

# --- Set seed for reproducibility ---
# One seed is shared by Python's `random`, NumPy's global RNG and, further
# down in this cell, train_test_split and the classifier (random_state=SEED).
SEED = 1906525
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes — confirm it is needed.
os.environ['PYTHONHASHSEED'] = str(SEED)

# --- Paths ---
# Two input CSVs plus three output folders: the metrics summary, the
# confusion-matrix images and the sampled test predictions.
train_path = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Datasets/crime_train_final.csv"
test_path = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Datasets/crime_test_cleaned.csv"
perf_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Results/HistGradientBoosting/Performance Metrics"
cm_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Results/HistGradientBoosting/Confusion Matrices"
test_pred_dir = "/content/drive/MyDrive/Big Data Analysis and Project - a1906525/Results/HistGradientBoosting/Test Results"
# Create the output folders up front; exist_ok makes re-running the cell safe.
for d in [perf_dir, cm_dir, test_pred_dir]:
    os.makedirs(d, exist_ok=True)

# --- Data load ---
# Both splits are re-read from disk, so this cell does not depend on the frames
# created (and partly re-typed) in the earlier inspection cells.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Columns that are cast to pandas `category` dtype further down, whatever dtype
# pandas inferred for them when reading the CSV.
cat_cols = [
    'highest_offense_description', 'family_violence', 'location_type',
    'apd_sector', 'apd_district', 'highest_offense_code'
]
# Column to predict; every other column of the training frame is used as a feature.
target = "clearance_status"
features = [c for c in df_train.columns if c != target]

# --- Bucketing high-cardinality categorical columns ---
def cap_categories(series, max_cats=250, other_label='Other'):
    """Collapse all but the most frequent values of ``series`` into one bucket.

    Parameters
    ----------
    series : pandas.Series
        Column to cap.
    max_cats : int, default 250
        Number of most frequent distinct values that are kept unchanged.
    other_label : default 'Other'
        Replacement for every value that is not among the kept ones.

    Returns
    -------
    pandas.Series
        Same index as ``series``, with the non-kept values replaced by
        ``other_label``.
    """
    # value_counts() orders by descending frequency, so the head of its index
    # is the set of values to keep.
    top_values = series.value_counts().index[:max_cats]
    is_rare = ~series.isin(top_values)
    return series.mask(is_rare, other_label)

# Bucket rare categories. The set of values to keep is learned from the
# TRAINING frame only and then applied unchanged to the test frame. Ranking the
# test frame by its own frequencies (the previous behaviour) could leave a raw
# value untouched in train but turn it into 'Other' in test, or the reverse, so
# the model would see a different encoding at prediction time.
for col in ['highest_offense_description', 'highest_offense_code']:
    train_values = df_train[col].astype(str)
    test_values = df_test[col].astype(str)

    df_train[col] = cap_categories(train_values, 250)

    # Every value that survived capping in train, plus the bucket label itself.
    kept_values = set(df_train[col].unique()) | {'Other'}
    df_test[col] = test_values.where(test_values.isin(kept_values), 'Other')

# Cast every categorical feature and the target (including int/float columns)
# to string and then to pandas `category` dtype for sklearn HGB, and give the
# train and test frames one identical, sorted category list per column.
for c in cat_cols + [target]:
    df_train[c] = df_train[c].astype(str).astype('category')
    df_test[c] = df_test[c].astype(str).astype('category')

    # Union of the values seen in either frame, in sorted order.
    train_cats = set(df_train[c].cat.categories)
    test_cats = set(df_test[c].cat.categories)
    all_cats = pd.Index(sorted(train_cats | test_cats))

    df_train[c] = df_train[c].cat.set_categories(all_cats)
    df_test[c] = df_test[c].cat.set_categories(all_cats)

# --- Split train/val ---
# 80/20 split of the training frame, stratified on the target column.
train, val = train_test_split(
    df_train,
    test_size=0.2,
    stratify=df_train[target],
    random_state=SEED,
)

# Re-apply category dtype and df_train's full category list to both partitions.
for c in cat_cols + [target]:
    full_cats = df_train[c].cat.categories
    train[c] = train[c].astype('category').cat.set_categories(full_cats)
    val[c] = val[c].astype('category').cat.set_categories(full_cats)

# Feature matrices get a fresh 0..n-1 index; targets become integer category codes.
splits = (train, val, df_test)
X_train, X_val, X_test = (
    part[features].copy().reset_index(drop=True) for part in splits
)
y_train, y_val, y_test = (
    part[target].cat.codes.values.ravel() for part in splits
)

# Ensure categorical dtype in X matrices
for c in cat_cols:
    for X_part in (X_train, X_val, X_test):
        X_part[c] = X_part[c].astype('category')

# Class names, ordered so that labels[code] is the name behind an integer code.
labels = train[target].cat.categories.tolist()
n_classes = len(labels)

print("\nSplit sizes:")
print("  X_train", X_train.shape, "y_train", y_train.shape)
print("  X_val", X_val.shape, "y_val", y_val.shape)
print("  X_test", X_test.shape, "y_test", y_test.shape)

# --- Metric functions ---
def compute_metrics(y_true, y_pred, labels):
    """Build the confusion matrix and, for two classes, specificity/sensitivity.

    Parameters
    ----------
    y_true, y_pred : array-like of integer class codes.
    labels : sequence of class names; only its length is used, to fix the
        matrix to codes ``0 .. len(labels) - 1``.

    Returns
    -------
    tuple
        ``(cm, spec, sens)``. When the matrix is not 2x2, ``spec`` and ``sens``
        are both returned as the placeholder ``0``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=range(len(labels)))

    if cm.shape != (2, 2):
        # Not a binary problem: no single specificity/sensitivity is defined here.
        return cm, 0, 0

    # Row-major unpacking of [[tn, fp], [fn, tp]], with code 1 as the positive class.
    tn, fp, fn, tp = cm.ravel()
    actual_negatives = tn + fp
    actual_positives = tp + fn
    spec = tn / actual_negatives if actual_negatives else 0
    sens = tp / actual_positives if actual_positives else 0
    return cm, spec, sens

def get_metrics(y_true, y_pred, probs, labels):
    """Score one set of predictions against the true class codes.

    Parameters
    ----------
    y_true : array of integer class codes; also used as column indices into
        ``probs``.
    y_pred : array of predicted class codes.
    probs : 2-D array of predicted class probabilities, one row per sample.
    labels : sequence of class names; its length selects binary (2 classes) or
        macro averaging, and it is passed on to ``compute_metrics``.

    Returns
    -------
    tuple
        ``(accuracy, log_loss, precision, recall, f1, balanced_accuracy,
        specificity, sensitivity, confusion_matrix)``.
    """
    acc = accuracy_score(y_true, y_pred)

    n_rows = len(y_true)
    if probs.shape[0] != n_rows:
        # Keep only the first n_rows probability rows so the indexing below lines up.
        probs = probs[:n_rows, :]

    # Mean negative log-probability given to the true class; the small
    # constant keeps log() finite when that probability is exactly 0.
    true_class_prob = probs[np.arange(n_rows), y_true]
    loss = -np.mean(np.log(true_class_prob + 1e-12))

    avg = 'macro' if len(labels) != 2 else 'binary'
    shared_kwargs = {'average': avg, 'zero_division': 0}
    prec = precision_score(y_true, y_pred, **shared_kwargs)
    rec = recall_score(y_true, y_pred, **shared_kwargs)
    f1 = f1_score(y_true, y_pred, **shared_kwargs)
    bal = balanced_accuracy_score(y_true, y_pred)

    cm, spec, sens = compute_metrics(y_true, y_pred, labels)
    return acc, loss, prec, rec, f1, bal, spec, sens, cm

# --- Hyperparameter grid ---
# Exhaustive sweep: one model is fitted per (max_depth, max_iter) pair, i.e.
# 5 x 5 = 25 fits. Each run appends one row to all_results, writes one
# confusion-matrix PNG and one CSV with 15 sampled test predictions.
iterations_list = [50, 100, 200, 400, 800]       # max_iter
depth_list = [3, 5, 7, 10, None]                 # max_depth
all_results = []

for dep in depth_list:
    for ite in iterations_list:
        print(f"\nRunning: {ite} iterations for depth {dep}")
        # Only max_iter and max_depth vary; learning rate and L2 penalty are fixed.
        model = HistGradientBoostingClassifier(
            max_iter=ite,
            max_depth=dep,
            learning_rate=0.1,
            l2_regularization=0.0,
            random_state=SEED
        )
        model.fit(X_train, y_train)

        # Hard predictions and class probabilities for all three splits.
        preds_train = model.predict(X_train)
        preds_val = model.predict(X_val)
        preds_test = model.predict(X_test)
        probs_train = model.predict_proba(X_train)
        probs_val = model.predict_proba(X_val)
        probs_test = model.predict_proba(X_test)

        # Shape printout to verify predictions, targets and probabilities line up.
        print("  preds_train:", preds_train.shape, "y_train:", y_train.shape, "probs_train:", probs_train.shape)
        print("  preds_val:", preds_val.shape, "y_val:", y_val.shape, "probs_val:", probs_val.shape)
        print("  preds_test:", preds_test.shape, "y_test:", y_test.shape, "probs_test:", probs_test.shape)

        # Train and validation keep only accuracy and loss; the full metric set
        # (precision, recall, F1, ...) is taken from the test split.
        # NOTE(review): every grid point is scored on the test split, so choosing
        # the best configuration from these rows would make the reported test
        # figures optimistic — consider selecting on the validation columns.
        ta, tl, _, _, _, _, _, _, _ = get_metrics(y_train, preds_train, probs_train, labels)
        va, vl, _, _, _, _, _, _, _ = get_metrics(y_val, preds_val, probs_val, labels)
        test_acc, test_loss, prec, rec, f1, bal_acc, spec, sens, cm = get_metrics(
            y_test, preds_test, probs_test, labels)

        all_results.append({
            'Model': 'HistGradientBoosting', 'depth': dep, 'iterations': ite,
            'train_accuracy': ta, 'val_accuracy': va, 'test_accuracy': test_acc,
            'train_loss': tl, 'val_loss': vl, 'test_loss': test_loss,
            'precision': prec, 'recall': rec, 'f1': f1,
            'balanced_accuracy': bal_acc, 'specificity': spec, 'sensitivity': sens
        })

        # --- Save confusion matrix ---
        # The file stem encodes the run, e.g. "dep3_ite50" or "depNone_ite800".
        fname = f"dep{dep}_ite{ite}"
        cm_path = os.path.join(cm_dir, fname + ".png")
        plt.figure(figsize=(4,4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted'); plt.ylabel('Actual')
        plt.title(f"CM – depth={dep}, iterations={ite}")
        # The figure is closed after saving so the 25 runs do not accumulate open figures.
        plt.tight_layout(); plt.savefig(cm_path); plt.close()

        # --- Save random sample of 15 test predictions ---
        sample = df_test.copy()
        # Map the predicted integer codes back to their class names.
        sample['predicted_clearance_status'] = [labels[p] for p in preds_test]
        # Move the prediction column so it sits directly after the actual label.
        cols = list(sample.columns)
        cols.insert(cols.index('clearance_status')+1, cols.pop(cols.index('predicted_clearance_status')))
        # NOTE(review): sample() gets no random_state, so the rows drawn
        # presumably depend on the state of NumPy's global RNG at this point.
        sample[cols].sample(15).to_csv(
            os.path.join(test_pred_dir, fname + "_predicted.csv"), index=False
        )

        print(f"Train Acc={ta:.4f} | Test Acc={test_acc:.4f}")
        print(f"Saved CM at {cm_path}, preds at {os.path.join(test_pred_dir, fname + '_predicted.csv')}")

# --- Save summary ---
# One row per grid point, written as a single Excel sheet in perf_dir.
df_all = pd.DataFrame(all_results)
summary_path = os.path.join(perf_dir, "HistGradientBoosting_all_results.xlsx")
df_all.to_excel(summary_path, index=False)
print(f"\nAll hyperparameter runs complete. Summary at {summary_path}")



Split sizes:
  X_train (93870, 36) y_train (93870,)
  X_val (23468, 36) y_val (23468,)
  X_test (10426, 36) y_test (10426,)

Running: 50 iterations for depth 3
  preds_train: (93870,) y_train: (93870,) probs_train: (93870, 2)
  preds_val: (23468,) y_val: (23468,) probs_val: (23468, 2)
  preds_test: (10426,) y_test: (10426,) probs_test: (10426, 2)
Train Acc=0.8831 | Test Acc=0.8603
Saved CM at /content/drive/MyDrive/Big Data Analysis and Project - a1906525/Results/HistGradientBoosting/Confusion Matrices/dep3_ite50.png, preds at /content/drive/MyDrive/Big Data Analysis and Project - a1906525/Results/HistGradientBoosting/Test Results/dep3_ite50_predicted.csv

Running: 100 iterations for depth 3
  preds_train: (93870,) y_train: (93870,) probs_train: (93870, 2)
  preds_val: (23468,) y_val: (23468,) probs_val: (23468, 2)
  preds_test: (10426,) y_test: (10426,) probs_test: (10426, 2)
Train Acc=0.8897 | Test Acc=0.8658
Saved CM at /content/drive/MyDrive/Big Data Analysis and Project - a190652