In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


In [6]:

# Load data
file_path = '/content/crime_2025.csv'
df = pd.read_csv(file_path)

# Drop rows with missing values initially
df.dropna(inplace=True)

In [7]:
# Convert Month_Year to datetime and extract components
df['Month_Year'] = pd.to_datetime(df['Month_Year'], format='%d/%m/%Y', errors='coerce')
df['Year'] = df['Month_Year'].dt.year
df['Month'] = df['Month_Year'].dt.month
df['Day'] = df['Month_Year'].dt.day

# Function to process Financial Year and FY Index
def process_fy_fyindex(fy_index):
    match = re.match(r'(\d{2}-\d{2})_(\d+)', str(fy_index))
    if match:
        fy_raw = match.group(1)
        index = int(match.group(2))
        fy_match = re.match(r'(\d{2})-(\d{2})', fy_raw)
        if fy_match:
            fy_start = int("20" + fy_match.group(1))
            fy_end = int("20" + fy_match.group(2))
            return f"{fy_start}-{fy_end}", fy_start, fy_end, index
    return None, None, None, None

In [8]:
# Apply FY processing
df[['Financial Year Cleaned', 'FY_Start', 'FY_End', 'FY_Index']] = df['FY_FYIndex'].apply(
    lambda x: pd.Series(process_fy_fyindex(x))
)

# Convert categorical columns
categorical_columns = ['Area Type', 'Borough_SNT', 'Area name', 'Area code',
                       'Offence Group', 'Offence Subgroup', 'Measure']
for col in categorical_columns:
    df[col] = df[col].astype(str)  # Ensure type is consistent for encoding

# Store encoders for decoding later
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Ensure Count is numeric
df['Count'] = pd.to_numeric(df['Count'], errors='coerce')
df.dropna(subset=['Count'], inplace=True)  # Drop if Count couldn't be converted

In [9]:
# Remove duplicates and reset index
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

# Drop unnecessary columns, including 'FY_FYIndex'
drop_columns = ["Month_Year", "Refresh Date", "Financial Year Cleaned", "FY_Start", "FY_End", "FY_Index", "FY_FYIndex"]
df.drop(columns=drop_columns, inplace=True, errors='ignore')

# Create date-based features
df["Date"] = pd.to_datetime(df[["Year", "Month", "Day"]])
df["Year_Month"] = df["Year"] + df["Month"] / 12.0

# Define features and target
X = df.drop(columns=["Count", "Date"])
y = df["Count"]


# Explicitly select only numerical features for scaling
X = X.select_dtypes(include=['number'])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (no shuffle to simulate time series)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, shuffle=False)


In [10]:
# --- KNN MODEL ---
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
print("KNN MAE:", mean_absolute_error(y_test, y_pred_knn))
print("KNN R² Score:", r2_score(y_test, y_pred_knn))

KNN MAE: 5.813739426429623
KNN R² Score: 0.46969322944831593


In [11]:
# --- RANDOM FOREST MODEL ---
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest R² Score:", r2_score(y_test, y_pred_rf))


Random Forest R² Score: 0.9470105863706114


In [12]:
# --- FUTURE PREDICTION FOR 2025 ---
latest_year = df["Year"].max()
future_data = df[df["Year"] == latest_year].copy()
future_data["Year"] = 2025

# Select only numerical features for future data as well
future_X = future_data.drop(columns=["Count", "Date"]).select_dtypes(include=['number'])

future_X = scaler.transform(future_X)

# Predict using Random Forest
future_predictions = rf.predict(future_X)
future_data["Predicted Crime Count"] = future_predictions

# Decode categorical columns
decoded_future_data = future_data.copy()
for col, le in label_encoders.items():
    if col in decoded_future_data.columns:
        decoded_future_data[col] = le.inverse_transform(decoded_future_data[col].astype(int))

# Add Year-Month
decoded_future_data["Year-Month"] = decoded_future_data["Year"].astype(str) + '-' + decoded_future_data["Month"].astype(str).str.zfill(2)

# Output columns
output_columns = ['Year', 'Month', 'Year-Month', 'Area name', 'Area Type',
                  'Borough_SNT', 'Area code', 'Offence Group', 'Predicted Crime Count']

# Print and save
print("\n--- Predicted Crime Data for 2025 ---")
print(decoded_future_data[output_columns].to_string(index=False))

# Save to CSV
output_filename = "/content/future_crime_predictions_2025.csv"
decoded_future_data[output_columns].to_csv(output_filename, index=False)

# Enable download in Colab
from google.colab import files
files.download(output_filename)


--- Predicted Crime Data for 2025 ---


In [18]:
# Decode Borough_SNT if it's still encoded
df['Borough_SNT_Decoded'] = label_encoders['Borough_SNT'].inverse_transform(df['Borough_SNT'])

# Group by decoded Borough_SNT and sum the Count
crime_by_borough = df.groupby('Borough_SNT_Decoded')['Count'].sum().sort_values(ascending=False)

# Display top Borough
print("Borough with the most crimes:")
print(crime_by_borough.head(1))


Borough with the most crimes:
Borough_SNT_Decoded
Westminster    326808
Name: Count, dtype: int64


In [19]:
# Save the current notebook (optional but safe)
import IPython
IPython.display.display(IPython.display.Javascript('IPython.notebook.save_checkpoint();'))


<IPython.core.display.Javascript object>

In [20]:
!jupyter nbconvert --to html "/content/prediction.ipynb"


[NbConvertApp] Converting notebook /content/prediction.ipynb to html
[NbConvertApp] Writing 310870 bytes to /content/prediction.html
