In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

Preprocessing the data: each raw `Crm Cd Desc` value (the type of crime committed) is mapped to a broader crime category, and incidents are then counted per `AREA NAME`, week start, and crime category.

In [None]:
# Mount Google Drive in the Colab runtime; the dataset CSV is read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/HighPerformanceMachineLearning/CrimeDatafrom2020toPresent.csv')

# Convert the date column to datetime
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'])

# Create a new column for week start (Monday of the week).
# 'W' periods end on Sunday, so each period's start_time is the preceding Monday at 00:00.
# The vectorised `.dt.start_time` accessor replaces a Python-level lambda that was
# called once per row, and it also copes with missing dates.
df['WEEK_START'] = df['DATE OCC'].dt.to_period('W').dt.start_time

# -----------------------
# 🔹 Group Crime Categories
# -----------------------
# Raw 'Crm Cd Desc' values, listed under the broad category they roll up to.
# Add further descriptions (or whole categories) here as needed.
_descriptions_by_category = {
    'Burglary': [
        'BURGLARY',
        'BURGLARY FROM VEHICLE',
        'BURGLARY, ATTEMPTED',
        'BURGLARY FROM VEHICLE, ATTEMPTED',
    ],
    'Theft': [
        'SHOPLIFTING - PETTY THEFT ($950 & UNDER)',
        'SHOPLIFTING - ATTEMPT',
        'SHOPLIFTING-GRAND THEFT ($950.01 & OVER)',
        'THEFT PLAIN - PETTY ($950 & UNDER)',
        'THEFT, PERSON',
        'PICKPOCKET',
        'PETTY THEFT - AUTO REPAIR',
        'BUNCO, PETTY THEFT',
        'BUNCO, GRAND THEFT',
        'EMBEZZLEMENT, GRAND THEFT ($950.01 & OVER)',
        'EMBEZZLEMENT, PETTY THEFT ($950 & UNDER)',
        'THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)',
        'THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)',
        'THEFT OF IDENTITY',
        'THEFT PLAIN - ATTEMPT',
    ],
    'Assault': [
        'BATTERY - SIMPLE ASSAULT',
        'BATTERY POLICE (SIMPLE)',
        'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
        'INTIMATE PARTNER - AGGRAVATED ASSAULT',
        'INTIMATE PARTNER - SIMPLE ASSAULT',
    ],
    'Vehicle Theft': [
        'VEHICLE - STOLEN',
        'VEHICLE - ATTEMPT STOLEN',
        'DRIVING WITHOUT OWNER CONSENT (DWOC)',
    ],
    'Sexual Offense': [
        'RAPE, FORCIBLE',
        'RAPE, ATTEMPTED',
        'ORAL COPULATION',
        'SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH',
        'SEXUAL PENETRATION W/FOREIGN OBJECT',
        'LEWD/LASCIVIOUS ACTS WITH CHILD',
        'SEX,UNLAWFUL(INC MUTUAL CONSENT, PENETRATION W/ FRGN OBJ',
    ],
    'Homicide': [
        'CRIMINAL HOMICIDE',
        'MANSLAUGHTER, NEGLIGENT',
    ],
    'Vandalism': [
        'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)',
        'VANDALISM - MISDEAMEANOR ($399 OR UNDER)',
    ],
    'Robbery': [
        'ROBBERY',
        'ATTEMPTED ROBBERY',
    ],
    'Arson': [
        'ARSON',
    ],
    'Threats': [
        'CRIMINAL THREATS - NO WEAPON DISPLAYED',
        'THREATENING PHONE CALLS/LETTERS',
    ],
    'Weapons': [
        'WEAPONS POSSESSION/BOMBING',
        'BRANDISH WEAPON',
    ],
    'Fraud': [
        'CREDIT CARDS, FRAUD USE ($950 & UNDER',
        'CREDIT CARDS, FRAUD USE ($950.01 & OVER)',
        'DOCUMENT FORGERY / STOLEN FELONY',
    ],
}

# Flatten the grouping into the lookup table: raw description -> broad category.
crime_category_map = {
    description: category
    for category, descriptions in _descriptions_by_category.items()
    for description in descriptions
}

# Label every incident with its broad category; descriptions missing from the
# lookup table fall into 'Other'.
df['Crime Category'] = df['Crm Cd Desc'].map(crime_category_map).fillna('Other')

# -----------------------
# 🔹 Weekly Grouped & Pivot
# -----------------------
# Long format: one row per (area, week, category) holding the number of incidents.
_incident_counts = df.groupby(['AREA NAME', 'WEEK_START', 'Crime Category']).size()
weekly_grouped = _incident_counts.reset_index(name='count')

# Wide format: one row per (area, week), one column per category (0 where a
# category had no incidents), plus a row total, with the keys restored as columns.
weekly_pivot = (
    pd.pivot_table(
        weekly_grouped,
        values='count',
        index=['AREA NAME', 'WEEK_START'],
        columns='Crime Category',
        fill_value=0,
    )
    .assign(**{'Total Crimes': lambda counts: counts.sum(axis=1)})
    .reset_index()
)

Creating our Dataset class

In [None]:
import torch
from torch.utils.data import Dataset

class CrimeSequenceDataset(Dataset):
    """Sliding-window sequences of weekly feature rows, built separately per area.

    For every area, rows are ordered by ``WEEK_START`` and each run of
    ``input_window`` consecutive rows becomes one input; the row that follows it
    becomes the target. Inputs keep their raw values, while targets are stored
    as ``log1p`` of the raw values.

    Args:
        df: DataFrame with ``'AREA NAME'`` and ``'WEEK_START'`` columns; every
            other column is treated as a numeric feature.
        input_window: Number of consecutive rows in each input sequence.

    Raises:
        ValueError: If ``input_window`` is smaller than 1.
    """

    def __init__(self, df, input_window=4):
        # A window of 0 (or less) would silently yield empty or wrap-around slices.
        if input_window < 1:
            raise ValueError(f"input_window must be at least 1, got {input_window}")

        self.input_window = input_window
        self.areas = df['AREA NAME'].unique()
        self.inputs = []
        self.targets = []
        self.feature_cols = [col for col in df.columns if col not in ['AREA NAME', 'WEEK_START']]

        # Build sequences per area. A single groupby pass replaces one full-frame
        # boolean mask per area; sort=False keeps areas in first-appearance order,
        # matching the order of `self.areas`.
        for _, area_df in df.groupby('AREA NAME', sort=False):
            feature_data = area_df.sort_values('WEEK_START')[self.feature_cols].values

            # Areas with no more than `input_window` rows contribute no samples.
            for start in range(len(feature_data) - input_window):
                stop = start + input_window
                self.inputs.append(feature_data[start:stop])
                self.targets.append(np.log1p(feature_data[stop]))  # Apply log1p to target

    def __len__(self):
        """Return the number of (input, target) pairs across all areas."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return pair ``idx`` as float32 tensors: ``(input_window, n_features)`` and ``(n_features,)``."""
        x = torch.tensor(self.inputs[idx], dtype=torch.float32)
        y = torch.tensor(self.targets[idx], dtype=torch.float32)
        return x, y



Instantiate the Dataset

In [None]:
# Replace 'weekly_pivot' with your DataFrame if it is named differently
input_window = 4  # number of consecutive weeks fed to the model per sample
dataset = CrimeSequenceDataset(weekly_pivot, input_window = input_window)

Creating an LSTM model utilizing PyTorch

In [None]:
import torch
import torch.nn as nn

class CrimeLSTMModel(nn.Module):
    """LSTM followed by a linear head that predicts one vector per input sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values predicted per sequence.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between stacked LSTM layers.
            It is ignored when ``num_layers == 1``, because there is no
            inter-layer connection to apply it to.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0.2):
        super().__init__()
        # nn.LSTM only applies dropout between layers; passing a non-zero value
        # with a single layer has no effect and raises a UserWarning, so it is
        # forwarded only when layers are actually stacked.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, output_size)."""
        out, _ = self.lstm(x)   # (batch, seq_len, hidden_size)
        out = out[:, -1, :]     # keep only the last time step's hidden output
        return self.fc(out)     # final prediction

CUDA Training Loop in PyTorch
This loop will:
* Send inputs and targets to GPU (cuda)
* Train using MSELoss
* Track training loss over epochs


In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

import time

# === Setup ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

# Auto-get input/output sizes from sample
sample_x, _ = dataset[0]
input_size = sample_x.shape[1]
output_size = input_size  # the model outputs a vector as wide as one input time step
hidden_size = 128
batch_size = 64
epochs = 300

# === DataLoader ===
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# === Loss Function ===
criterion = nn.MSELoss()


def _synced_time():
    """Return the wall-clock time once queued CUDA work has finished.

    CUDA kernels are launched asynchronously, so the clock is only read after a
    synchronize when running on a GPU.
    """
    if device.type == "cuda":
        torch.cuda.synchronize()
    return time.time()


def _train_model(model, optimizer, tag):
    """Train `model` over `train_loader` for `epochs` epochs.

    Args:
        model: Module already moved to `device`.
        optimizer: Optimizer built over `model`'s parameters.
        tag: Label printed in square brackets in front of each epoch's loss line.
    """
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample mean even
            # when the last batch is smaller.
            running_loss += loss.item() * x_batch.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"[{tag}] Epoch {epoch+1}/{epochs} Loss: {epoch_loss:.4f}")


# ================================
# 🔁 Train Parallel Model with Timing
# ================================
print("\n🔁 Training Parallel Model")
model_parallel = CrimeLSTMModel(input_size, hidden_size, output_size)

if torch.cuda.device_count() > 1:
    print("🚀 Using", torch.cuda.device_count(), "GPUs with DataParallel")
    model_parallel = nn.DataParallel(model_parallel)

model_parallel = model_parallel.to(device)
optimizer_parallel = optim.Adam(model_parallel.parameters(), lr=0.001)

start_time = _synced_time()  # ⏱️ Start timing
_train_model(model_parallel, optimizer_parallel, "Parallel")
end_time = _synced_time()  # ⏱️ Stop timing
print(f"⏱️ Total training time for Parallel Model: {end_time - start_time:.2f} seconds")

# ================================
# 🔁 Train Non-Parallel Model with Timing
# ================================
print("\n🔁 Training Non-Parallel Model")
model_sequential = CrimeLSTMModel(input_size, hidden_size, output_size).to(device)
optimizer_seq = optim.Adam(model_sequential.parameters(), lr=0.001)

start_time = _synced_time()  # ⏱️ Start timing
_train_model(model_sequential, optimizer_seq, "Sequential")
end_time = _synced_time()  # ⏱️ Stop timing
print(f"⏱️ Total training time for Non-Parallel Model: {end_time - start_time:.2f} seconds")


✅ Using device: cuda

🔁 Training Parallel Model
[Parallel] Epoch 1/300 Loss: 1.5727
[Parallel] Epoch 2/300 Loss: 0.1575
[Parallel] Epoch 3/300 Loss: 0.1485
[Parallel] Epoch 4/300 Loss: 0.1453
[Parallel] Epoch 5/300 Loss: 0.1438
[Parallel] Epoch 6/300 Loss: 0.1423
[Parallel] Epoch 7/300 Loss: 0.1414
[Parallel] Epoch 8/300 Loss: 0.1410
[Parallel] Epoch 9/300 Loss: 0.1401
[Parallel] Epoch 10/300 Loss: 0.1394
[Parallel] Epoch 11/300 Loss: 0.1391
[Parallel] Epoch 12/300 Loss: 0.1385
[Parallel] Epoch 13/300 Loss: 0.1379
[Parallel] Epoch 14/300 Loss: 0.1375
[Parallel] Epoch 15/300 Loss: 0.1375
[Parallel] Epoch 16/300 Loss: 0.1373
[Parallel] Epoch 17/300 Loss: 0.1365
[Parallel] Epoch 18/300 Loss: 0.1368
[Parallel] Epoch 19/300 Loss: 0.1362
[Parallel] Epoch 20/300 Loss: 0.1362
[Parallel] Epoch 21/300 Loss: 0.1357
[Parallel] Epoch 22/300 Loss: 0.1353
[Parallel] Epoch 23/300 Loss: 0.1356
[Parallel] Epoch 24/300 Loss: 0.1356
[Parallel] Epoch 25/300 Loss: 0.1358
[Parallel] Epoch 26/300 Loss: 0.1357

Next step: predict the next week of crimes for each AREA NAME. We'll:
* Grab the last N weeks for each AREA
* Predict the next week's crime count vector
* Optionally visualize the output

In [None]:
import numpy as np

def predict_next_week(model, df, input_window=4):
    """Predict the next row of feature values for every area in ``df``.

    For each area, the last ``input_window`` rows (ordered by ``WEEK_START``)
    are fed to ``model``. The output is treated as log1p-scaled: it is mapped
    back with ``expm1``, clamped at zero and rounded to integers.

    Args:
        model: Module mapping a (1, input_window, n_features) float tensor to
            a (1, n_features) tensor.
        df: DataFrame with ``'AREA NAME'`` and ``'WEEK_START'`` columns; every
            other column is used as a feature.
        input_window: Number of most recent rows fed to the model per area.

    Returns:
        dict mapping each area name to a ``{feature column: predicted count}``
        dict. Areas with fewer than ``input_window`` rows are left out.
    """
    model.eval()

    # Run on whatever device the model lives on rather than assuming CUDA, so
    # the function also works on CPU-only machines. Fall back to the CPU for a
    # model that has no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    feature_cols = [col for col in df.columns if col not in ['AREA NAME', 'WEEK_START']]
    predictions = {}

    with torch.no_grad():
        # sort=False keeps the areas in first-appearance order.
        for area, area_df in df.groupby('AREA NAME', sort=False):
            recent_data = area_df.sort_values('WEEK_START')[feature_cols].values[-input_window:]

            # Not enough history for this area to fill one input window.
            if len(recent_data) < input_window:
                continue

            x = torch.tensor(recent_data, dtype=torch.float32).unsqueeze(0).to(device)
            y_pred = model(x).cpu().numpy().flatten()

            y_pred = np.expm1(y_pred)              # Convert back from log-space
            y_pred = np.clip(y_pred, 0, None)      # Clamp to avoid negatives
            y_pred = y_pred.round().astype(int)    # Round to integers

            predictions[area] = dict(zip(feature_cols, y_pred))

    return predictions


In [None]:
# Forecast with the trained (non-DataParallel) model. The notebook never defines
# a bare `model`, so the original call raised NameError on a fresh kernel.
# The window is the `input_window` the dataset was built with, so inference
# sequences have the same length as the training sequences.
crime_forecast = predict_next_week(model_sequential, weekly_pivot, input_window=input_window)

# Print the forecast for every area
for area, pred in crime_forecast.items():
    print(f'\n {area}')
    for crime, count in pred.items():
        print(f' - {crime}: {count}')


 77th Street
 - Arson: 0
 - Assault: 0
 - Burglary: 1
 - Fraud: 0
 - Homicide: 0
 - Other: 0
 - Robbery: 0
 - Sexual Offense: 0
 - Theft: 2
 - Threats: 0
 - Vandalism: 1
 - Vehicle Theft: 1
 - Weapons: 0
 - Total Crimes: 5

 Central
 - Arson: 0
 - Assault: 0
 - Burglary: 13
 - Fraud: 0
 - Homicide: 0
 - Other: 3
 - Robbery: 0
 - Sexual Offense: 0
 - Theft: 7
 - Threats: 0
 - Vandalism: 5
 - Vehicle Theft: 1
 - Weapons: 0
 - Total Crimes: 21

 Devonshire
 - Arson: 0
 - Assault: 0
 - Burglary: 1
 - Fraud: 0
 - Homicide: 0
 - Other: 1
 - Robbery: 0
 - Sexual Offense: 0
 - Theft: 1
 - Threats: 0
 - Vandalism: 0
 - Vehicle Theft: 1
 - Weapons: 0
 - Total Crimes: 5

 Foothill
 - Arson: 0
 - Assault: 0
 - Burglary: 1
 - Fraud: 0
 - Homicide: 0
 - Other: 0
 - Robbery: 0
 - Sexual Offense: 0
 - Theft: 1
 - Threats: 0
 - Vandalism: 0
 - Vehicle Theft: 1
 - Weapons: 0
 - Total Crimes: 4

 Harbor
 - Arson: 0
 - Assault: 0
 - Burglary: 3
 - Fraud: 0
 - Homicide: 0
 - Other: 1
 - Robbery: 0
 - Sexu