In [1]:
import torch
import time
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import multiprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 🚀 Enable GPU & Optimizations
# Train on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Switch cuDNN on explicitly and turn on its benchmark mode.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

# Load dataset
# Location of the raw crime CSV (a mounted Google Drive path).
DATA_PATH = '/content/drive/MyDrive/HighPerformanceMachineLearning/CrimeDatafrom2020toPresent.csv'
df = pd.read_csv(DATA_PATH)

# Convert DATE OCC to datetime format. errors='coerce' turns unparseable
# values into NaT; the rows affected are removed by the dropna() below.
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'], errors='coerce')

# Extract time features
df['Year_OCC'] = df['DATE OCC'].dt.year
df['Month_OCC'] = df['DATE OCC'].dt.month
df['Day_OCC'] = df['DATE OCC'].dt.day
df['Hour_OCC'] = df['TIME OCC'] // 100  # Convert HHMM to hours

# Columns fed to the model.
features = ['Year_OCC', 'Month_OCC', 'Day_OCC', 'Hour_OCC', 'LAT', 'LON']

# Drop rows with a missing model input. StandardScaler passes NaN through
# unchanged, so a single NaT date or missing coordinate would otherwise reach
# the training tensors and turn the loss (and then the weights) into NaN.
# Rows removed here are also excluded from the per-location counts below.
df = df.dropna(subset=features)

# Count crimes per location. `crime_counts` is the per-location table;
# transform('size') broadcasts the same count back onto every row without a
# Python-level lookup per row.
location_groups = df.groupby(['LAT', 'LON'])
crime_counts = location_groups.size()
df['Crime_Risk'] = location_groups['LAT'].transform('size')

# Assign Crime Risk Scores (1-10) relative to the busiest location, then
# halve them onto a 1-5 scale.
df['Crime_Risk'] = np.ceil(10 * (df['Crime_Risk'] / df['Crime_Risk'].max())).astype(int)
df['Crime_Risk'] = df['Crime_Risk'].clip(1, 10)
df['Crime_Risk'] = np.ceil(df['Crime_Risk'] / 2).astype(int)

# Normalize Crime_Risk to 0-1 (1 -> 0.0, 5 -> 1.0) to match a sigmoid output.
df['Crime_Risk'] = (df['Crime_Risk'] - 1) / 4

# Normalize features
# NOTE(review): the scaler is fitted on the full frame before the split. The
# held-out part is discarded below, so nothing reported here is affected, but
# fit on the training split only if the held-out rows are ever evaluated.
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Train-Test Split (only the training part is used)
train_df, _ = train_test_split(df, test_size=0.2, random_state=42)

# Define Dataset
class CrimeDatasetRegression(Dataset):
    """In-memory regression dataset built from a pandas DataFrame.

    The selected feature columns and the target column are converted to
    float32 tensors once, at construction time; indexing then only slices
    those tensors.

    Args:
        data: DataFrame holding the feature columns and the target column.
        feature_cols: Names of the input columns. When omitted, the
            module-level ``features`` list is used, which is the original
            behaviour of this class.
        target_col: Name of the regression target column.
    """

    def __init__(self, data, feature_cols=None, target_col='Crime_Risk'):
        if feature_cols is None:
            # Backward-compatible default: rely on the notebook-level list.
            feature_cols = features
        self.x = torch.tensor(data[list(feature_cols)].values, dtype=torch.float32)
        # unsqueeze(1) gives the target shape (N, 1) instead of (N,).
        self.y = torch.tensor(data[target_col].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, target)`` tensor pair at ``idx``."""
        return self.x[idx], self.y[idx]

# Define Regression Model
class CrimeRiskRegression(nn.Module):
    """Feed-forward regressor: two ReLU hidden layers and a sigmoid output.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of both hidden layers.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Submodules are registered in the order they are applied in forward().
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one sigmoid-squashed prediction per input row."""
        hidden = self.relu1(self.layer1(x))
        hidden = self.relu2(self.layer2(hidden))
        logits = self.output_layer(hidden)
        return torch.sigmoid(logits)

# 🔁 Parallel Experiment Setup
num_worker_settings = [0, 2, 4, 8]  # Different levels of parallelism
batch_sizes = [128, 256, 512]  # Larger batches to reduce iterations
hidden_dim = 256
results = []

# Fewer epochs for larger batch sizes, to keep each run short.
epoch_map = {128: 10, 256: 7, 512: 5}

# The training tensors are identical for every configuration, so build the
# dataset once, outside the timed region.
train_dataset = CrimeDatasetRegression(train_df)

# Page-locked host memory only helps host-to-GPU copies. It is tied to the
# device rather than to the worker count so that the num_workers=0 baseline
# differs from the other runs in num_workers only.
use_cuda = device.type == "cuda"

# Loop for Different Parallelism Configurations
for num_workers in num_worker_settings:
    for batch_size in batch_sizes:
        epochs = epoch_map[batch_size]  # Dynamically set epochs
        start_time = time.perf_counter()

        # Init model, loss, optimizer (fresh for every configuration)
        model = CrimeRiskRegression(input_dim=6, hidden_dim=hidden_dim).to(device)
        criterion = nn.HuberLoss(delta=1.0)
        optimizer = optim.Adam(model.parameters(), lr=0.005)

        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True,
            num_workers=num_workers, pin_memory=use_cuda
        )

        # Training Loop
        total_loss = 0    # mean per-sample loss of the most recent epoch
        sample_count = 0  # samples processed across all epochs of this run
        for epoch in range(epochs):
            model.train()
            epoch_loss = 0
            epoch_samples = 0
            for x_batch, y_batch in train_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                optimizer.zero_grad()
                outputs = model(x_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                batch_n = x_batch.size(0)
                # criterion returns a batch mean; re-weight by batch size so a
                # smaller final batch does not skew the epoch average.
                epoch_loss += loss.item() * batch_n
                epoch_samples += batch_n
            sample_count += epoch_samples
            # Divide by this epoch's samples only. Dividing by the cumulative
            # count would shrink the reported loss by roughly 1/epochs.
            total_loss = epoch_loss / epoch_samples

        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        if use_cuda:
            torch.cuda.synchronize()
        elapsed_time = time.perf_counter() - start_time

        results.append((
            batch_size, num_workers,
            hidden_dim, round(elapsed_time, 3), round(total_loss, 6),
            sample_count
        ))

# Save Results
df_results = pd.DataFrame(results, columns=[
    "Batch Size", "Workers", "Hidden Dim", "Time (s)",
    "Avg Loss (Last Epoch)", "Samples"
])
df_results.to_csv("performance_results.csv", index=False)

print("✅ All experiments completed. Results saved to 'performance_results.csv'.")




✅ All experiments completed. Results saved to 'performance_results.csv'.


In [2]:
from google.colab import files

# The experiment cell writes "performance_results.csv" relative to the
# working directory, so download it from the same relative path instead of
# assuming the working directory is /content.
files.download("performance_results.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>