<a href="https://colab.research.google.com/github/burgerhaley97/Crime-Location-Prediction-Project/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Import/Preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import io

In [None]:
# Mount the user's Google Drive inside the Colab VM so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded, user-specific Drive path; the relative CSV read in a later cell depends on it.
%cd /content/drive/MyDrive/Colab Notebooks/6655_HWs/Project

In [None]:
#from google.colab import files
#uploaded = files.upload()

In [None]:
import pandas as pd

# Import csv
# Read the raw incident table from the current working directory and preview the first rows.
df = pd.read_csv('CrimeData.csv')
df.head()

In [None]:
# Remove the columns that the rest of the notebook does not use.
df = df.drop(
    columns=[
        'Apartment Number', 'x', 'y', 'IDCol', 'Report Number', 'ObjectId',
        'Day Occurred', 'Possible Date', 'Possible Time', 'NIBRS Code',
    ]
)
df.head()

In [None]:
'''
Look at missing values per variable
'''
import matplotlib.pyplot as plt

# Number of nulls in every column; keep only columns with at least one, largest first.
null_totals = df.isna().sum()
missing_counts = null_totals.loc[null_totals > 0].sort_values(ascending=False)

# Bar chart of the remaining columns.
fig, ax = plt.subplots(figsize=(10, 6))
missing_counts.plot(kind='bar', ax=ax)
ax.set_title('Missing Values per Column')
ax.set_ylabel('Number of Missing Values')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
# Preview the first records whose Neighborhood value is null.
df.loc[df['Neighborhood'].isna()].head()


In [None]:
'''
Impute missing neighborhood values based on proximity to other neighborhood
centroids.
'''
# Reference table: only the records that already carry a Neighborhood label.
known_locations = df[df['Neighborhood'].notna()]

# One centroid (mean latitude / longitude) per neighborhood.
neighborhood_centroids = (
    known_locations
    .groupby('Neighborhood')[['Latitude', 'Longitude']]
    .mean()
)



In [None]:
from scipy.spatial.distance import cdist
import numpy as np

def impute_neighborhood(lat, lon, centroids):
    """Return the index label of the centroid nearest to (lat, lon).

    Parameters
    ----------
    lat, lon : float
        Coordinates of the record to label.
    centroids : pandas.DataFrame
        Frame with 'Latitude' and 'Longitude' columns whose index holds the
        labels to choose from.

    Returns
    -------
    The index label of the closest centroid (Euclidean distance in raw
    latitude/longitude), or ``np.nan`` when the point has a missing
    coordinate or no centroid has usable coordinates.
    """
    # A NaN coordinate makes every distance NaN; argmin would then return 0
    # and silently assign the first label, so bail out explicitly instead.
    if pd.isna(lat) or pd.isna(lon):
        return np.nan

    coords = centroids[['Latitude', 'Longitude']].to_numpy(dtype=float)
    usable = ~np.isnan(coords).any(axis=1)  # skip centroids with NaN means
    if not usable.any():
        return np.nan

    distances = cdist([[lat, lon]], coords[usable])
    return centroids.index[usable][np.argmin(distances)]


In [None]:
# Rows whose Neighborhood is missing (mask computed once and reused for the write-back)
missing_mask = df['Neighborhood'].isna()
missing_rows = df[missing_mask]

# Apply the imputation function.
# Guarded because DataFrame.apply(axis=1) on an empty frame does not return a
# Series of labels, so there is nothing valid to assign when no row is missing.
if not missing_rows.empty:
    df.loc[missing_mask, 'Neighborhood'] = missing_rows.apply(
        lambda row: impute_neighborhood(row['Latitude'], row['Longitude'], neighborhood_centroids),
        axis=1
    )


In [None]:
# Count of records that still have no Neighborhood.
n_missing_neighborhood = df['Neighborhood'].isna().sum()
print("Still missing Neighborhoods:", n_missing_neighborhood)

In [None]:
'''
Impute missing values for NPU in a similar way since these are another type
of neighborhood unit.
'''

In [None]:
# Reference table: only the records that already carry an NPU label.
known_npu = df[df['NPU'].notna()]

# One centroid (mean latitude / longitude) per NPU.
npu_centroids = (
    known_npu
    .groupby('NPU')[['Latitude', 'Longitude']]
    .mean()
)


In [None]:
from scipy.spatial.distance import cdist
import numpy as np

def impute_npu(lat, lon, centroids):
    """Return the index label of the NPU centroid nearest to (lat, lon).

    Parameters
    ----------
    lat, lon : float
        Coordinates of the record to label.
    centroids : pandas.DataFrame
        Frame with 'Latitude' and 'Longitude' columns whose index holds the
        labels to choose from.

    Returns
    -------
    The index label of the closest centroid (Euclidean distance in raw
    latitude/longitude), or ``np.nan`` when the point has a missing
    coordinate or no centroid has usable coordinates.
    """
    # With a NaN coordinate every distance is NaN and argmin returns 0, which
    # would silently assign the first label; report "unknown" instead.
    if pd.isna(lat) or pd.isna(lon):
        return np.nan

    coords = centroids[['Latitude', 'Longitude']].to_numpy(dtype=float)
    usable = ~np.isnan(coords).any(axis=1)  # skip centroids with NaN means
    if not usable.any():
        return np.nan

    distances = cdist([[lat, lon]], coords[usable])
    return centroids.index[usable][np.argmin(distances)]


In [None]:
# Rows whose NPU is missing (mask computed once and reused for the write-back)
missing_npu_mask = df['NPU'].isna()
missing_npu_rows = df[missing_npu_mask]

# Impute using nearest centroid.
# Guarded because DataFrame.apply(axis=1) on an empty frame does not return a
# Series of labels, so there is nothing valid to assign when no row is missing.
if not missing_npu_rows.empty:
    df.loc[missing_npu_mask, 'NPU'] = missing_npu_rows.apply(
        lambda row: impute_npu(row['Latitude'], row['Longitude'], npu_centroids),
        axis=1
    )


In [None]:
# Count of records that still have no NPU.
n_missing_npu = df['NPU'].isna().sum()
print("Still missing NPUs:", n_missing_npu)

In [None]:
'''
Remove rows that have missing occur dates / times since there are very few.
'''

In [None]:
# Keep only the records where both the occurrence date and time are present.
has_occurrence = df[['Occur Date', 'Occur Time']].notna().all(axis=1)
df = df[has_occurrence]


In [None]:
# Count of records that still have no occurrence date.
n_missing_occur_date = df['Occur Date'].isna().sum()
print("Still missing Occur Date:", n_missing_occur_date)

In [None]:
'''
Create Crime Frequency Feature:
'''

In [None]:
# create the crime frequency per neighborhood feature
# Unparseable dates become NaT (errors='coerce'); groupby ignores them as keys below.
df['Occur Date'] = pd.to_datetime(df['Occur Date'], format='mixed', errors='coerce')
df = df.dropna(subset=['Neighborhood'])

# Make the cell idempotent: without this, re-running it merges a second copy
# of the feature and pandas renames the pair to Avg_Crime_Per_Day_x / _y.
df = df.drop(columns=['Avg_Crime_Per_Day'], errors='ignore')

# Count number of crimes per neighborhood per day
daily_counts = (
    df.groupby(['Neighborhood', 'Occur Date'])
      .size()
      .reset_index(name='Daily_Crime_Count')
)

# Now compute average daily crime per neighborhood.
# Only (neighborhood, date) pairs that appear in the data are counted, so the
# mean is over days with at least one recorded incident.
# NOTE(review): this is computed on the full table before any train/test split.
avg_daily_crime = (
    daily_counts.groupby('Neighborhood')['Daily_Crime_Count']
                .mean()
                .reset_index(name='Avg_Crime_Per_Day')
)
df = df.merge(avg_daily_crime, on='Neighborhood', how='left')
df.head()

In [None]:
# Per-column null totals, printed only for columns that still contain nulls.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])


In [None]:
# Discard records whose occurrence date is null.
df = df[df['Occur Date'].notna()]

In [None]:
# Number of records for which the neighborhood average is null.
n_missing_avg = df['Avg_Crime_Per_Day'].isna().sum()
print("Rows with missing Avg_Crime_Per_Day:", n_missing_avg)


In [None]:
# Flag values that do not parse as HH:MM times / as dates (coerced to NaT).
time_unparseable = pd.to_datetime(df['Occur Time'], format='%H:%M', errors='coerce').isna()
date_unparseable = pd.to_datetime(df['Occur Date'], errors='coerce').isna()

invalid_time_rows = df[time_unparseable]
invalid_date_rows = df[date_unparseable]

# Frequency of each offending raw value.
print("Problematic 'Occur Time' values:")
print(invalid_time_rows['Occur Time'].value_counts())

print("Problematic 'Occur Date' values:")
print(invalid_date_rows['Occur Date'].value_counts())


In [None]:
# Parse Occur Time once; values that are not HH:MM become NaT.
parsed_times = pd.to_datetime(df['Occur Time'], format='%H:%M', errors='coerce')
valid_times = parsed_times.notna()

# Keep only rows with a valid Occur Time. The explicit copy gives df its own
# data, so the column assignments below do not trigger SettingWithCopyWarning.
df = df.loc[valid_times].copy()

# Store the parsed times and derive the calendar features
# (Occur Date was already converted to datetime earlier in the notebook).
df['Occur Time'] = parsed_times.loc[valid_times]
df['hour'] = df['Occur Time'].dt.hour
df['dayofweek'] = df['Occur Date'].dt.dayofweek
df['month'] = df['Occur Date'].dt.month


In [None]:
# Per-column null totals, printed only for columns that still contain nulls.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values > 0])

In [None]:
# Preview the frame after cleaning and feature extraction.
df.head()

# Make New Crime Types

In [None]:

'''
Reclassify into our broader 3 crime types. Do SMOTE on this data.

'''

In [None]:
import pandas as pd

# Broad category -> the raw crime types it absorbs.
category_members = {
    'Theft': ['LARCENY-FROM VEHICLE', 'LARCENY-NON VEHICLE', 'AUTO THEFT'],
    'Burglary': ['BURGLARY'],
    'Violence': ['AGG ASSAULT', 'ROBBERY', 'HOMICIDE'],
}

# Flatten into the raw-type -> category lookup used by Series.map.
crime_mapping = {
    crime_type: category
    for category, members in category_members.items()
    for crime_type in members
}

# New column with the broad class; raw types absent from the lookup map to NaN.
df['Crime Category'] = df['Crime Type'].map(crime_mapping)


In [None]:
# Number of records in each broad crime category.
category_counts = df['Crime Category'].value_counts()
print(category_counts)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal count plot of the broad categories, most frequent first.
category_order = df['Crime Category'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='Crime Category', order=category_order, palette="Set2", ax=ax)
ax.set_title("Frequency of Crime Categories")
ax.set_xlabel("Count")
ax.set_ylabel("Crime Category")
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# 1. Features are everything except the label columns and the raw date/time columns;
#    the target is the broad crime category.
non_feature_columns = ['Crime Type', 'Crime Category', 'Occur Time', 'Occur Date', 'Report Date']
X = df.drop(columns=non_feature_columns)
y = df['Crime Category']

# 2. Class balance before any resampling
print("Original class distribution:")
print(y.value_counts())



In [None]:
# Preview the feature frame before encoding.
X.head()

In [None]:
# 3. Target size for the minority classes: 70% of the Theft count
counts = y.value_counts()
n_theft = counts['Theft']
target_n = int(0.7 * n_theft)  # 70% of Theft

# Desired per-class sample counts for SMOTE.
# An over-sampler rejects a target smaller than the class's current size, so
# a class that already exceeds the 70% mark keeps its existing count.
sampling_strategy = {
    minority: max(target_n, int(counts[minority]))
    for minority in ('Burglary', 'Violence')
}

# 4. Prepare the feature matrix for SMOTE
# One-hot encode the location columns (first level dropped as the reference)
X_encoded = pd.get_dummies(X, columns=['Neighborhood', 'NPU', 'Zone'], drop_first=True)

# 'Location' is not encoded above, so it is dropped explicitly
X_encoded = X_encoded.drop(columns=['Location'])


In [None]:
# Preview the one-hot encoded feature matrix that is passed to SMOTE.
X_encoded.head()

In [None]:
# Oversample the minority classes to the sizes requested in sampling_strategy.
# NOTE(review): resampling runs on the full dataset, before the train/test
# split performed later in the notebook.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=1891)
resampled = smote.fit_resample(X_encoded, y)
X_resampled, y_resampled = resampled

# 5. Class balance after resampling
print("\nAfter SMOTE class distribution:")
resampled_counts = pd.Series(y_resampled).value_counts()
print(resampled_counts)

In [None]:
# Standardize every feature column.
# NOTE(review): the scaler is fitted on all resampled rows, i.e. before the
# train/test split performed later in the notebook.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)

# EDA

In [None]:
# Frequency of each original (fine-grained) crime type.
df['Crime Type'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal count plot of the original crime types, most frequent first.
type_order = df['Crime Type'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='Crime Type', order=type_order, palette="Set2", ax=ax)
ax.set_title("Frequency of Crime Types")
ax.set_xlabel("Count")
ax.set_ylabel("Crime Type")
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Parse the report date and derive the report year.
df['Report Date'] = pd.to_datetime(df['Report Date'])
df['Year'] = df['Report Date'].dt.year

# Incident totals for 2019 and 2020 only, in chronological order.
in_study_years = df['Year'].isin([2019, 2020])
year_counts = df.loc[in_study_years, 'Year'].value_counts().sort_index()

# Bar chart of the yearly totals.
ax = year_counts.plot(kind='bar', color='cornflowerblue', edgecolor='black', figsize=(8, 6))
ax.set_title("Total Crimes by Year (2019 & 2020)")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Crimes")
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Parse the report date and derive the weekday name ('Monday' ... 'Sunday').
df['Report Date'] = pd.to_datetime(df['Report Date'])
df['Day of Week'] = df['Report Date'].dt.day_name()

# Frequency table in calendar order rather than by count.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = df['Day of Week'].value_counts().reindex(weekday_order)

# Show the table, then the same numbers as a bar chart.
print(day_counts.to_frame(name='Total Crimes'))

ax = day_counts.plot(kind='bar', color='mediumpurple', edgecolor='black', figsize=(8, 6))
ax.set_title("Total Crimes by Day of the Week")
ax.set_ylabel("Number of Crimes")
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of incident coordinates, colored by crime type.
# The hue uses the 'Crime Type' column: no cell in this notebook creates a
# 'crime_type_encoded' column, so referencing it fails on a fresh run.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x="Longitude",
    y="Latitude",
    hue="Crime Type",
    palette="Set2",
    s=100,
    edgecolor="black"
)
plt.title("Crime Incidents by Location and Type")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
# Legend is placed outside the axes so it does not cover the points
plt.legend(title="Crime Type", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import folium
from folium.plugins import MarkerCluster
import pandas as pd

# Initialize the base map centered around the average coordinates
m = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=12)

# One color per crime type, keyed by the type name itself. No cell in this
# notebook creates a 'crime_type_encoded' column, so the markers are colored
# from the existing 'Crime Type' column instead.
marker_palette = ['red', 'blue', 'orange', 'purple', 'green', 'brown', 'pink']
crime_colors = {
    crime_type: marker_palette[position % len(marker_palette)]
    for position, crime_type in enumerate(sorted(df['Crime Type'].dropna().unique()))
}

# Add clustered markers colored by crime type.
# Iterating over the three columns directly avoids building a Series per row.
marker_cluster = MarkerCluster().add_to(m)
for lat, lon, crime_type in zip(df['Latitude'], df['Longitude'], df['Crime Type']):
    folium.CircleMarker(
        location=(lat, lon),
        radius=5,
        color=crime_colors.get(crime_type, 'gray'),  # gray for a missing type
        fill=True,
        fill_opacity=0.7,
        popup=str(crime_type)
    ).add_to(marker_cluster)

m


In [None]:
from folium.plugins import HeatMap

# Density heat map built from the incident coordinates alone.
heat_df = df[['Latitude', 'Longitude']]
map_center = [df['Latitude'].mean(), df['Longitude'].mean()]

m = folium.Map(location=map_center, zoom_start=12)
HeatMap(heat_df.to_numpy(), radius=15).add_to(m)
m


In [None]:
import folium
from folium.plugins import HeatMap

# Base map centered on the mean incident coordinates
map_center = [df['Latitude'].mean(), df['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# Heat layer from the raw latitude / longitude pairs
heat_df = df[['Latitude', 'Longitude']]
HeatMap(heat_df.to_numpy(), radius=15).add_to(m)

# Fixed-position HTML box used as a legend for the heat colors
legend_html = """
<div style="
    position: fixed;
    bottom: 50px; left: 50px; width: 180px; height: 90px;
    background-color: white;
    border:2px solid grey;
    z-index:9999;
    font-size:14px;
    padding: 10px;">
    <b>Crime Density</b><br>
    <i style='background: red; width: 10px; height: 10px; display: inline-block;'></i> High<br>
    <i style='background: orange; width: 10px; height: 10px; display: inline-block;'></i> Medium<br>
    <i style='background: blue; width: 10px; height: 10px; display: inline-block;'></i> Low
</div>
"""

legend_element = folium.Element(legend_html)
m.get_root().html.add_child(legend_element)

# Last expression renders the map in the notebook
m


# Try a Balanced Random Forest

In [None]:
'''
Adjust for crazy class imbalance with random oversampling.
'''

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.preprocessing import StandardScaler
# LabelEncoder is used below but was only imported in a later cell, which
# raises NameError when the notebook is run top to bottom.
from sklearn.preprocessing import LabelEncoder

# Initialize encoder
le = LabelEncoder()

# Encode the string categories as integer class ids
y_encoded = le.fit_transform(y_resampled)

# Optional: see mapping of labels to encoded values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)


# Step 1: Split off 15% for the test set
# NOTE(review): X_resampled_scaled was resampled and scaled on the full
# dataset before this split.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_resampled_scaled, y_encoded, test_size=0.15, random_state=1891
)

# Step 2: Split the remaining 85% into train (70%) and validation (15%)
# To get a 15% validation from the remaining 85%, use 0.1765 as the test_size:
# (0.1765 * 0.85 ≈ 0.15 overall validation split)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=1891
)

In [None]:
# Fit the balanced random forest with the chosen hyperparameters
from imblearn.ensemble import BalancedRandomForestClassifier

# Hyperparameters collected in one place, then unpacked into the estimator
brf_params = dict(
    sampling_strategy='auto',
    n_estimators=200,
    min_samples_split=2,
    max_features='sqrt',
    max_depth=10,
    random_state=1891,
    n_jobs=-1,
)
final_model = BalancedRandomForestClassifier(**brf_params)

# Train on the training split; the fitted estimator is the cell's output
final_model.fit(X_train, y_train)



In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out test split, then print the confusion matrix followed by
# the per-class precision / recall / F1 report.
y_pred = final_model.predict(X_test)

for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Encoded id -> category name, taken from the fitted LabelEncoder
encoded_ids = le.transform(le.classes_)
label_mapping = dict(zip(encoded_ids, le.classes_))

# Test-set predictions and the resulting confusion matrix
y_pred = final_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Category names ordered by their encoded id, for the axis ticks
labels = [label_mapping[class_id] for class_id in sorted(label_mapping)]

# Annotated heat map of the confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

# Per-class report using the category names
print(classification_report(y_test, y_pred, target_names=labels))


# NN Model Building/Training

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Encode the string categories as integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y_resampled)

# Show which id each category received
encoded_ids = le.transform(le.classes_)
label_mapping = dict(zip(le.classes_, encoded_ids))
print("Label mapping:", label_mapping)


# Both splits share the same seed
SPLIT_SEED = 1891

# Step 1: hold out 15% of all rows as the test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X_resampled_scaled, y_encoded, test_size=0.15, random_state=SPLIT_SEED
)

# Step 2: carve the validation set out of the remaining 85%.
# 0.1765 of 85% is roughly 15% of the whole, leaving about 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=SPLIT_SEED
)


In [None]:

# Baseline network: two hidden ReLU layers and a 3-way softmax output,
# assembled layer by layer.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

# Integer class ids as targets -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)




In [None]:
# Print the learning rate used by Adam
# The model was compiled with optimizer='adam' and no explicit rate, so this
# shows whatever value the optimizer was created with.
print("Learning rate:", model.optimizer.learning_rate.numpy())


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Compute 'balanced' weights for each class present in the training labels
train_classes = np.unique(y_train)
class_weights_array = compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train
)

# Convert to the {class_id: weight} dict expected by model.fit().
# Keys come from the class ids themselves rather than from enumerate(), so the
# mapping stays correct even if the ids are not exactly 0..n-1.
class_weights = {
    int(class_id): weight
    for class_id, weight in zip(train_classes, class_weights_array)
}

print("Class weights:", class_weights)


In [None]:
# Train and store history.
# Validation uses the dedicated (X_val, y_val) split created earlier instead of
# validation_split, which would take another 10% out of the training rows and
# leave the prepared validation set unused.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights
)

In [None]:
# Training vs. validation accuracy per epoch, one line per recorded metric
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training vs Validation Accuracy')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Accuracy curves of the most recent fit, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training and Validation Accuracy over Epochs')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
'''
Try a deeper model with dropout and batch normalization (no class weights).

'''

In [None]:
!pip install keras-tuner

In [None]:
import shutil

# Delete any previous tuner state so the search below starts from scratch.
# ignore_errors=True makes this a no-op when the directory does not exist yet
# (e.g. on a fresh runtime) instead of raising FileNotFoundError.
shutil.rmtree('tuner_results/lr_batchsize_tuning', ignore_errors=True)

# Then rerun tuner = RandomSearch(...) and tuner.search(...) as usual


In [None]:
from keras_tuner import HyperModel, HyperParameters, RandomSearch
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf


def build_model(hp):
    """Build and compile the deeper network for one hyperparameter set.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Source of the 'learning_rate' choice.

    Returns
    -------
    A compiled Sequential model: three hidden ReLU layers (256, 128, 64), the
    first two each followed by batch normalization and 30% dropout, and a
    3-way softmax output.
    """
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(3, activation='softmax')
    ])

    # Tune learning rate
    lr = hp.Choice('learning_rate', values=[1e-4, 3e-4, 1e-3, 3e-3])
    optimizer = Adam(learning_rate=lr)

    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


class CrimeHyperModel(HyperModel):
    """Hypermodel that tunes the batch size together with the learning rate.

    A value drawn from a standalone HyperParameters() object is never seen by
    the tuner (it is just the first choice, 16). Declaring the choice inside
    fit() makes it part of every trial's search space.
    """

    def build(self, hp):
        """Delegate model construction to build_model."""
        return build_model(hp)

    def fit(self, hp, model, *args, **kwargs):
        """Train one trial with a batch size sampled by the tuner."""
        return model.fit(
            *args,
            batch_size=hp.Choice('batch_size', values=[16, 32, 64, 128]),
            **kwargs
        )


# Set up tuner
tuner = RandomSearch(
    CrimeHyperModel(),
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_results',
    project_name='lr_batchsize_tuning'
)

# Search, validating on the dedicated (X_val, y_val) split created earlier
tuner.search(X_train, y_train,
             epochs=8,
             validation_data=(X_val, y_val),
             class_weight=class_weights)

# Get best model and hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
best_hp = tuner.get_best_hyperparameters()[0]

print("Best learning rate:", best_hp.get('learning_rate'))
print("Best batch size:", best_hp.get('batch_size'))


In [None]:
# Rebuild the model using the best hyperparameters
final_model = build_model(best_hp)

# Use the tuned batch size when the search recorded one; otherwise fall back
# to 32.
tuned_batch_size = best_hp.values.get('batch_size', 32)

# Validate on the dedicated (X_val, y_val) split rather than taking another
# 10% out of the training rows.
history = final_model.fit(
    X_train, y_train,
    epochs=50,
    validation_data=(X_val, y_val),
    batch_size=tuned_batch_size,
    class_weight=class_weights
)



In [None]:
import matplotlib.pyplot as plt

# Accuracy curves of the final fit, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(8, 5))
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Training and Validation Accuracy over Epochs')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Loss and accuracy of the final model on the held-out test split
evaluation = final_model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f"Test accuracy: {test_accuracy:.4f}")


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

# Predict class probabilities
y_pred_probs = final_model.predict(X_test)

# Convert probabilities to class predictions
y_pred = np.argmax(y_pred_probs, axis=1)

# Class names in encoded-id order, taken from the fitted LabelEncoder.
# LabelEncoder numbers classes in sorted order, so the previously hard-coded
# {'Theft': 0, 'Burglary': 1, 'Violence': 2} swapped the Theft and Burglary
# labels on the plot and in the report.
labels = [str(name) for name in le.classes_]
label_map = {name: idx for idx, name in enumerate(labels)}
index_to_label = dict(enumerate(labels))
class_ids = np.arange(len(labels))

# Confusion matrix (explicit class ids keep it square even if a class is
# missing from the test predictions)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Classification report
print(classification_report(y_test, y_pred, labels=class_ids, target_names=labels))
