In [1]:
# Import the neccessary libraries
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report, precision_score

2024-01-18 20:56:59.934585: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
file_path = '/Users/brandonwashington/Downloads/Crime_Data_from_2020_to_Present_20240111.csv'
df = pd.read_csv(file_path)

In [3]:
df = df.loc[:50000]

In [5]:
df.head(60)

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,10304468,01/08/2020 12:00:00 AM,01/08/2020 12:00:00 AM,2230,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,...,AO,Adult Other,624.0,,,,1100 W 39TH PL,,34.0141,-118.2978
1,190101086,01/02/2020 12:00:00 AM,01/01/2020 12:00:00 AM,330,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,...,IC,Invest Cont,624.0,,,,700 S HILL ST,,34.0459,-118.2545
2,200110444,04/14/2020 12:00:00 AM,02/13/2020 12:00:00 AM,1200,1,Central,155,2,845,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,...,AA,Adult Arrest,845.0,,,,200 E 6TH ST,,34.0448,-118.2474
3,191501505,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,1730,15,N Hollywood,1543,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,IC,Invest Cont,745.0,998.0,,,5400 CORTEEN PL,,34.1685,-118.4019
4,191921269,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,415,19,Mission,1998,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",...,IC,Invest Cont,740.0,,,,14400 TITUS ST,,34.2198,-118.4468
5,200100501,01/02/2020 12:00:00 AM,01/01/2020 12:00:00 AM,30,1,Central,163,1,121,"RAPE, FORCIBLE",...,IC,Invest Cont,121.0,998.0,,,700 S BROADWAY,,34.0452,-118.2534
6,200100502,01/02/2020 12:00:00 AM,01/02/2020 12:00:00 AM,1315,1,Central,161,1,442,SHOPLIFTING - PETTY THEFT ($950 & UNDER),...,IC,Invest Cont,442.0,998.0,,,700 S FIGUEROA ST,,34.0483,-118.2631
7,200100504,01/04/2020 12:00:00 AM,01/04/2020 12:00:00 AM,40,1,Central,155,2,946,OTHER MISCELLANEOUS CRIME,...,IC,Invest Cont,946.0,998.0,,,200 E 6TH ST,,34.0448,-118.2474
8,200100507,01/04/2020 12:00:00 AM,01/04/2020 12:00:00 AM,200,1,Central,101,1,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",...,IC,Invest Cont,341.0,998.0,,,700 BERNARD ST,,34.0677,-118.2398
9,200100509,01/04/2020 12:00:00 AM,01/04/2020 12:00:00 AM,2200,1,Central,192,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,,,,15TH,OLIVE,34.0359,-118.2648


In [6]:
df.shape

(764061, 28)

In [7]:
# Data Cleaning



lon_min, lon_max = -118.68, -118.15 # Removes crimes that occured outside of city bounds
lat_min, lat_max = 33.69, 34.35
df = df[(df['LON'] >= lon_min) & (df['LON'] <= lon_max) & (df['LAT'] >= lat_min) & (df['LAT'] <= lat_max)] # Remove rows where longitude and latitude are outside the specified bounds


df = df[~((df['LAT'] == 0) & (df['LON'] == 0))] # Now df will contain only the rows where either latitude or longitude is not 0.
df = df.drop_duplicates(subset='DR_NO', keep='first') # Now df will contain only the rows where each value in the first column is unique.
df = df.dropna(subset=['DATE OCC', 'LAT', 'LON']) # Now 'df' will contain only the rows where 'DATE OCC', 'LAT', and 'LON' do not have NaN values.
df.shape
# Find duplicated rows based on the first column
#duplicates = df[df.duplicated(subset='DR_NO', keep=False)]
#duplicates = duplicates.sort_values(by='DR_NO')

(591136, 28)

In [8]:
# Number of Crimes Committed by Day of the Week
df['DATE OCC'] = pd.to_datetime(df['DATE OCC']) # Convert 'DATE OCC' to datetime
df['Day of Week'] = df['DATE OCC'].dt.day_name()# Extract the day of the week and add it as a new column # The day_name() function returns the name of the day in a week (e.g., Monday, Tuesday)
df['Month'] = df['DATE OCC'].dt.month_name()
monthly_crime_count = df['Month'].value_counts().sort_index()
df['Months'] = df['DATE OCC'].dt.month
day_to_num = { 'Sunday': 1, 'Monday': 2, 'Tuesday': 3, 'Wednesday': 4, 'Thursday': 5, 'Friday': 6, 'Saturday': 7}
df['Days of Week'] = df['Day of Week'].map(day_to_num)# Apply the mapping to the 'Day of the Week' column
#Number of Crimes Committed by Hour of Day
crime_count = df['Day of Week'].value_counts() # Count the number of crimes for each day of the week
ordered_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] # Sort the days in the order you prefer (optional, for better visualization)
crime_count = crime_count.reindex(ordered_days)
# Number of Crimes Committed by Day
df['Hour of Day'] = df['TIME OCC'] // 100
hourly_crime_count = df['Hour of Day'].value_counts().sort_index() # Count the number of crimes for each hour
crime_count_by_date = df['DATE OCC'].value_counts().sort_index()
# Number of Crimes Occurred per Year
df['Year'] = df['DATE OCC'].dt.year # Extract the year and create a new column for it
# Group by year and count the number of crimes
crime_count_by_year = df.groupby('Year').size()
crime_count_by_area = df['AREA NAME'].value_counts()
#df.to_csv('updated_dataframe.csv', index=False)

In [9]:
num_lat_cells = 30  # Change these based on your grid size
num_lon_cells = 30

# Define the size of each grid cell
lat_step = (lat_max - lat_min) / num_lat_cells  # num_lat_cells is the number of cells in latitude
lon_step = (lon_max - lon_min) / num_lon_cells  # num_lon_cells is the number of cells in longitude

# Creating the grid lines
lat_lines = np.linspace(lat_min, lat_max, num_lat_cells+1)
lon_lines = np.linspace(lon_min, lon_max, num_lon_cells+1)

# Function to assign each crime to a grid cell
def assign_to_grid(lat, lon):
    lat_index = int((lat - lat_min) / lat_step)
    lon_index = int((lon - lon_min) / lon_step)
    return lat_index, lon_index

In [10]:
# Apply the function to your DataFrame
df['Grid_Cell'] = df.apply(lambda row: assign_to_grid(row['LAT'], row['LON']), axis=1)

# Group by 'Grid_Cell' and count crimes in each cell
crime_counts = df.groupby('Grid_Cell').size()

# Filter out grid cells with no crimes
crime_cells = crime_counts[crime_counts > 0].index.tolist()

# Keep only rows with crimes in the relevant grid cells
df = df[df['Grid_Cell'].isin(crime_cells)]

In [11]:
# Sort the grid cells by their spatial order
sorted_cells = sorted(crime_cells, key=lambda x: (x[0], x[1]))  # Sort by latitude index, then longitude index

# Create a mapping from grid cell to enumeration
cell_to_enum = {cell: i for i, cell in enumerate(sorted_cells)}

# Apply enumeration to DataFrame
df['Grid_Cell_Enum'] = df['Grid_Cell'].map(cell_to_enum)

In [12]:
df = df.sort_values(by=['DATE OCC', 'Hour of Day'])
df = df.reset_index()

# Assuming df is your DataFrame
# Ensure 'DATE OCC' and 'Hour' are already defined as described earlier

# Convert 'grid_cell_enum' to one-hot encoding
encoder = OneHotEncoder(sparse=False)
grid_cell_one_hot = encoder.fit_transform(df[['Grid_Cell_Enum']])
# Convert the one-hot encoding to a DataFrame
grid_cell_one_hot_df = pd.DataFrame(grid_cell_one_hot, columns=[f'cell_{i}' for i in range(grid_cell_one_hot.shape[1])])
# Concatenate with the original DataFrame
df2 = pd.concat([df['DATE OCC'],df['Hour of Day'],df['Months'],df['Days of Week'], grid_cell_one_hot_df], axis=1)
# Group by 'DATE OCC' and 'Hour', and sum the one-hot encoded vectors
grouped = df2.groupby(['DATE OCC', 'Hour of Day','Months','Days of Week']).sum()
# The resulting 'grouped' DataFrame contains the aggregated one-hot vectors for each hour of each day



# Assuming 'grouped' is your DataFrame and it includes one-hot encoded columns like 'cell_0', 'cell_1', etc.

# List of one-hot encoded columns
one_hot_columns = [col for col in grouped.columns if col.startswith('cell_')]

# Convert values > 0 to 1 in one-hot encoded columns
for col in one_hot_columns:
    grouped[col] = grouped[col].apply(lambda x: 1 if x > 0 else 0)

# Now, all your one-hot encoded columns have values as either 0 or 1

grouped.reset_index(inplace=True)
grouped = grouped.drop(columns = ['DATE OCC'], index=1)

  grouped.reset_index(inplace=True)
  grouped.reset_index(inplace=True)
  grouped.reset_index(inplace=True)
  grouped.reset_index(inplace=True)


In [13]:
# Define a split point, for example, 80% for training and 20% for testing
split_point = int(len(grouped) * 0.90)

# Split the data
train_df = grouped.iloc[:split_point]
test_df = grouped.iloc[split_point:]


# Columns to scale
columns_to_scale = ['Hour of Day', 'Months', 'Days of Week']

# Initialize the Min-Max Scaler
scaler = MinMaxScaler()

# Fit and transform the scaler on the training data
train_df[columns_to_scale] = scaler.fit_transform(train_df[columns_to_scale])

# Transform the test data
test_df[columns_to_scale] = scaler.transform(test_df[columns_to_scale])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[columns_to_scale] = scaler.fit_transform(train_df[columns_to_scale])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[columns_to_scale] = scaler.transform(test_df[columns_to_scale])


In [14]:
train_df

Unnamed: 0,Hour of Day,Months,Days of Week,cell_0,cell_1,cell_2,cell_3,cell_4,cell_5,cell_6,...,cell_393,cell_394,cell_395,cell_396,cell_397,cell_398,cell_399,cell_400,cell_401,cell_402
0,0.000000,0.000000,0.833333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.086957,0.000000,0.833333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.130435,0.000000,0.833333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.173913,0.000000,0.833333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.217391,0.000000,0.833333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46580,0.130435,0.363636,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46581,0.173913,0.363636,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46582,0.217391,0.363636,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46583,0.260870,0.363636,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
def create_sequences(input_data, target_columns, sequence_length):
    X = []
    y = []
    for i in range(len(input_data) - sequence_length):
        X.append(input_data.iloc[i:(i + sequence_length)].values)
        y.append(input_data.iloc[i + sequence_length][target_columns].values)
    return np.array(X), np.array(y)

# Define the sequence length
sequence_length = 24  # Example length

# Define target columns (the grid cells)
target_columns = [col for col in train_df.columns if col.startswith('cell_')]

# Create sequences and targets
X_train, y_train = create_sequences(train_df, target_columns, sequence_length)
X_test, y_test = create_sequences(test_df, target_columns, sequence_length)

In [16]:
X_train.shape

(46560, 24, 406)

In [17]:
def weighted_binary_crossentropy(y_true, y_pred, weight_positive=50.0, weight_negative=1.0):
    """
    Weighted Binary Cross-Entropy Loss for multi-label classification,
    with more penalty for false negatives than false positives.

    :param y_true: True labels
    :param y_pred: Predicted labels
    :param weight_positive: Weight for loss when y_true is 1
    :param weight_negative: Weight for loss when y_true is 0
    :return: Loss value
    """
    # Clip to prevent NaN's and Inf's
    y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1 - tf.keras.backend.epsilon())
    
    # Calculate weighted loss
    loss = -weight_positive * y_true * tf.math.log(y_pred) - weight_negative * (1 - y_true) * tf.math.log(1 - y_pred)
    
    # Count the number of positive cases (1's) in y_true
    num_positives = tf.reduce_sum(y_true, axis=-1)
    
    # Avoid division by zero
    num_positives = tf.where(tf.equal(num_positives, 0), tf.ones_like(num_positives), num_positives)
    
    # Sum the losses and divide by the number of positives
    return tf.reduce_sum(loss, axis=-1) / num_positives

In [18]:
n_features = X_train.shape[2]
model = tf.keras.models.Sequential([

    tf.keras.layers.LSTM(150,return_sequences=True, input_shape=(sequence_length, n_features)),
    tf.keras.layers.LSTM(50,return_sequences=False),
    tf.keras.layers.Dense(units=y_train.shape[1], activation='sigmoid')  # Second RNN layer (you can use LSTM or GRU here as well)
])

In [19]:
# Compile & Fit Model

epochs=30
batch_size=1

# Optimizers
learning_rate = 0.01
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-7  # Small constant to avoid division by zero

# Common loss Functions
loss_1 = 'mse'
loss_2 = 'mae'
loss_3 = 'binary_crossentropy'
loss_4 = 'categorical_crossentropy'
loss_5 = 'sparse_categorical_crossentropy'

# Custom optimizer settings:
optimizer_1 = tf.keras.optimizers.Adam(learning_rate = learning_rate)
optimizer_2 = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=beta1, beta_2=beta2, epsilon=epsilon)

# Compile the model
model.compile(
    optimizer = optimizer_1,
    #loss=lambda y_true, y_pred:weighted_binary_crossentropy(y_true, y_pred, weight_positive=1.5, weight_negative=3.0),
    #loss = focal_loss(gamma=2., alpha=4.),
    loss = weighted_binary_crossentropy,
    metrics = ['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

KeyboardInterrupt: 

In [21]:
threshold = 0.6
predictions = model.predict(X_test)
predicted_directions = (predictions > threshold).astype(int) # Boolean which says our predicted directions
predictions[predictions >= threshold] = 1
predictions[predictions < threshold] = 0
# Compare the model's predictions with the actual directions in the test data
actual_directions = y_test #The values() method returns a view object. The view object contains the values of the dictionary, as a list. The view object will reflect any changes done to the dictionary



In [22]:
l = 400
predictions[l]

array([0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
       1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 1.

In [23]:
y_test[l]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [24]:
right = 0
wrong = 0
one_count_y = 0 
one_count_p = 0
one_right = 0
one_wrong_count = 0
for j in range(len(predictions)):
    for i in range(len(predictions[j])):
        if y_test[j][i] == predictions[j][i]:
            right = right + 1
        else:
            wrong = wrong + 1
        
        if y_test[j][i] == 1:
            one_count_y = one_count_y + 1
            
        if predictions[j][i] == 1:
            one_count_p = one_count_p + 1
            
        if (predictions[j][i] == y_test[j][i]) and (y_test[j][i] == 1):
            one_right = one_right + 1
        
        if (predictions[j][i] == 1) and (y_test[j][i] == 0):
            one_wrong_count = one_wrong_count + 1

In [25]:
print("Total number of times model predicted crime event outcome correctly (Crime and No Crime):",right)
print("\nTotal number of times model predicted crime event outcome incorrectly:",wrong)
print("\nTotal number of times model predicted crime would occur in Test Set:",one_count_p)
print("\nTotal number of times crime actually occured in Test Set:", one_count_y)
print("\nNumber of times model predicted crime occuring and crime did occur:",one_right)
print("\nNumber of times model predicted crime occuring and crime did NOT occur:", one_wrong_count)

Total number of times model predicted crime event outcome correctly (Crime and No Crime): 1411368

Total number of times model predicted crime event outcome incorrectly: 664888

Total number of times model predicted crime would occur in Test Set: 722703

Total number of times crime actually occured in Test Set: 114061

Number of times model predicted crime occuring and crime did occur: 85938

Number of times model predicted crime occuring and crime did NOT occur: 636765
