In [1]:
import requests
import json
import os
from pprint import pprint
from pymongo import MongoClient
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

In [2]:
from keras_tuner.tuners import Hyperband
from tensorflow.keras.callbacks import ModelCheckpoint

In [3]:
# Connect to the MongoDB server on this machine
client = MongoClient(host="localhost", port=27017)

# Database used by this notebook
db = client["nyc"]

# Collection that the rest of the notebook cleans and reads
collection = db["rats"]

In [4]:
# Remove incomplete records from the collection.
# WARNING: this permanently deletes documents from the database.

# Fields that must be present and non-blank in every document
fields_to_check = [
    "INSPECTION_TYPE",
    "ZIP_CODE",
    "BOROUGH",
    "INSPECTION_DATE",
    "RESULT"
]

# Build an $or query that matches a document when ANY required field is
# missing, is the empty string, or contains only whitespace.
query = {"$or": []}
for field in fields_to_check:
    query["$or"].extend([
        {field: {"$exists": False}},
        {field: ""},
        # Raw string: "\s" in a normal string literal is an invalid escape
        # sequence (warning today, slated to become an error).
        {field: {"$regex": r"^\s*$"}}
    ])
# A ZIP_CODE of 0 is also treated as missing
query["$or"].append({"ZIP_CODE": 0})

# Delete documents matching the query
result = collection.delete_many(query)

# Report how many documents were removed
print(f"Deleted {result.deleted_count} documents.")

Deleted 0 documents.


In [5]:
# Keep only recent inspections: delete every document dated before the cutoff.
# WARNING: this permanently deletes documents from the database.
threshold_date = datetime(2023, 1, 1)
# Alternative cutoff (presumably for the expanded-data experiments): datetime(2022, 1, 1)

# "$lt" matches documents whose INSPECTION_DATE is strictly earlier than the cutoff
result = collection.delete_many({"INSPECTION_DATE": {"$lt": threshold_date}})

print(f"{result.deleted_count} documents were deleted.")


0 documents were deleted.


In [6]:
# Load every remaining document into a DataFrame
# (find({}) uses an empty filter, so no document is excluded here).
cursor = collection.find({})
df = pd.DataFrame(list(cursor))

In [7]:
# Drop MongoDB's _id column; it is not used as a feature.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=['_id'], errors='ignore')

In [8]:
# RAT_ACTIVITY is the target of the ML model.
# Start with every row labelled 0 (no rat activity)
df['RAT_ACTIVITY'] = 0

# An Initial or Compliance inspection counts as positive only when its result is 'Rat Activity'
inspection_found_rats = (
    df['INSPECTION_TYPE'].isin(['Initial', 'Compliance'])
    & (df['RESULT'] == 'Rat Activity')
)

# BAIT, STOPPAGE and CLEAN_UPS rows are labelled positive regardless of RESULT
treatment_visit = df['INSPECTION_TYPE'].isin(['BAIT', 'STOPPAGE', 'CLEAN_UPS'])

# Set RAT_ACTIVITY to 1 for every row that meets either condition
df.loc[inspection_found_rats | treatment_visit, 'RAT_ACTIVITY'] = 1

df


Unnamed: 0,INSPECTION_TYPE,ZIP_CODE,BOROUGH,INSPECTION_DATE,RESULT,RAT_ACTIVITY
0,Initial,10469,Bronx,2023-03-10 20:10:27,Passed,0
1,Initial,10029,Manhattan,2023-03-24 12:30:00,Rat Activity,1
2,Initial,10027,Manhattan,2023-01-20 19:31:22,Passed,0
3,Initial,11221,Brooklyn,2023-05-12 18:22:44,Passed,0
4,Initial,10451,Bronx,2023-01-19 21:08:39,Passed,0
...,...,...,...,...,...,...
165367,Initial,10065,Manhattan,2023-02-03 16:55:09,Passed,0
165368,Initial,10458,Bronx,2023-02-03 20:30:05,Passed,0
165369,Compliance,11211,Brooklyn,2023-05-26 17:10:32,Rat Activity,1
165370,Initial,11206,Brooklyn,2023-02-03 21:00:12,Rat Activity,1


In [9]:
#Data engineering features
# Convert the 'INSPECTION_DATE' column to datetime format
df['INSPECTION_DATE'] = pd.to_datetime(df['INSPECTION_DATE'])

# Extract the month from the 'INSPECTION_DATE' column
df['INSPECTION_MONTH'] = df['INSPECTION_DATE'].dt.month

# Print the updated dataframe
print(df[['INSPECTION_DATE', 'INSPECTION_MONTH']])

           INSPECTION_DATE  INSPECTION_MONTH
0      2023-03-10 20:10:27                 3
1      2023-03-24 12:30:00                 3
2      2023-01-20 19:31:22                 1
3      2023-05-12 18:22:44                 5
4      2023-01-19 21:08:39                 1
...                    ...               ...
165367 2023-02-03 16:55:09                 2
165368 2023-02-03 20:30:05                 2
165369 2023-05-26 17:10:32                 5
165370 2023-02-03 21:00:12                 2
165371 2023-03-02 18:51:42                 3

[165372 rows x 2 columns]


In [10]:
# Target-encode ZIP_CODE: each ZIP code is represented by the share of its
# rows that have RAT_ACTIVITY == 1.
# NOTE(review): the means are computed over all rows, including rows that a
# later train/test split will place in the test set, so the target leaks into
# this feature. Consider computing the means on the training rows only.
mean_rat_activity = df.groupby('ZIP_CODE')['RAT_ACTIVITY'].mean()

# Plain-dict view of the same ZIP code -> mean lookup
zip_code_to_mean = dict(mean_rat_activity)

# Look up each row's ZIP code in the per-ZIP means
df['ZIP_CODE_ENCODED'] = df['ZIP_CODE'].map(mean_rat_activity)


In [11]:
# Build the modelling frame from df without the raw ZIP_CODE, RESULT and INSPECTION_DATE columns
df_ml = df.drop(columns=['ZIP_CODE', 'RESULT', 'INSPECTION_DATE'])

In [12]:
# Optional experiment: drop BOROUGH to see whether the results improve (uncomment the line below)
#df_ml = df_ml.drop(columns=['BOROUGH'])

In [13]:
# Preview the modelling frame before the categorical columns are encoded
df_ml

Unnamed: 0,INSPECTION_TYPE,BOROUGH,RAT_ACTIVITY,INSPECTION_MONTH,ZIP_CODE_ENCODED
0,Initial,Bronx,0,3,0.021053
1,Initial,Manhattan,1,3,0.637914
2,Initial,Manhattan,0,1,0.470306
3,Initial,Brooklyn,0,5,0.383615
4,Initial,Bronx,0,1,0.503724
...,...,...,...,...,...
165367,Initial,Manhattan,0,2,0.593220
165368,Initial,Bronx,0,2,0.415266
165369,Compliance,Brooklyn,1,5,0.616140
165370,Initial,Brooklyn,1,2,0.426430


In [14]:
# One-hot encode the categorical INSPECTION_TYPE and BOROUGH columns
# (drop_first=True omits the first category of each column).
# NOTE(review): this cell overwrites df_ml, so it cannot be re-run on its own;
# re-run the cell that creates df_ml first.
df_ml = pd.get_dummies(df_ml, columns=['INSPECTION_TYPE', 'BOROUGH'], drop_first=True)
# Alternative for when BOROUGH has been dropped from df_ml:
#df_ml = pd.get_dummies(df_ml, columns=['INSPECTION_TYPE'], drop_first=True)

# Display the first rows of the encoded dataframe
df_ml.head()

Unnamed: 0,RAT_ACTIVITY,INSPECTION_MONTH,ZIP_CODE_ENCODED,INSPECTION_TYPE_CLEAN_UPS,INSPECTION_TYPE_Compliance,INSPECTION_TYPE_Initial,INSPECTION_TYPE_STOPPAGE,BOROUGH_Brooklyn,BOROUGH_Manhattan,BOROUGH_Queens,BOROUGH_Staten Island
0,0,3,0.021053,0,0,1,0,0,0,0,0
1,1,3,0.637914,0,0,1,0,0,1,0,0
2,0,1,0.470306,0,0,1,0,0,1,0,0
3,0,5,0.383615,0,0,1,0,1,0,0,0
4,0,1,0.503724,0,0,1,0,0,0,0,0


In [15]:
# Model factory for the Hyperband tuner
def build_model(hp):
    """Build and compile the binary classifier for one tuner trial.

    The network has two ReLU hidden layers (16 and 8 units) and a single
    sigmoid output unit. Only the Adam learning rate is searched.

    Args:
        hp: hyperparameter object supplied by the tuner; used to pick
            'learning_rate' from 1e-2, 1e-3 or 1e-4.

    Returns:
        The compiled Keras model (binary cross-entropy loss, accuracy metric).

    Note:
        The input width is read from the notebook-level ``X_train_scaled``,
        so that variable must be assigned before this function is called.
    """
    dense = tf.keras.layers.Dense

    # Hidden layers of 16 and 8 units, then the sigmoid output unit
    model = tf.keras.models.Sequential([
        dense(units=16, activation="relu", input_dim=X_train_scaled.shape[1]),
        dense(units=8, activation="relu"),
        dense(units=1, activation="sigmoid"),
    ])

    # Print the layer structure (runs on every call to this function)
    model.summary()

    # The learning rate is the only searchable hyperparameter
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [16]:
# Separate the preprocessed data into the target array (y) and the feature matrix (X)
y = df_ml["RAT_ACTIVITY"].values
X = df_ml.drop(columns=["RAT_ACTIVITY"]).values

# Split into training and testing sets; random_state fixes the shuffle, and
# no test_size is given, so scikit-learn's default proportion applies
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training features only, so test rows do not influence the scaling
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Sanity check: (rows, features) of the scaled training matrix
print(X_train_scaled.shape)

(124029, 10)


In [19]:
# Hyperband tuner over build_model, ranking trials by validation accuracy.
# Tuner state is written under results/nyc_dataset/rat_optimizer.
# NOTE(review): no seed or overwrite argument is passed — confirm whether
# results left in that directory by earlier runs should be reused.
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=200,
    directory='results/nyc_dataset',
    project_name='rat_optimizer'
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                176       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 321
Trainable params: 321
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Run the hyperparameter search.
# NOTE(review): the test split is used as validation data here, so the test
# accuracy reported later is not measured on data unseen during model selection.
tuner.search(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test))


Trial 3 Complete [00h 00m 17s]
val_accuracy: 0.7990469932556152

Best val_accuracy So Far: 0.7993856072425842
Total elapsed time: 00h 00m 51s
INFO:tensorflow:Oracle triggered exit


In [21]:
# Take the top-ranked model from the search
best_model = tuner.get_best_models(num_models=1)[0]



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                176       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 321
Trainable params: 321
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Hyperparameters of the top-ranked trial
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

In [23]:
# Learning rate chosen by the search ('learning_rate' is the only hyperparameter build_model defines)
best_lr = best_hyperparameters.get('learning_rate')

In [24]:
# Recompile the best model with a fresh Adam optimizer at the tuned learning rate
best_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_lr),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 10 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# Continue training for at most 100 epochs, validating on the test split
history = best_model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_test_scaled, y_test),
    callbacks=[early_stopping],
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


In [25]:
# Evaluate the trained model on the scaled test split (the same split used as validation data during training)
loss, accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {loss}, Accuracy: {accuracy}")

1292/1292 - 1s - loss: 0.4155 - accuracy: 0.8001 - 1s/epoch - 835us/step
Loss: 0.41546982526779175, Accuracy: 0.800062894821167


That was the test of the neural network on the full data set. Next, we train it separately on individual boroughs to see how it performs there. We look at Brooklyn, the Bronx, and Manhattan (the boroughs with the highest incidence of rats).

In [None]:
# One DataFrame per borough, each holding only that borough's rows of df
staten_island, bronx, brooklyn, manhattan, queens = (
    df[df['BOROUGH'] == borough_name]
    for borough_name in ('Staten Island', 'Bronx', 'Brooklyn', 'Manhattan', 'Queens')
)


In [None]:
def preprocess_borough_data(borough_data):
    """Return a model-ready copy of one borough's rows.

    Drops the INSPECTION_DATE, RESULT, BOROUGH and ZIP_CODE columns and
    one-hot encodes INSPECTION_TYPE (omitting its first category). All other
    columns are kept in their original order.

    The input frame is left untouched, so the function can be called
    repeatedly on the same frame (the previous in-place drop removed columns
    from the caller's frame and made a second call raise KeyError).

    Args:
        borough_data: DataFrame containing at least the columns named above.

    Returns:
        A new DataFrame with the dropped columns removed and INSPECTION_TYPE
        replaced by its dummy columns.
    """
    # Drop the unnecessary columns; ZIP_CODE goes too because the frame
    # already carries its encoded version
    features = borough_data.drop(
        columns=['INSPECTION_DATE', 'RESULT', 'BOROUGH', 'ZIP_CODE']
    )

    # One-hot encode the INSPECTION_TYPE column
    return pd.get_dummies(features, columns=['INSPECTION_TYPE'], drop_first=True)

# Run the shared preprocessing over every borough frame, in the same order as before
(staten_island_ml,
 bronx_ml,
 brooklyn_ml,
 manhattan_ml,
 queens_ml) = map(
    preprocess_borough_data,
    (staten_island, bronx, brooklyn, manhattan, queens),
)

BROOKLYN DATA TEST

In [None]:
# Brooklyn: tune, train and evaluate a model on Brooklyn rows only.
# This cell reassigns the notebook-level X/y/scaler/tuner/model variables.
# Separate the preprocessed data into the target array (y) and the feature matrix (X)
y = brooklyn_ml["RAT_ACTIVITY"].values
X = brooklyn_ml.drop(columns=["RAT_ACTIVITY"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X_train)

# Scale the data; build_model reads X_train_scaled for its input width,
# so this must be assigned before the tuner is created
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Hyperband tuner with its own results directory for this borough
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=200,
    directory='results/hyperband_results_brooklyn',
    project_name='rat_optimizer'
)

# NOTE(review): the test split doubles as validation data for the search and
# for early stopping below, so the final accuracy is not on unseen data.
tuner.search(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test))

# Top-ranked model from the search
best_model = tuner.get_best_models(num_models=1)[0]

# Hyperparameters of the top-ranked trial
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

best_lr = best_hyperparameters.get('learning_rate')

# Recompile the model with a fresh Adam optimizer at the tuned learning rate
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=best_lr),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Stop once validation loss has not improved for 5 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Continue training for at most 40 epochs
history = best_model.fit(X_train_scaled,
                         y_train,
                         epochs=40,
                         batch_size=32,
                         validation_data=(X_test_scaled, y_test),
                         callbacks=[early_stopping])

# Evaluate the model on the scaled test split
loss, accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Brooklyn: Loss: {loss}, Accuracy: {accuracy}")


BRONX DATA TEST

In [None]:
# Bronx: tune, train and evaluate a model on Bronx rows only.
# This cell reassigns the notebook-level X/y/scaler/tuner/model variables.
# Separate the preprocessed data into the target array (y) and the feature matrix (X)
y = bronx_ml["RAT_ACTIVITY"].values
X = bronx_ml.drop(columns=["RAT_ACTIVITY"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X_train)

# Scale the data; build_model reads X_train_scaled for its input width,
# so this must be assigned before the tuner is created
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Hyperband tuner with its own results directory for this borough
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=200,
    directory='results/hyperband_results_bronx',
    project_name='rat_optimizer'
)

# NOTE(review): the test split doubles as validation data for the search and
# for early stopping below, so the final accuracy is not on unseen data.
tuner.search(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test))

# Top-ranked model from the search
best_model = tuner.get_best_models(num_models=1)[0]

# Hyperparameters of the top-ranked trial
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

best_lr = best_hyperparameters.get('learning_rate')

# Recompile the model with a fresh Adam optimizer at the tuned learning rate
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=best_lr),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Stop once validation loss has not improved for 5 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Continue training for at most 40 epochs
history = best_model.fit(X_train_scaled,
                         y_train,
                         epochs=40,
                         batch_size=32,
                         validation_data=(X_test_scaled, y_test),
                         callbacks=[early_stopping])

# Evaluate the model on the scaled test split
loss, accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Bronx: Loss: {loss}, Accuracy: {accuracy}")


MANHATTAN DATA TEST

In [None]:
# Manhattan: tune, train and evaluate a model on Manhattan rows only.
# This cell reassigns the notebook-level X/y/scaler/tuner/model variables.
# Separate the preprocessed data into the target array (y) and the feature matrix (X)
y = manhattan_ml["RAT_ACTIVITY"].values
X = manhattan_ml.drop(columns=["RAT_ACTIVITY"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X_train)

# Scale the data; build_model reads X_train_scaled for its input width,
# so this must be assigned before the tuner is created
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Hyperband tuner with its own results directory for this borough
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=200,
    directory='results/hyperband_results_manhattan',
    project_name='rat_optimizer'
)

# NOTE(review): the test split doubles as validation data for the search and
# for early stopping below, so the final accuracy is not on unseen data.
tuner.search(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test))

# Top-ranked model from the search
best_model = tuner.get_best_models(num_models=1)[0]

# Hyperparameters of the top-ranked trial
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

best_lr = best_hyperparameters.get('learning_rate')

# Recompile the model with a fresh Adam optimizer at the tuned learning rate
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=best_lr),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Stop once validation loss has not improved for 5 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Continue training for at most 40 epochs
history = best_model.fit(X_train_scaled,
                         y_train,
                         epochs=40,
                         batch_size=32,
                         validation_data=(X_test_scaled, y_test),
                         callbacks=[early_stopping])

# Evaluate the model on the scaled test split
loss, accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Manhattan: Loss: {loss}, Accuracy: {accuracy}")


Testing Notes:

Base test: layers of 16/8/1 units, 50 epochs
Full Dataset: 1292/1292 - 1s - loss: 0.4138 - accuracy: 0.8003 - 858ms/epoch - 664us/step
Loss: 0.4137950539588928, Accuracy: 0.8003047704696655

Brooklyn: 385/385 - 0s - loss: 0.4390 - accuracy: 0.7908 - 273ms/epoch - 708us/step
Loss: 0.4390130043029785, Accuracy: 0.7908262610435486

Bronx: 374/374 - 0s - loss: 0.4202 - accuracy: 0.7940 - 264ms/epoch - 707us/step
Loss: 0.4202301502227783, Accuracy: 0.7940191030502319

Manhattan: 429/429 - 0s - loss: 0.3873 - accuracy: 0.8155 - 338ms/epoch - 788us/step
Loss: 0.38726359605789185, Accuracy: 0.8155113458633423

First Test: Hyperband Optimized (same data as above)
Full Dataset: 1292/1292 - 1s - loss: 0.4134 - accuracy: 0.7997 - 868ms/epoch - 672us/step
Loss: 0.41337573528289795, Accuracy: 0.7996758818626404

Brooklyn: 385/385 - 0s - loss: 0.4410 - accuracy: 0.7907 - 279ms/epoch - 726us/step
Loss: 0.4409985840320587, Accuracy: 0.7906636595726013

Bronx: 374/374 - 0s - loss: 0.4238 - accuracy: 0.7952 - 302ms/epoch - 808us/step
Loss: 0.4238165020942688, Accuracy: 0.7951918244361877

Manhattan: 429/429 - 0s - loss: 0.3951 - accuracy: 0.8146 - 294ms/epoch - 684us/step
Loss: 0.39511218667030334, Accuracy: 0.8146366477012634

Second Test: REMOVE BOROUGH
Full Dataset: 1292/1292 - 1s - loss: 0.4181 - accuracy: 0.7984 - 720ms/epoch - 557us/step
Loss: 0.4180651605129242, Accuracy: 0.7983697652816772

Brooklyn: 385/385 - 0s - loss: 0.4415 - accuracy: 0.7909 - 370ms/epoch - 960us/step
Loss: 0.44151216745376587, Accuracy: 0.790907621383667

Bronx: 385/385 - 0s - loss: 0.4390 - accuracy: 0.7909 - 225ms/epoch - 584us/step
Loss: 0.4389711618423462, Accuracy: 0.790907621383667

Manhattan: 429/429 - 0s - loss: 0.3889 - accuracy: 0.8133 - 245ms/epoch - 570us/step
Loss: 0.3889065682888031, Accuracy: 0.8133245706558228

Third Test: EXPANDED DATA (added 2022 data, kept BOROUGH out of the data)
Full Dataset: 3217/3217 - 2s - loss: 0.4453 - accuracy: 0.7800 - 2s/epoch - 539us/step
Loss: 0.4452953338623047, Accuracy: 0.7800136208534241

Brooklyn: 1073/1073 - 1s - loss: 0.4757 - accuracy: 0.7553 - 571ms/epoch - 532us/step
Loss: 0.4757363796234131, Accuracy: 0.7552751302719116

Bronx: 872/872 - 0s - loss: 0.4454 - accuracy: 0.7841 - 475ms/epoch - 544us/step
Loss: 0.445403128862381, Accuracy: 0.7840550541877747

Manhattan:
1025/1025 - 1s - loss: 0.4179 - accuracy: 0.7937 - 574ms/epoch - 560us/step
Loss: 0.41794469952583313, Accuracy: 0.7936880588531494

Fourth Test: EXPANDED DATA + BOROUGH
Full Dataset: 3217/3217 - 2s - loss: 0.4445 - accuracy: 0.7827 - 2s/epoch - 543us/step
Loss: 0.44453078508377075, Accuracy: 0.7826564311981201

Brooklyn: 1073/1073 - 1s - loss: 0.4702 - accuracy: 0.7687 - 578ms/epoch - 539us/step
Loss: 0.4702180325984955, Accuracy: 0.7687398195266724

Bronx: 872/872 - 0s - loss: 0.4450 - accuracy: 0.7841 - 488ms/epoch - 560us/step
Loss: 0.44502782821655273, Accuracy: 0.7840909361839294

Manhattan: 1025/1025 - 1s - loss: 0.4186 - accuracy: 0.7972 - 575ms/epoch - 561us/step
Loss: 0.41864851117134094, Accuracy: 0.7971947193145752

