In [1]:
#Basic libraries
import os
from datetime import datetime

#Python working libraries
import requests
import json
import pandas as pd

#Mongo library
from pymongo import MongoClient

#SciKitLearn libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#ML training libraries
import tensorflow as tf
from keras_tuner.tuners import Hyperband
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [2]:
#Function for our Hyperband training model (to be called later)
def build_model(hp):
    """Build and compile a two-hidden-layer binary classifier for Keras Tuner.

    Args:
        hp: Keras Tuner hyperparameter container. This function registers
            'units_layer1', 'units_layer2' and 'learning_rate' on it.

    Returns:
        A compiled ``tf.keras`` Sequential model with one sigmoid output unit,
        binary cross-entropy loss and an accuracy metric.

    Note:
        The input width is read from the notebook-level ``X_train_scaled``,
        so that array must exist before this function is called.
    """
    # Searchable widths of the two hidden layers
    units_first = hp.Int('units_layer1', min_value=32, max_value=256, step=32)
    units_second = hp.Int('units_layer2', min_value=16, max_value=128, step=16)

    n_features = X_train_scaled.shape[1]

    layer_stack = (
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(units=units_first, activation="relu", input_dim=n_features),
        # Second hidden layer
        tf.keras.layers.Dense(units=units_second, activation="relu"),
        # Output layer: single probability
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    )

    model = tf.keras.models.Sequential()
    for layer in layer_stack:
        model.add(layer)

    # Print the structure of the model (runs on every build)
    model.summary()

    # Searchable learning rate for the Adam optimizer
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [4]:
# Open a client against the local MongoDB instance
client = MongoClient(host='localhost', port=27017)

# Database holding the NYC data
db = client.get_database('nyc')

# Collection with the rat inspection records used for training
collection = db.get_collection('rats')

In [5]:
#Clean up empty and invalid data in mongo
# Fields that must exist and hold a non-blank value
fields_to_check = [
    "INSPECTION_TYPE",
    "ZIP_CODE",
    "BOROUGH",
    "INSPECTION_DATE",
    "RESULT"
]

# Coordinate fields are additionally rejected when they equal 0
coord_fields_to_check = ["LATITUDE", "LONGITUDE"]

# Raw string: "\s" inside a normal string literal is an invalid escape
# sequence (a warning in current Python versions). The pattern sent to
# MongoDB is unchanged: match values that are empty or whitespace-only.
blank_pattern = r"^\s*$"

# A document is deleted when ANY of the clauses below matches
query = {"$or": []}
for field in fields_to_check:
    query["$or"].extend([
        {field: {"$exists": False}},
        {field: ""},
        {field: {"$regex": blank_pattern}},
    ])
query["$or"].append({"ZIP_CODE": 0})

for field in coord_fields_to_check:
    query["$or"].extend([
        {field: {"$exists": False}},
        {field: ""},
        {field: {"$regex": blank_pattern}},
        {field: 0},
    ])

# Destructive: permanently removes the matching documents from the collection
result = collection.delete_many(query)

print(f"Deleted {result.deleted_count} documents.")

Deleted 0 documents.


In [6]:
#Remove all documents with an Inspection Date before 2023
threshold_date = datetime(2023, 1, 1)

# NOTE(review): this comparison uses a Python datetime; it presumably only
# matches if INSPECTION_DATE is stored as a BSON date rather than a string —
# confirm against the collection.
older_than_threshold = {"INSPECTION_DATE": {"$lt": threshold_date}}
result = collection.delete_many(older_than_threshold)

print(f"{result.deleted_count} documents were deleted.")

0 documents were deleted.


In [7]:
# Pull every document of the collection into a DataFrame
cursor = collection.find({})
df = pd.DataFrame(list(cursor))

# The Mongo-generated _id is not a feature, so remove it
del df['_id']

# Display the frame as a sanity check
df

Unnamed: 0,INSPECTION_TYPE,ZIP_CODE,LATITUDE,LONGITUDE,BOROUGH,INSPECTION_DATE,RESULT
0,Initial,12345,40.817678,-73.941974,Manhattan,2023-03-08 15:21:41,Passed
1,Initial,11377,40.738373,-73.906470,Queens,2023-06-05 20:19:22,Passed
2,Initial,10457,40.850038,-73.894424,Bronx,2023-07-17 16:05:21,Passed
3,BAIT,11385,40.708495,-73.919696,Queens,2023-04-20 17:18:23,Bait applied
4,Initial,10470,40.897316,-73.863219,Bronx,2023-03-14 13:40:50,Failed for Other R
...,...,...,...,...,...,...,...
162372,Initial,10065,40.764009,-73.966893,Manhattan,2023-02-03 16:55:09,Passed
162373,Initial,10458,40.856980,-73.886359,Bronx,2023-02-03 20:30:05,Passed
162374,Compliance,11211,40.707009,-73.951506,Brooklyn,2023-05-26 17:10:32,Rat Activity
162375,Initial,11206,40.694630,-73.935954,Brooklyn,2023-02-03 21:00:12,Rat Activity


In [8]:
#RAT ACTIVITY IS THE TARGET OF OUR ML MODEL
inspection_type = df['INSPECTION_TYPE']

# Initial / Compliance inspections count only when the result is 'Rat Activity'
reported_activity = (
    inspection_type.isin(['Initial', 'Compliance'])
    & (df['RESULT'] == 'Rat Activity')
)

# BAIT / STOPPAGE / CLEAN_UPS visits are always labelled as rat activity
treatment_visit = inspection_type.isin(['BAIT', 'STOPPAGE', 'CLEAN_UPS'])

# 1 where either rule applies, 0 everywhere else
df['RAT_ACTIVITY'] = (reported_activity | treatment_visit).astype('int64')

#view df again to make sure RAT_ACTIVITY was properly created and populated
df

Unnamed: 0,INSPECTION_TYPE,ZIP_CODE,LATITUDE,LONGITUDE,BOROUGH,INSPECTION_DATE,RESULT,RAT_ACTIVITY
0,Initial,12345,40.817678,-73.941974,Manhattan,2023-03-08 15:21:41,Passed,0
1,Initial,11377,40.738373,-73.906470,Queens,2023-06-05 20:19:22,Passed,0
2,Initial,10457,40.850038,-73.894424,Bronx,2023-07-17 16:05:21,Passed,0
3,BAIT,11385,40.708495,-73.919696,Queens,2023-04-20 17:18:23,Bait applied,1
4,Initial,10470,40.897316,-73.863219,Bronx,2023-03-14 13:40:50,Failed for Other R,0
...,...,...,...,...,...,...,...,...
162372,Initial,10065,40.764009,-73.966893,Manhattan,2023-02-03 16:55:09,Passed,0
162373,Initial,10458,40.856980,-73.886359,Bronx,2023-02-03 20:30:05,Passed,0
162374,Compliance,11211,40.707009,-73.951506,Brooklyn,2023-05-26 17:10:32,Rat Activity,1
162375,Initial,11206,40.694630,-73.935954,Brooklyn,2023-02-03 21:00:12,Rat Activity,1


In [9]:
#Data engineering features

# Parse INSPECTION_DATE and keep the calendar month as a feature
df['INSPECTION_DATE'] = pd.to_datetime(df['INSPECTION_DATE'])
df['INSPECTION_MONTH'] = df['INSPECTION_DATE'].dt.month

# Mean (target) encoding: average RAT_ACTIVITY per ZIP code
# NOTE(review): the means are computed on the full frame, before the
# train/test split, so test-row targets contribute to this feature.
mean_rat_activity = df.groupby('ZIP_CODE')['RAT_ACTIVITY'].mean()

# ZIP code -> mean lookup, reused later to encode new data
zip_code_to_mean = dict(mean_rat_activity.items())
df['ZIP_CODE_ENCODED'] = df['ZIP_CODE'].map(zip_code_to_mean)

# Rescale the coordinates to [0, 1]; the fitted scaler is reused on new data
# NOTE(review): the columns are overwritten in place, so re-running this cell
# would refit the scaler on already-scaled values.
coord_cols = ['LATITUDE', 'LONGITUDE']
lat_long_scaler = MinMaxScaler()
lat_long_scaler.fit(df[coord_cols])
df[coord_cols] = lat_long_scaler.transform(df[coord_cols])

In [10]:
# Modelling frame: the raw ZIP code, timestamp and result text are no longer needed
unused_cols = ['ZIP_CODE', 'INSPECTION_DATE', 'RESULT']
df_ml = df.drop(unused_cols, axis='columns')

In [11]:
# Expand BOROUGH and INSPECTION_TYPE into indicator columns; the first level
# of each is dropped and acts as the implicit baseline
categorical_cols = ['BOROUGH', 'INSPECTION_TYPE']
df_ml = pd.get_dummies(data=df_ml, columns=categorical_cols, drop_first=True)

# Preview the encoded frame
df_ml.head()

Unnamed: 0,LATITUDE,LONGITUDE,RAT_ACTIVITY,INSPECTION_MONTH,ZIP_CODE_ENCODED,BOROUGH_Brooklyn,BOROUGH_Manhattan,BOROUGH_Queens,BOROUGH_Staten Island,INSPECTION_TYPE_CLEAN_UPS,INSPECTION_TYPE_Compliance,INSPECTION_TYPE_Initial,INSPECTION_TYPE_STOPPAGE
0,0.771314,0.568308,0,3,0.333333,0,1,0,0,0,0,1,0
1,0.578179,0.634004,0,6,0.583062,0,0,1,0,0,0,1,0
2,0.85012,0.656294,0,7,0.421042,0,0,0,0,0,0,1,0
3,0.505417,0.609532,1,4,0.632289,0,0,1,0,0,0,0,0
4,0.965257,0.714036,0,3,0.05311,0,0,0,0,0,0,1,0


In [12]:
# Features are every column except the target; the target is RAT_ACTIVITY
X = df_ml.drop(columns=["RAT_ACTIVITY"]).values
y = df_ml["RAT_ACTIVITY"].values

# Hold out a test partition (library-default proportions, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the standardisation statistics from the training partition only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

# Hyperband search over build_model, ranking trials by validation accuracy.
# Results are written under results/nyc_dataset1/rat_optimizer
# (an earlier run used results/nyc_dataset).
tuner = Hyperband(
    hypermodel=build_model,
    objective='val_accuracy',
    max_epochs=200,
    directory='results/nyc_dataset1',
    project_name='rat_optimizer',
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                416       
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 961
Trainable params: 961
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Tuner to search for our best model
# No `epochs` argument is passed: Hyperband sets the epoch budget of every
# trial itself (bounded by the tuner's max_epochs), so a value given here
# would be overwritten and only mislead the reader.
# NOTE(review): the test split doubles as validation data for the search, so
# the accuracy later measured on that same split is optimistic.
tuner.search(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test))

Trial 145 Complete [00h 03m 22s]
val_accuracy: 0.8051484227180481

Best val_accuracy So Far: 0.8063554763793945
Total elapsed time: 01h 20m 54s

Search: Running Trial #146

Value             |Best Value So Far |Hyperparameter
160               |224               |units_layer1
80                |96                |units_layer2
0.001             |0.001             |learning_rate
67                |67                |tuner/epochs
23                |23                |tuner/initial_epoch
4                 |4                 |tuner/bracket
3                 |3                 |tuner/round
0135              |0137              |tuner/trial_id

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 160)               2080      
                                                                 
 dense_1 (Dense)             (None, 80)                12880     
     

Best Model Data:

Trial 206 Complete [00h 04m 24s]  
val_accuracy: 0.8061091303825378

Best val_accuracy So Far: 0.808276891708374  
Total elapsed time: 03h 08m 35s  

Search: Running Trial #207  

Value             |Best Value So Far |Hyperparameter  
64                |160               |units_layer1  
96                |96                |units_layer2  
0.001             |0.001             |learning_rate  
67                |200               |tuner/epochs  
23                |67                |tuner/initial_epoch  
3                 |4                 |tuner/bracket  
2                 |4                 |tuner/round  
0189              |0142              |tuner/trial_id  

In [None]:
# Rebuild the network from the best hyperparameters found by the search
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# build_model already compiles the model with the tuned learning rate,
# binary cross-entropy loss and the accuracy metric, so no second compile
# call is needed here.
best_model = build_model(best_hyperparameters)

# Stop once val_loss has not improved for 10 epochs and roll the weights back
# to the best epoch; without restore_best_weights the model would keep the
# weights of the last (worse) epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=10,
                                                  restore_best_weights=True)

# Train the model
# NOTE(review): the test split is used as validation data for early stopping.
history = best_model.fit(X_train_scaled,
                         y_train,
                         epochs=150,
                         batch_size=32,
                         validation_data=(X_test_scaled, y_test),
                         callbacks=[early_stopping])

In [None]:
# Read the three tuned values out of the best hyperparameter set
best_units_layer1, best_units_layer2, best_learning_rate = (
    best_hyperparameters.get(hp_name)
    for hp_name in ('units_layer1', 'units_layer2', 'learning_rate')
)

# Report them
print(f"Best number of units for first hidden layer: {best_units_layer1}")
print(f"Best number of units for second hidden layer: {best_units_layer2}")
print(f"Best learning rate: {best_learning_rate}")

In [None]:
# Measure loss and accuracy of the trained model on the scaled test split
loss, accuracy = best_model.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {loss}, Accuracy: {accuracy}")

In [None]:
# Persist the trained model to model/nyc_rat_model1
# (an earlier run was saved to model/nyc_rat_model)
best_model.save(filepath='model/nyc_rat_model1')

<font size="5">This is the end of the model-building code. The cells below use the trained model to make predictions on new data. To run them, first load the new rat-inspection CSV data into a new collection (rats_ml) in the nyc database in MongoDB.</font>

In [None]:
# Reconnect to the local MongoDB instance
client = MongoClient(host='localhost', port=27017)

# Same database as the training data
db = client.get_database('nyc')

# From here on, `collection` refers to the new-data collection (rats_ml)
collection = db.get_collection('rats_ml')

In [None]:
# Delete documents of the current collection that have missing, blank or
# zero values in the fields the model needs.
fields_to_check = [
    "INSPECTION_TYPE",
    "ZIP_CODE",
    "BOROUGH",
    "INSPECTION_DATE",
    "RESULT"
]

coord_fields_to_check = ["LATITUDE", "LONGITUDE"]


def _invalid_value_clauses(field, reject_zero=False):
    """Return the Mongo clauses that flag `field` as missing or blank.

    Args:
        field: Name of the document field to test.
        reject_zero: When True, a value of 0 is also treated as invalid.

    Returns:
        A list of filter dicts suitable for use inside an "$or" array.
    """
    clauses = [
        {field: {"$exists": False}},
        {field: ""},
        # Raw string: "\s" in a normal literal is an invalid escape sequence
        # (a warning in current Python). The pattern itself is unchanged.
        {field: {"$regex": r"^\s*$"}},
    ]
    if reject_zero:
        clauses.append({field: 0})
    return clauses


# A document is deleted when ANY clause matches
query = {"$or": []}
for field in fields_to_check:
    query["$or"].extend(_invalid_value_clauses(field))
query["$or"].append({"ZIP_CODE": 0})

for field in coord_fields_to_check:
    query["$or"].extend(_invalid_value_clauses(field, reject_zero=True))

# Destructive: permanently removes the matching documents
result = collection.delete_many(query)

print(f"Deleted {result.deleted_count} documents.")

In [None]:
#Remove all documents prior to Aug 17 2023 (date of test data)
threshold_date = datetime(2023, 8, 17)

# Filter for every document whose inspection date is earlier than the threshold
# NOTE(review): presumably requires INSPECTION_DATE to be stored as a BSON
# date for the comparison to match — confirm against the collection.
before_threshold = {"INSPECTION_DATE": {"$lt": threshold_date}}

# Destructive: removes the matching documents
result = collection.delete_many(before_threshold)

print(f"{result.deleted_count} documents were deleted.")

In [None]:
# Read every document of the current collection into a DataFrame,
# discarding the Mongo-generated _id column
cursor = collection.find({})
df2 = pd.DataFrame(list(cursor)).drop(columns=['_id'])

In [None]:
#data engineer dataset
# Convert the 'INSPECTION_DATE' column to datetime format
df2['INSPECTION_DATE'] = pd.to_datetime(df2['INSPECTION_DATE'])

# Extract the month from the 'INSPECTION_DATE' column
df2['INSPECTION_MONTH'] = df2['INSPECTION_DATE'].dt.month

# Encode ZIP codes with the means learned from the training data. ZIP codes
# missing from that lookup map to NaN and are filled with the mean of the
# encoded values in this frame.
# The result is assigned back rather than using fillna(inplace=True) on
# df2[...]: the chained in-place form acts on a temporary object, triggers a
# pandas warning and has no effect under copy-on-write.
zip_encoded = df2['ZIP_CODE'].map(zip_code_to_mean)
df2['ZIP_CODE_ENCODED'] = zip_encoded.fillna(zip_encoded.mean())

# Transform the LATITUDE and LONGITUDE columns using the scaler that was fitted on the original data
# NOTE(review): the columns are overwritten in place, so re-running this cell
# would scale already-scaled values.
df2[['LATITUDE', 'LONGITUDE']] = lat_long_scaler.transform(df2[['LATITUDE', 'LONGITUDE']])


In [None]:
# Inference frame: remove the columns that are not model inputs
columns_not_used = ['ZIP_CODE', 'INSPECTION_DATE', 'RESULT']
df_ml2 = df2.drop(columns_not_used, axis='columns')

In [None]:
# One-hot encode the INSPECTION_TYPE and BOROUGH columns.
# drop_first is deliberately NOT used here: it drops whichever category sorts
# first in THIS data, which differs from the training baseline whenever that
# baseline category is absent from the new data, and would silently encode
# rows under the wrong category. All levels are encoded instead, and the
# alignment below keeps exactly the indicator columns the model was trained on.
df_ml2 = pd.get_dummies(df_ml2, columns=['BOROUGH', 'INSPECTION_TYPE'])

# Feature columns of the training frame, in training order
training_feature_cols = df_ml.columns.drop("RAT_ACTIVITY")

# Training columns with no counterpart in the new data (kept for inspection)
missing_cols = set(training_feature_cols) - set(df_ml2.columns)

# Align to the training layout: absent indicator columns are added as 0,
# columns unknown to the model are dropped, and order matches df_ml
df_ml2 = df_ml2.reindex(columns=training_feature_cols, fill_value=0)

In [None]:
# Check the dataframe: display the inference features after they have been
# aligned to the df_ml feature columns
df_ml2

In [None]:
# Using the model to predict
# The network was trained on StandardScaler-transformed features, so the new
# features must go through the same fitted scaler before prediction; feeding
# unscaled values would give meaningless probabilities.
X_new_scaled = X_scaler.transform(df_ml2.to_numpy(dtype=float))
predicted_rat_activity = best_model.predict(X_new_scaled)

# Convert predictions to a DataFrame (if needed)
predicted_df = pd.DataFrame(predicted_rat_activity, columns=['Predicted_RAT_ACTIVITY'])


In [None]:
# Turn predicted probabilities into 0/1 labels: strictly above the threshold -> 1
decision_threshold = 0.5
is_positive = predicted_rat_activity > decision_threshold
predicted_class_labels = is_positive.astype(int)

In [None]:
# Store the thresholded 0/1 predictions next to the original inspection rows.
# NOTE(review): predicted_class_labels is presumably an (n, 1) array from the
# model's single output unit — confirm the row order still matches df2.
df2['Predicted_RAT_ACTIVITY'] = predicted_class_labels


In [None]:
# Display the new-data frame, now including the Predicted_RAT_ACTIVITY column
df2