In [1]:
#Basic libraries
import os
from datetime import datetime

#python working libraries
import requests
import json
import pandas as pd

#mongo library
from pymongo import MongoClient

#ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#ml training libraries
import tensorflow as tf
from keras_tuner.tuners import Hyperband
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [2]:
#Function for our Hyperband training model (to be called later)
def build_model(hp, input_dim=None):
    """Build and compile a two-hidden-layer binary classifier for Keras Tuner.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Search-space handle supplied by the tuner. Three values are drawn from it:
        ``units_layer1`` (32 to 256 in steps of 32), ``units_layer2`` (16 to 128 in
        steps of 16) and ``learning_rate`` (one of 1e-2, 1e-3, 1e-4).
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``X_train_scaled`` array is used, so that array must exist by the time the
        function is called.

    Returns
    -------
    tf.keras.Model
        A compiled ``Sequential`` model with a single sigmoid output unit, trained
        with binary cross-entropy and reporting accuracy.
    """
    if input_dim is None:
        # Fallback for callers that pass only `hp` (the tuner does): read the feature
        # count from the scaled training matrix created in a later cell.
        input_dim = X_train_scaled.shape[1]

    model = tf.keras.models.Sequential()

    # First hidden layer
    model.add(tf.keras.layers.Dense(units=hp.Int('units_layer1', min_value=32, max_value=256, step=32),
                                    activation="relu",
                                    input_dim=input_dim))

    # Second hidden layer
    model.add(tf.keras.layers.Dense(units=hp.Int('units_layer2', min_value=16, max_value=128, step=16),
                                    activation="relu"))

    # Output layer: one sigmoid unit for the binary target
    model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Print the layer structure; this runs on every call, i.e. once per tuner trial
    model.summary()

    # Compile the model with a searchable learning rate
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [3]:
# Open a connection to the MongoDB server running on this machine
client = MongoClient(host='localhost', port=27017)

# Database that holds the NYC data
db = client['nyc']

# Collection with the rat inspection documents
collection = db['rats']

In [4]:
# Clean up empty and invalid data in Mongo.
# NOTE: delete_many permanently removes every matching document from the collection.

def _blank_value_clauses(field, reject_zero=False):
    """Return the ``$or`` clauses that flag `field` as missing or blank.

    A field counts as blank when it does not exist, is the empty string, or is a
    string made only of whitespace. With ``reject_zero=True`` a value of 0 is
    flagged as well.
    """
    clauses = [
        {field: {"$exists": False}},
        {field: ""},
        # Raw string: "\s" inside a normal literal is an invalid escape sequence
        {field: {"$regex": r"^\s*$"}},
    ]
    if reject_zero:
        clauses.append({field: 0})
    return clauses


fields_to_check = [
    "INSPECTION_TYPE",
    "ZIP_CODE",
    "BOROUGH",
    "INSPECTION_DATE",
    "RESULT"
]

coord_fields_to_check = ["LATITUDE", 'LONGITUDE']

# A document is deleted as soon as any one of the clauses below matches
query = {"$or": []}
for field in fields_to_check:
    query["$or"].extend(_blank_value_clauses(field))

# A zip code of 0 is treated as invalid too
query["$or"].append({"ZIP_CODE": 0})

# Coordinates are invalid when blank or exactly 0
for field in coord_fields_to_check:
    query["$or"].extend(_blank_value_clauses(field, reject_zero=True))

result = collection.delete_many(query)

print(f"Deleted {result.deleted_count} documents.")

Deleted 0 documents.


In [5]:
# Delete every document whose inspection date falls before 1 January 2023.
# NOTE(review): "$lt" against a datetime presumably only matches values stored as
# dates; if INSPECTION_DATE is stored as a string nothing would match - TODO confirm.
threshold_date = datetime(2023, 1, 1)
outdated_filter = {"INSPECTION_DATE": {"$lt": threshold_date}}

result = collection.delete_many(outdated_filter)

print(f"{result.deleted_count} documents were deleted.")

0 documents were deleted.


In [7]:
# Bring data in from Mongo to Pandas.
# The projection {"_id": 0} tells Mongo to leave the '_id' field out of every returned
# document, so no separate drop step is needed and an empty collection simply yields
# an empty frame instead of a KeyError on a missing '_id' column.
cursor = collection.find({}, {"_id": 0})
df = pd.DataFrame(list(cursor))

# View df to check everything looks good
df

Unnamed: 0,INSPECTION_TYPE,ZIP_CODE,LATITUDE,LONGITUDE,BOROUGH,INSPECTION_DATE,RESULT
0,Initial,12345,40.817678,-73.941974,Manhattan,2023-03-08 15:21:41,Passed
1,Initial,11377,40.738373,-73.906470,Queens,2023-06-05 20:19:22,Passed
2,Initial,10457,40.850038,-73.894424,Bronx,2023-07-17 16:05:21,Passed
3,BAIT,11385,40.708495,-73.919696,Queens,2023-04-20 17:18:23,Bait applied
4,Initial,10470,40.897316,-73.863219,Bronx,2023-03-14 13:40:50,Failed for Other R
...,...,...,...,...,...,...,...
162372,Initial,10065,40.764009,-73.966893,Manhattan,2023-02-03 16:55:09,Passed
162373,Initial,10458,40.856980,-73.886359,Bronx,2023-02-03 20:30:05,Passed
162374,Compliance,11211,40.707009,-73.951506,Brooklyn,2023-05-26 17:10:32,Rat Activity
162375,Initial,11206,40.694630,-73.935954,Brooklyn,2023-02-03 21:00:12,Rat Activity


In [None]:
# RAT ACTIVITY IS THE TARGET OF OUR ML MODEL
# A row is labelled 1 when either
#   * an 'Initial' or 'Compliance' inspection has the result 'Rat Activity', or
#   * the inspection type is 'BAIT', 'STOPPAGE' or 'CLEAN_UPS' (whatever the result);
# every other row is labelled 0.
inspection_kind = df['INSPECTION_TYPE']
activity_reported = inspection_kind.isin(['Initial', 'Compliance']) & (df['RESULT'] == 'Rat Activity')
always_positive_kind = inspection_kind.isin(['BAIT', 'STOPPAGE', 'CLEAN_UPS'])

# Store the combined mask as a 0/1 integer column
df['RAT_ACTIVITY'] = (activity_reported | always_positive_kind).astype('int64')

# Show df again to confirm RAT_ACTIVITY was created and populated
df

In [None]:
# Data engineering features

# Parse 'INSPECTION_DATE' into datetimes and keep the month number as a feature
df['INSPECTION_DATE'] = pd.to_datetime(df['INSPECTION_DATE'])
df['INSPECTION_MONTH'] = df['INSPECTION_DATE'].dt.month

# Mean-encode the zip code: each ZIP_CODE is replaced by the average RAT_ACTIVITY of
# the rows sharing it.
# NOTE(review): the averages are taken from the target over the whole frame, before the
# train/test split, so test labels feed into this feature - consider fitting the
# encoding on the training rows only.
mean_rat_activity = df.groupby('ZIP_CODE')['RAT_ACTIVITY'].mean()
zip_code_to_mean = dict(mean_rat_activity)
df['ZIP_CODE_ENCODED'] = df['ZIP_CODE'].map(zip_code_to_mean)

# Rescale both coordinate columns with a MinMaxScaler, overwriting them in place.
# NOTE(review): this scaler is also fitted on the full frame rather than on training rows.
coordinate_cols = ['LATITUDE', 'LONGITUDE']
scaler = MinMaxScaler()
df[coordinate_cols] = scaler.fit_transform(df[coordinate_cols])

In [None]:
# Build the frame used for machine learning: a copy of df without the columns that are
# no longer needed at this point
unused_cols = ['ZIP_CODE', 'INSPECTION_DATE', 'RESULT']
df_ml = df.drop(columns=unused_cols)

In [None]:
# One-hot encode the BOROUGH and INSPECTION_TYPE columns; drop_first=True leaves out
# the first category of each column
categorical_cols = ['BOROUGH', 'INSPECTION_TYPE']
df_ml = pd.get_dummies(df_ml, columns=categorical_cols, drop_first=True)

# Preview the encoded dataframe
df_ml.head()

In [None]:
# Separate the target vector from the feature matrix
y = df_ml["RAT_ACTIVITY"].values
feature_frame = df_ml.drop(columns=["RAT_ACTIVITY"])
X = feature_frame.values

# Hold out a test set with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a StandardScaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

# Hyperband tuner around build_model, selecting on validation accuracy; results are
# written under results/nyc_dataset/rat_optimizer
hyperband_settings = dict(
    objective='val_accuracy',
    max_epochs=200,
    directory='results/nyc_dataset',
    project_name='rat_optimizer',
)
tuner = Hyperband(build_model, **hyperband_settings)

In [None]:
# Run the hyperparameter search to find our best model.
# NOTE(review): the test split doubles as the validation data here, so the test set
# influences which hyperparameters are chosen - consider a separate validation split.
# NOTE(review): presumably Hyperband decides the epochs of each trial itself, which
# would make epochs=150 ineffective - verify against the keras_tuner docs.
search_validation = (X_test_scaled, y_test)
tuner.search(
    X_train_scaled,
    y_train,
    epochs=150,
    validation_data=search_validation,
)

Best Model Data:



In [None]:
# Rebuild the network from the best hyperparameters found by the search.
# build_model already compiles the model with Adam at the selected learning rate,
# binary cross-entropy loss and the accuracy metric, so no second compile is needed.
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

best_model = build_model(best_hyperparameters)

# Stop once val_loss has gone 10 epochs without improving and, when stopping, restore
# the weights from the epoch with the best val_loss; without restore_best_weights the
# model would keep the weights of the last (worse) epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=10,
                                                  restore_best_weights=True)

# Train the model
history = best_model.fit(X_train_scaled,
                         y_train,
                         epochs=100,
                         batch_size=32,
                         validation_data=(X_test_scaled, y_test),
                         callbacks=[early_stopping])

In [None]:
# Score the trained model on the test split; evaluate returns the loss followed by the
# compiled metric (accuracy)
test_metrics = best_model.evaluate(X_test_scaled, y_test, verbose=2)
loss, accuracy = test_metrics
print(f"Loss: {loss}, Accuracy: {accuracy}")

In [None]:
# Persist best_model to disk under model/nyc_rat_model
# NOTE(review): the path has no file extension; newer Keras releases may require a
# '.keras' or '.h5' suffix - TODO confirm against the installed version.
best_model.save(filepath='model/nyc_rat_model')

In [None]:
# Predicted probabilities for the test set (the output layer is a sigmoid)
predictions = best_model.predict(X_test_scaled)

# Turn probabilities into hard 0/1 labels: strictly above 0.5 becomes 1
above_threshold = predictions > 0.5
binary_predictions = above_threshold.astype("int32")

In [None]:
# Load the saved model
loaded_model = load_model('model/nyc_rat_model')

# NOTE(review): new_data_scaled is not defined in any cell shown in this notebook. The new
# data has to be loaded and put through the same preprocessing and scaling as the
# training data (planned for a separate notebook) before this cell can run.
new_predictions = loaded_model.predict(new_data_scaled)

# Binary predictions (0 or 1) for the new data: strictly above 0.5 becomes 1
binary_new_predictions = (new_predictions > 0.5).astype("int32")

# Print up to the first 10 predictions; the bound is capped at the number of rows so
# that fewer than 10 new samples does not raise an IndexError
for i in range(min(10, len(new_predictions))):
    print(f"Predicted Probability: {new_predictions[i]}, Binary Prediction: {binary_new_predictions[i]}")