# 0. Initial imports and data loading

In [None]:
import math 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
from meteostat import Point, Daily

import matplotlib.colors as colors
import matplotlib.cm as cm
import osmnx as ox
import networkx as nx

import geopandas as gpd
import shapely.geometry

# Lift pandas' column display limit so wide frames are shown without
# column truncation.
pd.set_option('display.max_columns', None)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance

In [None]:
# Root directory used to build data file paths; "." is the current working
# directory.
project_dir = "."

In [None]:
# Read the merged/reduced dataset from the project's data folder, using the
# first CSV column as the row index.
df_merged_reduced = pd.read_csv(
    f"{project_dir}/data/merged_reduced_data.csv",
    index_col=0,
)

In [None]:
# Convert created_date to pandas datetimes. Bracket indexing is used rather
# than attribute assignment: pandas only treats `df.name = ...` as a column
# write when `name` is already a column, so brackets are the unambiguous form.
df_merged_reduced['created_date'] = pd.to_datetime(df_merged_reduced['created_date'])

# 1. Modeling

In [None]:
# Turn the two categorical columns into integer codes for the models below.
# Values are cast to str first, so any missing entries are encoded as their
# own string category rather than raising.
#
# Each column gets its own encoder: re-fitting a single shared LabelEncoder
# overwrites the classes learned for the first column, which makes the
# borough codes impossible to map back to names via inverse_transform.
borough_encoder = LabelEncoder()
df_merged_reduced['borough_encoded'] = borough_encoder.fit_transform(
    df_merged_reduced['borough'].astype(str)
)

location_type_encoder = LabelEncoder()
df_merged_reduced['location_type_encoded'] = location_type_encoder.fit_transform(
    df_merged_reduced['location_type'].astype(str)
)

# Keep `le` bound to what it held before (the encoder last fit on
# location_type) so any later cell that uses it keeps working.
le = location_type_encoder

In [None]:
# Predictor columns fed to every model. The order here fixes the column order
# of X, and therefore the order of the reported feature importances.
features = [
    "month", "day_of_week", "hour",
    "latitude", "longitude",
    "location_type_encoded", "borough_encoded",
    "tavg", "prcp", "snow",
]

# Design matrix and target: the models predict complaint_type.
X = df_merged_reduced[features]
y = df_merged_reduced.complaint_type

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 1.1 Random Forest

In [None]:
# Random forest baseline: 50 trees with capped depth and a minimum leaf size,
# seeded for reproducibility and trained on two workers.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=15,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=2,
).fit(X_train, y_train)

# Score the held-out rows and print the per-class report.
predictions = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Pair each feature name with the forest's built-in importance score and
# list them from most to least important.
importances = rf.feature_importances_
feature_imp_df = (
    pd.DataFrame({'Feature': features, 'Gini Importance': importances})
    .sort_values(by='Gini Importance', ascending=False)
)
print(feature_imp_df)

## 1.2 Hist Gradient Boosting Classifier

In [None]:
# Histogram-based gradient boosting with 100 boosting iterations, seeded for
# reproducibility.
hgb = HistGradientBoostingClassifier(max_iter=100, random_state=42)
hgb.fit(X_train, y_train)

# Score the held-out rows and print the per-class report.
predictions = hgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Permutation importance of the boosted model on the held-out set: every
# feature is shuffled 10 times, using all available cores.
result = permutation_importance(
    hgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# Label the mean importances with the feature names (a Series, not a
# DataFrame) and rank them from most to least important.
feature_importances = (
    pd.Series(result.importances_mean, index=X_train.columns)
    .sort_values(ascending=False)
)

print(feature_importances)