In [2]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


KeyboardInterrupt: 

In [None]:
# Step 1: Load and preprocess data
from preprocessing import load_and_preprocess_data

# Fetch the fully preprocessed listings from the project's preprocessing module.
listings_all = load_and_preprocess_data()

# Keep only the rows whose "city_Vancouver" column equals 1 (Vancouver rentals).
# The explicit .copy() gives later cells an independent frame to add columns to,
# rather than a slice of the full frame.
vancouver_mask = listings_all["city_Vancouver"].eq(1)
df = listings_all.loc[vancouver_mask].copy()

In [None]:
# Step 2: Load the regression model
# Deserialize the previously saved regression model from disk.
# NOTE(review): joblib.load unpickles the file, so this path should only ever
# point at a trusted, locally produced artifact.
regression_model_path = "models/linear_regression_model.pkl"
regression_model = joblib.load(regression_model_path)

In [None]:
# Step 3: Predict fair prices and label data
# Each listing is labelled from the ratio of its actual price to the price the
# regression model predicts for it:
#   ratio < UNDERPRICED_BELOW  -> "underpriced"
#   ratio > OVERPRICED_ABOVE   -> "overpriced"
#   anything else (incl. NaN)  -> "fair"
UNDERPRICED_BELOW = 0.9
OVERPRICED_ABOVE = 1.1

# Drop the target and, if this cell has already been run once, the "label"
# column written at the bottom of the cell. Without this, re-running the cell
# would pass the string label column to the regression model as a feature.
X_reg = df.drop(columns=["price", "label"], errors="ignore")
y_actual = df["price"].values

predicted_price = regression_model.predict(X_reg)

# NOTE(review): a zero or negative prediction yields an inf/NaN or negative
# ratio here, which is then labelled like any other value - confirm the
# regression model never predicts non-positive prices for this data.
price_ratio = y_actual / predicted_price

# Vectorised labelling: the first matching condition wins, rows matching
# neither condition get the default.
labels = np.select(
    [price_ratio < UNDERPRICED_BELOW, price_ratio > OVERPRICED_ABOVE],
    ["underpriced", "overpriced"],
    default="fair",
)

df["label"] = labels

In [None]:
# Step 4: Train Random Forest Classifier
# Features are a copy of the regression inputs; the target is the "label"
# column of df.
X, y = X_reg.copy(), df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 100-tree forest, seeded so that repeated fits give the same model.
forest_options = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_options)
clf.fit(X_train, y_train)

In [None]:
# Step 5: Evaluate the classifier
# Score the held-out rows with the fitted forest.
y_pred = clf.predict(X_test)

print("Classification Report:\n")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("Confusion Matrix:\n")
# A single ordering shared by the matrix rows/columns and both tick-label axes,
# so the annotations cannot get out of step with the counts.
class_order = ["underpriced", "fair", "overpriced"]
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_order)

# heatmap() draws on the current axes and returns them; label that Axes
# directly instead of going through the pyplot state functions.
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    xticklabels=class_order,
    yticklabels=class_order,
)
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()

In [None]:
# Step 6: Save the classifier model
# Single source of truth for the output location, so the confirmation message
# can never disagree with where the file was actually written.
CLASSIFIER_PATH = "models/classifier_model.pkl"

# Persist the fitted classifier with joblib, then report the destination.
joblib.dump(clf, CLASSIFIER_PATH)
print(f"Classifier saved to {CLASSIFIER_PATH}")