In [9]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [10]:
# load cleaned dataset
df = pd.read_csv("../data/processed/cleaned_rentals.csv")
print("Dataset loaded with shape:", df.shape)
df.head()

Dataset loaded with shape: (196, 49)


Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,price,beds,baths,sq_feet,...,availability_date_Negotiable,availability_date_No Vacancy,availability_date_November 24,availability_date_October 01,availability_date_October 04,availability_date_September 01,smoking_Non-Smoking,smoking_Smoke Free Building,cats_True,dogs_True
0,544095,Vancouver,British Columbia,1770 Pendrell Street,49.28746,-123.14053,3895.0,2,2.0,820.0,...,False,False,False,False,False,False,True,False,True,True
1,544095,Vancouver,British Columbia,1770 Pendrell Street,49.28746,-123.14053,2695.0,1,1.0,440.0,...,False,False,False,False,False,False,True,False,True,True
2,544095,Vancouver,British Columbia,1770 Pendrell Street,49.28746,-123.14053,4395.0,2,2.0,639.0,...,False,False,False,False,False,False,True,False,True,True
3,544095,Vancouver,British Columbia,1770 Pendrell Street,49.28746,-123.14053,3300.0,1,1.0,650.0,...,False,False,False,False,False,False,True,False,True,True
4,559642,Vancouver,British Columbia,1477 Continental Street,49.274461,-123.130948,2450.0,1,1.0,639.0,...,False,False,False,False,False,False,False,False,True,True


In [11]:
# Keep only the desired columns
cols_to_keep = [
    "city", "province", "address", "sq_ft", "beds", "baths", "type_Townhouse",
     "type_Basement","type_Condo Unit","type_Main Floor", "latitude", "longitude", "price"
]
available_cols = [col for col in cols_to_keep if col in df.columns]
df = df[available_cols].dropna()
print("Dataset loaded with shape:", df.shape)
df.head()


Dataset loaded with shape: (196, 12)


Unnamed: 0,city,province,address,beds,baths,type_Townhouse,type_Basement,type_Condo Unit,type_Main Floor,latitude,longitude,price
0,Vancouver,British Columbia,1770 Pendrell Street,2,2.0,False,False,False,False,49.28746,-123.14053,3895.0
1,Vancouver,British Columbia,1770 Pendrell Street,1,1.0,False,False,False,False,49.28746,-123.14053,2695.0
2,Vancouver,British Columbia,1770 Pendrell Street,2,2.0,False,False,False,False,49.28746,-123.14053,4395.0
3,Vancouver,British Columbia,1770 Pendrell Street,1,1.0,False,False,False,False,49.28746,-123.14053,3300.0
4,Vancouver,British Columbia,1477 Continental Street,1,1.0,False,False,False,False,49.274461,-123.130948,2450.0


In [12]:
#Step 2: Load the regression model 
regression_model = joblib.load("models/linear_regression_model.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'models/linear_regression_model.pkl'

In [None]:
#Step 3: Predict fair prices and label data 
X_reg = df.drop(columns=["price"])
y_actual = df["price"].values

predicted_price = regression_model.predict(X_reg)

price_ratio = y_actual / predicted_price
labels = []
for ratio in price_ratio:
    if ratio < 0.9:
        labels.append("underpriced")
    elif ratio > 1.1:
        labels.append("overpriced")
    else:
        labels.append("fair")

df["label"] = labels

In [None]:
#Step 4: Train Random Forest Classifier
X = X_reg.copy()
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
#Step 5: Evaluate the classifier 
y_pred = clf.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n")
conf_matrix = confusion_matrix(y_test, y_pred, labels=["underpriced", "fair", "overpriced"])
sns.heatmap(conf_matrix, annot=True, fmt="d", xticklabels=["underpriced", "fair", "overpriced"], yticklabels=["underpriced", "fair", "overpriced"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
#Step 6: Save the classifier model 
joblib.dump(clf, "models/classifier_model.pkl")
print("Classifier saved to models/classifier_model.pkl")