In [None]:
import pandas as pd
import os
import numpy as np
from datetime import date
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [96]:
# Relative path of the house price CSV; it is resolved against the kernel's
# current working directory when the file is read.
file_path = "data/House_Prices.csv"

In [97]:
# Load the CSV at `file_path` into a DataFrame.
houses = pd.read_csv(file_path)

# Print (n_rows, n_columns) as a quick sanity check on the load.
print("Shape:", houses.shape)

Shape: (10659, 13)


Unnamed: 0.1,Unnamed: 0,Record,Sale_amount,Sale_date,Beds,Baths,Sqft_home,Sqft_lot,Type,Build_year,Town,University,Type2
0,1,1,295000.0,42521,5,3.0,2020,38332.8,3,1976,1,10,3
1,2,2,240000.0,42541,4,2.0,1498,54014.4,3,2002,1,10,3
2,3,3,385000.0,42521,5,4.0,4000,85813.2,3,2001,1,10,3
3,4,4,268000.0,42472,3,2.5,2283,118918.8,3,1972,1,10,3
4,5,5,186000.0,42465,3,1.25,1527,15681.6,3,1975,1,10,3


## How many rows are there?
The dataset has 10,659 rows (and 13 columns), as reported by `houses.shape`.


In [None]:
# Preview the first five rows of the raw table.
houses.head()

In [98]:
# Build the modelling frame by removing four columns that are not used as
# features. `drop` returns a new DataFrame, so `houses` itself is unchanged.
df = houses.drop(
    labels=["Unnamed: 0", "Record", "University", "Type2"],
    axis="columns",
)
df

Unnamed: 0,Sale_amount,Sale_date,Beds,Baths,Sqft_home,Sqft_lot,Type,Build_year,Town
0,295000.0,42521,5,3.00,2020,38332.8,3,1976,1
1,240000.0,42541,4,2.00,1498,54014.4,3,2002,1
2,385000.0,42521,5,4.00,4000,85813.2,3,2001,1
3,268000.0,42472,3,2.50,2283,118918.8,3,1972,1
4,186000.0,42465,3,1.25,1527,15681.6,3,1975,1
...,...,...,...,...,...,...,...,...,...
10654,320000.0,42528,3,2.00,1870,13068.0,3,2012,50
10655,359100.0,42468,5,4.50,2119,11325.6,3,2013,50
10656,349646.0,42534,3,1.75,1949,14374.8,3,2015,50
10657,288000.0,42476,4,4.00,2710,10890.0,3,2012,50


In [99]:
## Separate the label from the predictors:
## "Town" is the target, every other column of df is a feature.
y = df["Town"]
X = df.drop("Town", axis="columns")

In [100]:
## Hold out 20% of the rows as a test set.
## The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Candidate values for the number of trees in the forest.
n_estimators_options = [50, 100, 200, 500, 1_000]
# Candidate depth limits: every odd value from 1 through 15 (stop is exclusive).
max_depth_options = range(1, 16, 2)


In [102]:
%%time
best_accuracy = 0
best_model_info = None
best_model = None


for n_estimators in n_estimators_options:
    for max_depth in max_depth_options:
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        avg_cv_scores = cv_scores.mean()
        
        model.fit(X_train, y_train)
        test_score = model.score(X_test, y_test)
        
        train_pred = model.predict(X_train) 
        test_pred = model.predict(X_test)
        
        train_accuracy = accuracy_score(y_train, train_pred)
        test_accuracy = accuracy_score(y_test, test_pred)
        
        if test_accuracy > best_accuracy:
            best_model = model
            best_model_info = {
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "test_score": test_score,
                "cv_scores": cv_scores,
                "avg_cv_scores": avg_cv_scores,
                "train_accuracy": train_accuracy,
                "test_accuracy": test_accuracy
            }

CPU times: total: 7min 7s
Wall time: 7min 51s


In [103]:
# Display the hyper-parameters and scores recorded for the model kept by the search loop.
best_model_info

{'n_estimators': 1000,
 'max_depth': 15,
 'test_score': 0.39305816135084426,
 'cv_scores': array([0.40797186, 0.41207503, 0.39120235, 0.40117302, 0.41466276]),
 'avg_cv_scores': np.float64(0.4054170032969715),
 'train_accuracy': 0.9890934678081389,
 'test_accuracy': 0.39305816135084426}

In [104]:
# One-row frame describing the house whose town is to be predicted.
# It is built from a single record; the keys become the column names,
# in the order they are written here.
lee_house = pd.DataFrame(
    [
        {
            'Sale_amount': 350000,
            'Sale_date': 43201,
            'Beds': 3,
            'Baths': 2,
            'Sqft_home': 1450,
            'Sqft_lot': 40000,
            'Type': 3,
            'Build_year': 1992,
        }
    ]
)

In [105]:
# predict() returns one label per input row; lee_house has a single row,
# so element [0] is the predicted Town for that house.
predicted_town = best_model.predict(lee_house)
print("Predicted Town: ", predicted_town[0])

Predicted Town:  15
