In [None]:
import pandas as pd
import os
import numpy as np
from datetime import date
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 1. Data Processing


In [None]:
# Location of the house-price CSV, relative to the directory the notebook is run from.
file_path = "data/House_Prices.csv"

In [None]:
# Load the raw table from the CSV path set in the configuration cell.
houses = pd.read_csv(file_path)

# Report (rows, columns), then show the first five records as the cell output.
print("Shape:", houses.shape)
houses.head()


#### How many rows are there?
- There are 10659 rows in total.

#### Looks like there are some extra columns with row markers that appeared over the cleaning process.  How many "actual" columns are there?
- Without counting the "Unnamed: 0" column, there are 12 actual columns.

#### Why are the dates just numbers?  Why is this ok?
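- In Excel, dates are stored as **serial numbers** so that it is easier to perform calculations with them.
- These numbers count days from a fixed start date (**January 1, 1900** is day 1); for example, 43201 corresponds to 2018-04-11.
- This is fine for modeling because the numbers keep the dates in chronological order, and the models only need numeric inputs.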

In [None]:
# Drop the leftover row-marker column and the columns that are not used for modeling.
df = houses.drop(columns=["Unnamed: 0", "Record", "University", "Type2"])
# Preview the first rows only instead of rendering the entire frame.
df.head()

# 2. Create Classification Models


In [None]:
## Split the frame into the target ("Town") and the remaining feature columns
y = df["Town"]
X = df.drop(columns=["Town"])

In [None]:
## Train, test split
# Hold out 20% of the rows for testing; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree Classification Model


In [None]:
def optimize_decision_tree(X_train, y_train, X_test, y_test):
    """Grid-search a DecisionTreeClassifier and keep the tree with the best test accuracy.

    Every combination of ``max_depth`` (1 through 15), ``min_samples_split``
    (2, 5, 10, 20) and ``criterion`` ('gini', 'entropy') is scored with 5-fold
    cross-validation on the training data, then fitted on the full training
    set and evaluated on both the training and test splits.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels; candidates are ranked on this split.

    Returns
    -------
    tuple
        ``(best_model, best_model_info)``. ``best_model`` is the fitted tree
        with the highest test accuracy (the first one found wins ties) and
        ``best_model_info`` is a dict holding its hyperparameters and scores.
        Both are ``None`` when no candidate scores above 0 on the test split.

    Notes
    -----
    Because the winner is chosen on the test split, the reported test
    accuracy is an optimistic estimate of performance on new data.
    """
    # Explore 3 different hyperparameters. The grid is listed in search order:
    # depth changes slowest, criterion fastest.
    candidate_grid = [
        (tree_depth, split_floor, split_rule)
        for tree_depth in range(1, 16)
        for split_floor in [2, 5, 10, 20]
        for split_rule in ['gini', 'entropy']
    ]

    winner = None
    winner_summary = None
    top_test_accuracy = 0

    for tree_depth, split_floor, split_rule in candidate_grid:
        tree = DecisionTreeClassifier(
            max_depth=tree_depth,
            random_state=42,
            min_samples_split=split_floor,
            criterion=split_rule,
        )

        # Cross-validate on the training data first, then fit on all of it.
        fold_scores = cross_val_score(tree, X_train, y_train, cv=5)
        mean_fold_score = fold_scores.mean()
        tree.fit(X_train, y_train)

        # Accuracy on the data the tree was fitted on and on the held-out split.
        fit_accuracy = accuracy_score(y_train, tree.predict(X_train))
        holdout_accuracy = accuracy_score(y_test, tree.predict(X_test))

        # Strict comparison: an equal score never displaces the current winner.
        if holdout_accuracy > top_test_accuracy:
            top_test_accuracy = holdout_accuracy
            winner = tree
            winner_summary = {
                "min_samples_split": split_floor,
                "depth": tree_depth,
                "criteria": split_rule,
                "cv_scores": fold_scores,
                "avg_cv_scores": mean_fold_score,
                "train_accuracy": fit_accuracy,
                "test_accuracy": holdout_accuracy,
            }

    return winner, winner_summary

### Note:
- I took this code from our previous assignment and adjusted it a little bit in order to find the best possible Decision Tree model

# Random Forest Classification Model


In [None]:
def optimize_rf(X_train, y_train, X_test, y_test):
    """Grid-search a RandomForestClassifier and return the candidate with the best test accuracy.

    Every combination of ``n_estimators`` (50, 100, 200, 500) and
    ``max_depth`` (1, 3, ..., 15) is scored with 5-fold cross-validation on
    the training data, then fitted on the full training set and evaluated on
    both the training and test splits.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels; candidates are ranked on this split.

    Returns
    -------
    tuple
        ``(best_model, best_model_info)``. ``best_model`` is the fitted forest
        with the highest test accuracy (the first one found wins ties) and
        ``best_model_info`` is a dict with the keys ``n_estimators``,
        ``max_depth``, ``test_score``, ``cv_scores``, ``avg_cv_scores``,
        ``train_accuracy`` and ``test_accuracy``. Both are ``None`` when no
        candidate scores above 0 on the test split.

    Notes
    -----
    Because the winner is chosen on the test split, the reported test
    accuracy is an optimistic estimate of performance on new data.
    """
    # Hyperparameter grid
    n_estimators_options = [50, 100, 200, 500]
    max_depth_options = range(1, 16, 2)

    best_accuracy = 0
    best_model_info = None
    best_model = None

    # Try out different params to find best model
    for n_estimators in n_estimators_options:
        for max_depth in max_depth_options:
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

            # Cross-validate on the training data first, then fit on all of it.
            cv_scores = cross_val_score(model, X_train, y_train, cv=5)
            avg_cv_scores = cv_scores.mean()
            model.fit(X_train, y_train)

            # Accuracy on the data the forest was fitted on and on the held-out split.
            train_accuracy = accuracy_score(y_train, model.predict(X_train))
            test_accuracy = accuracy_score(y_test, model.predict(X_test))

            if test_accuracy > best_accuracy:
                # Raise the bar. Without this update every later candidate with a
                # non-zero score would overwrite the winner, and the function
                # would return the last candidate instead of the best one.
                best_accuracy = test_accuracy
                best_model = model
                best_model_info = {
                    "n_estimators": n_estimators,
                    "max_depth": max_depth,
                    # A classifier's score() is its mean accuracy, i.e. the same
                    # number as test_accuracy; the key is kept for existing readers.
                    "test_score": test_accuracy,
                    "cv_scores": cv_scores,
                    "avg_cv_scores": avg_cv_scores,
                    "train_accuracy": train_accuracy,
                    "test_accuracy": test_accuracy
                }

    # Return best model and its info
    return best_model, best_model_info

In [1]:
%%time
# Run the random-forest grid search; the cell defining optimize_rf must have been run first.
best_model_rf, best_model_info_rf = optimize_rf(X_train, y_train, X_test, y_test)
# Display the winning candidate's hyperparameters and scores.
best_model_info_rf

NameError: name 'optimize_rf' is not defined

In [None]:
%%time
# Run the decision-tree grid search; the cell defining optimize_decision_tree must have been run first.
best_model_dc, best_model_info_dc = optimize_decision_tree(X_train, y_train, X_test, y_test)
# Display the winning candidate's hyperparameters and scores.
best_model_info_dc

### Model Explanation
For both models, I picked a reasonable range of options for each hyperparameter and looped through every combination to find the best model for each classification algorithm. After optimizing both the **Decision Tree** and **Random Forest** models, I compared their performance and decided to use **Random Forest** to predict the town of Lee's house.
#### Decision Tree:
- **Train Accuracy**: 0.56
- **Test Accuracy**: 0.29
- **Average Cross-Validation Score**: 0.39

#### Random Forest:
- **Train Accuracy**: 0.99
- **Test Accuracy**: 0.40
- **Average Cross-Validation Score**: 0.40

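After comparing the performance of both models, I picked **Random Forest** as my model for the prediction because its test accuracy is about 11 percentage points higher than that of the **Decision Tree** (0.40 vs. 0.29), while their average cross-validation scores are nearly the same (0.40 vs. 0.39).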

#### Was your testing accuracy as good as your training?  What did you think happened?
The Random Forest model achieved a **training accuracy of 0.99** but a much lower **testing accuracy of 0.40**, indicating **overfitting**. This means the model memorized the training data instead of learning general patterns, leading to poor performance on unseen data. The average **cross-validation score of 0.40** confirms this issue.

# 3. Making Prediction


## Lee's House
Lee purchased a 1,450 sq ft Single Family home (coded as 3) on 2018-04-11 (43201) for $350,000. The house has 3 bedrooms and 2 baths. It was built in 1992 and is on a 40,000 square foot lot. What town do you think it is in? Use your model to predict.

In [None]:
# Lee's house as a one-row DataFrame, one column per model feature
lee_house = pd.DataFrame([
    {
        'Sale_amount': 350000,
        'Sale_date': 43201,
        'Beds': 3,
        'Baths': 2,
        'Sqft_home': 1450,
        'Sqft_lot': 40000,
        'Type': 3,
        'Build_year': 1992,
    }
])

In [None]:
# Using Random Forest to predict
# NOTE(review): lee_house is passed to predict() as-is; presumably its columns must match
# the training features X in name and order — confirm against the CSV header.
predicted_town = best_model_rf.predict(lee_house)
# lee_house holds a single row, so the first element is the prediction for Lee's house.
print("Predicted Town: ", predicted_town[0])