##### 1. Introduction
This notebook focuses on predicting real estate prices in Arizona. We will first use traditional machine learning, followed by a deep learning approach, to understand the nuances and performances of each.

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow import keras


2. Data Loading and Preprocessing


In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('scraped_data.csv')

# Convert 'SOLD DATE' to datetime format
df['SOLD DATE'] = pd.to_datetime(df['SOLD DATE'])
df['SALE_YEAR'] = df['SOLD DATE'].dt.year
df['SALE_MONTH'] = df['SOLD DATE'].dt.month

print(df.shape)

drop_columns = ["SALE TYPE", "ADDRESS", "CITY", "STATE OR PROVINCE", "SOLD DATE",
                "$/SQUARE FEET", "STATUS", "NEXT OPEN HOUSE START TIME", 
                "NEXT OPEN HOUSE END TIME", 
                "URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)", 
                "SOURCE", "MLS#", "FAVORITE", "INTERESTED"]
df = df.drop(columns=drop_columns)

# Handle missing values for numeric columns
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Handle missing values for non-numeric columns (e.g., fill with the most frequent value)
for col in df.select_dtypes(exclude=['float64', 'int64']).columns:
    mode_value = df[col].mode()
    if not mode_value.empty:
        df[col].fillna(mode_value.iloc[0], inplace=True)


# One-hot encode the PROPERTY TYPE and LOCATION columns
df = pd.get_dummies(df, columns=["PROPERTY TYPE", "LOCATION"], drop_first=True)

# Split data into training, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2

X_train = train_df.drop(columns=["PRICE"])
y_train = train_df["PRICE"]

X_val = val_df.drop(columns=["PRICE"])
y_val = val_df["PRICE"]

X_test = test_df.drop(columns=["PRICE"])
y_test = test_df["PRICE"]

(0, 29)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

3. Traditional Machine Learning Implementation (e.g., Linear Regression)


In [None]:
pd.set_option('display.max_rows', None)
columns_with_nan = X_train.columns[X_train.isnull().any()].tolist()
print(columns_with_nan)



# initialize the Linear Regression model
lr = LinearRegression()

# train the model using the training data
lr.fit(X_train, y_train)

# predict on validation set and compute metrics
y_pred_val = lr.predict(X_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
print(f"validation MAE for Linear Regression: {mae_val}")
print(f"validation MSE for Linear Regression: {mse_val}")

# predict on test set and compute metrics
y_pred_test = lr.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f"test MAE for Linear Regression: {mae_test}")
print(f"test MSE for Linear Regression: {mse_test}")



['DAYS ON MARKET']


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

4. Neural Network Implementation using Keras & TensorFlow


In [None]:
# design the neural network architecture
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)
])

# compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# train the model using the training data
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=0)

# evaluate the model on the validation and test sets
val_loss, val_mae = model.evaluate(X_val, y_val, verbose=0)
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"validation MAE for Neural Network: {val_mae}")
print(f"test MAE for Neural Network: {test_mae}")

