In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read redfin_data_cleaned.csv (path resolved relative to the current working directory) into df.
df = pd.read_csv(filepath_or_buffer="redfin_data_cleaned.csv")

In [None]:
# df1: rows of df whose PRICE is strictly above the 5th-percentile price
# and strictly below $1,000,000.
df1 = df[df["PRICE"].gt(df["PRICE"].quantile(0.05)) & df["PRICE"].lt(1000000)]

# df2: only the df1 columns relevant for model building
# (seven feature columns first, the PRICE target last).
df2 = df1[
    [
        "CITY",
        "MONTH",
        "BEDROOMS",
        "BATHROOMS",
        "SQFT",
        "LOTSIZE",
        "YEARBUILT",
        "PRICE",
    ]
]

In [None]:
# Remove price outliers: keep only rows whose PRICE z-score has absolute value below 3.
from scipy import stats

z_scores = stats.zscore(df2.PRICE)
abs_z_scores = np.abs(z_scores)

# assign() builds a new frame instead of writing a column into df2 in place.
# df2 is a selection taken from df1, so the in-place write triggered pandas'
# SettingWithCopyWarning; rebinding df2 to the new frame avoids that while
# still leaving a "z_scores" column on df2.
df2 = df2.assign(z_scores=abs_z_scores)

# .copy() makes df_new an independent frame, so later cells can overwrite its
# columns without modifying (or warning about) a view of df2.
df_new = df2[df2["z_scores"] < 3].copy()

In [None]:
# Transform the categorical CITY and MONTH columns of df_new into integer labels.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode df_new's own columns. Encoding df1's columns (as before) produces one
# label per df1 row, which no longer matches df_new's length once the outlier
# filter has dropped any row, and the assignment then fails with a length mismatch.
# assign() returns a new frame, so nothing is written into a slice of df2.
# NOTE: the single encoder is refit for each column, so after this cell `le`
# holds the MONTH classes only.
df_new = df_new.assign(
    CITY=le.fit_transform(df_new.CITY),
    MONTH=le.fit_transform(df_new.MONTH),
)

In [None]:
# Feature matrix: the first seven columns of df_new, as a NumPy array.
X = df_new.iloc[:, :7].to_numpy()

# Target: the PRICE column (kept as a pandas Series).
y = df_new["PRICE"]

In [None]:
# One-hot encode feature columns 0 and 1 so the model does not read an ordering
# into the integer labels of nominal categories; the remaining columns pass
# through unchanged.
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories="auto"), [0, 1])],
    remainder='passthrough',
)

# NOTE: this cell overwrites X with its encoded form, so it must run exactly
# once after X is created; re-running it would encode the already-encoded matrix.
X = ct.fit_transform(X)

# ColumnTransformer returns a sparse matrix only when the stacked result is
# sparse enough (see its sparse_threshold parameter); otherwise it already
# returns a dense ndarray, which has no .toarray(). Convert only when needed.
if sparse.issparse(X):
    X = X.toarray()

In [None]:
# Split the data into training (80%) and test (20%) sets.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and therefore every score and plot
# computed from these splits, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Linear regression: fit on the training split, then display the mean
# 10-fold cross-validated score (negative mean absolute error) on that split.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lm = LinearRegression().fit(X_train, y_train)

cross_val_score(
    estimator=lm,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="neg_mean_absolute_error",
).mean()

In [None]:
# Linear regression: predictions on the test split vs. the actual values.
Y_pred = lm.predict(X_test)

import seaborn as sns

# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally. Labels and a title are set so the figure stands on its own.
sns.scatterplot(x=y_test, y=Y_pred, alpha=0.5).set(
    xlabel="Actual price",
    ylabel="Predicted price",
    title="Linear regression: predicted vs. actual",
)

# Score of the fitted model on the held-out test split (R^2 for regressors).
print(lm.score(X_test, y_test))

In [None]:
# Random forest regressor: fit on the training split, then display the mean
# 10-fold cross-validated score (negative mean absolute error) on that split.
from sklearn.ensemble import RandomForestRegressor

# The forest is built with random bootstrapping/feature sampling; a fixed
# random_state makes the fitted model and its scores reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

np.mean(cross_val_score(rf, X_train, y_train, scoring="neg_mean_absolute_error", cv=10))

In [None]:
# Random forest regressor: predictions on the test split vs. the actual values.
r_pred = rf.predict(X_test)

# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally. Labels and a title are set so the figure stands on its own.
sns.scatterplot(x=y_test, y=r_pred, alpha=0.5).set(
    xlabel="Actual price",
    ylabel="Predicted price",
    title="Random forest: predicted vs. actual",
)

# Score of the fitted model on the held-out test split (R^2 for regressors).
print(rf.score(X_test, y_test))

In [None]:
# Gradient boosting (XGBoost) regressor: fit on the training split, then display
# the mean 10-fold cross-validated score (negative mean absolute error) on that split.
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# verbosity=0 silences XGBoost's log output.
xgbr = xgb.XGBRegressor(verbosity=0)
xgbr.fit(X_train, y_train)

cross_val_score(
    estimator=xgbr,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="neg_mean_absolute_error",
).mean()

In [None]:
# Gradient boosting regressor: predictions on the test split vs. the actual values.
x_pred = xgbr.predict(X_test)

# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally. Labels and a title are set so the figure stands on its own.
sns.scatterplot(x=y_test, y=x_pred, alpha=0.5).set(
    xlabel="Actual price",
    ylabel="Predicted price",
    title="Gradient boosting: predicted vs. actual",
)

# Score the gradient boosting model itself; this cell previously printed the
# random forest's score (rf.score) by mistake.
print(xgbr.score(X_test, y_test))

In [None]:
# Compare the models: mean absolute error of each model's test-split predictions.
from sklearn.metrics import mean_absolute_error

for _label, _predictions in (
    ("Linear Regression: ", Y_pred),
    ("Random Forest Regressor: ", r_pred),
    ("Gradient Boosting Regressor: ", x_pred),
):
    print(_label, mean_absolute_error(y_test, _predictions))