In [1]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from statsmodels.formula.api import ols
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

import model

In [6]:
# Target: read the training file and keep only `value`, as a one-column DataFrame.
# NOTE(review): the features are loaded from 'train_iqr.csv' in a later cell —
# confirm both files hold the same rows in the same order.
y = pd.read_csv('train.csv')[['value']]

In [8]:
# Goal: predict `value` from the bathrooms, bedrooms and square_feet columns
train = pd.read_csv('train_iqr.csv')
predictor_columns = ['bathrooms', 'bedrooms', 'square_feet']
X = train[predictor_columns]

In [9]:
# Results table: the observed target next to a naive baseline that always
# predicts the mean of the target. Each model adds its own column later on.
predictions = (
    pd.DataFrame({'actual': y['value']})
    .assign(baseline=y['value'].mean())
)
predictions

Unnamed: 0,actual,baseline
0,386909.000,540348.743
1,780000.000,540348.743
2,117583.000,540348.743
3,588770.000,540348.743
4,424532.000,540348.743
...,...,...
11986,221876.000,540348.743
11987,88594.000,540348.743
11988,461938.000,540348.743
11989,630544.000,540348.743


In [10]:
# Single-feature linear model: value ~ bathrooms
X_bath = train[['bathrooms']]

lm_bath = LinearRegression()
lm_bath.fit(X_bath, y)
# Predictions are made on the same rows the model was fit on
predictions['bath_lm'] = lm_bath.predict(X_bath)

In [11]:
# Single-feature linear model: value ~ bedrooms
X_bed = train[['bedrooms']]

lm_bed = LinearRegression()
lm_bed.fit(X_bed, y)
# Predictions are made on the same rows the model was fit on
predictions['bed_lm'] = lm_bed.predict(X_bed)

In [12]:
# Single-feature linear model: value ~ square_feet
X_sf = train[['square_feet']]

lm_sf = LinearRegression()
lm_sf.fit(X_sf, y)
# Predictions are made on the same rows the model was fit on
predictions['sf_lm'] = lm_sf.predict(X_sf)

In [13]:
# Three-feature linear model: value ~ bathrooms + bedrooms + square_feet.
# Despite the `poly` label, no polynomial terms are generated in this cell:
# it is a plain LinearRegression on the three raw columns.
X_poly = train[['bathrooms', 'bedrooms', 'square_feet']]

lm_poly = LinearRegression()
lm_poly.fit(X_poly, y)
predictions['poly'] = lm_poly.predict(X_poly)

In [14]:
# Use RFE to select the top 2 of the three candidate features.
# `model.select_rfe` is a project helper; presumably it wraps sklearn's RFE and
# returns the kept column names — TODO confirm against model.py.
# The recorded output of this cell is ['bedrooms', 'square_feet'].
model.select_rfe(X_poly, y, 2)

['bedrooms', 'square_feet']


In [16]:
# Two-feature linear model using the pair reported by the RFE step:
# value ~ bedrooms + square_feet
X_rfe = train[['bedrooms', 'square_feet']]

lm_rfe = LinearRegression()
lm_rfe.fit(X_rfe, y)
predictions['rfe'] = lm_rfe.predict(X_rfe)

In [17]:
# Alternative two-feature model: going by the correlations, bathrooms looks
# more important than bedrooms, so try value ~ bathrooms + square_feet
X_corr = train[['bathrooms', 'square_feet']]

lm_corr = LinearRegression()
lm_corr.fit(X_corr, y)
predictions['corr'] = lm_corr.predict(X_corr)

In [22]:
# Fit the same three-feature model with the statsmodels formula API.
# The formula needs features and target in one frame, so attach `value`
# to the feature table by index first.
ols_train = train.join(y)
ols_model = ols(
    formula='value ~ bathrooms + bedrooms + square_feet',
    data=ols_train,
).fit()
predictions['ols'] = ols_model.predict(X)

In [23]:
# Show the results table now that every model has added its prediction column
predictions

Unnamed: 0,actual,baseline,bath_lm,bed_lm,sf_lm,poly,rfe,corr,ols
0,386909.000,540348.743,603893.417,688320.114,902648.361,845332.535,877236.031,899164.903,845332.535
1,780000.000,540348.743,47173.361,254923.752,289533.670,385061.410,421183.598,284273.322,385061.410
2,117583.000,540348.743,789466.769,688320.114,819663.795,782389.631,778946.433,820396.336,782389.631
3,588770.000,540348.743,789466.769,471621.933,500084.935,579163.012,535497.115,504847.524,579163.012
4,424532.000,540348.743,47173.361,254923.752,15419.758,86678.163,96514.235,13616.122,86678.163
...,...,...,...,...,...,...,...,...,...
11986,221876.000,540348.743,603893.417,688320.114,585717.945,500341.921,501853.418,586231.136,500341.921
11987,88594.000,540348.743,418320.065,471621.933,150931.789,144318.951,121948.860,153758.325,144318.951
11988,461938.000,540348.743,418320.065,471621.933,351772.094,362941.137,359830.600,352065.935,362941.137
11989,630544.000,540348.743,603893.417,471621.933,357951.796,397056.873,367150.038,361337.231,397056.873


In [14]:
# now set up the evaluation functions
# I want the RMSE of each model

In [24]:
# Score every prediction column against the actual values with the project's
# `model.RMSE` helper, in the same order as the columns were created.
# Columns are looked up with [] because a name such as 'corr' collides with a
# DataFrame method and cannot be reached by attribute access.
(
    RMSE_bl, RMSE_bath, RMSE_bed, RMSE_sf,
    RMSE_poly, RMSE_rfe, RMSE_corr, RMSE_ols,
) = [
    model.RMSE(predictions['actual'], predictions[column])
    for column in ('baseline', 'bath_lm', 'bed_lm', 'sf_lm', 'poly', 'rfe', 'corr', 'ols')
]

In [25]:
# Summary table: one row per model with its RMSE, displayed best (lowest) first.
# The errors are computed on the same data the models were fit on.
# The frame is deliberately NOT named `eval`: binding that name would shadow
# Python's built-in eval() for every cell executed afterwards in this kernel.
model_eval = pd.DataFrame({
    'models': ['baseline', 'bath', 'bed', 'sq_feet', 'poly', 'rfe', 'corr', 'ols'],
    'model_errors': [RMSE_bl, RMSE_bath, RMSE_bed, RMSE_sf, RMSE_poly, RMSE_rfe, RMSE_corr, RMSE_ols],
})

model_eval.sort_values(by='model_errors')

Unnamed: 0,models,model_errors
4,poly,588589.536
7,ols,588589.536
5,rfe,589249.087
6,corr,597338.495
3,sq_feet,597347.745
1,bath,637657.044
2,bed,712699.608
0,baseline,741315.642


In [17]:
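# This means that the model fit on all 3 features performs the best
# ('poly' and 'ols' are the same three-feature fit, hence their identical RMSE)
# Also, all models performed better than the baseline (RMSE measured on the training data)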