In [1]:
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:.3f}'.format

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from statsmodels.formula.api import ols
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

import model

In [2]:
train = pd.read_csv('train_iqr.csv')

In [3]:
# We are trying to predict Value based on baths, beds and sq. ft
y = train[['value']]
X = train[['bathrooms', 'bedrooms', 'square_feet']]

In [4]:
# Create predictions dataframe
predictions = pd.DataFrame({
    'actual': train.value,
    'baseline': train.value.mean()
})
predictions

Unnamed: 0,actual,baseline
0,0.006,0.353
1,0.896,0.353
2,-0.604,0.353
3,0.463,0.353
4,0.091,0.353
...,...,...
11986,-0.367,0.353
11987,-0.669,0.353
11988,0.176,0.353
11989,0.558,0.353


In [5]:
X_bath = train[['bathrooms']]

lm_bath = LinearRegression().fit(X_bath, y)
predictions['bath_lm'] = lm_bath.predict(X_bath)

In [6]:
X_bed = train[['bedrooms']]

lm_bed = LinearRegression().fit(X_bed, y)
predictions['bed_lm'] = lm_bed.predict(X_bed)

In [7]:
X_sf = train[['square_feet']]

lm_sf = LinearRegression().fit(X_sf, y)
predictions['sf_lm'] = lm_sf.predict(X_sf)

In [8]:
X_poly = train[['bathrooms', 'bedrooms', 'square_feet']]

lm_poly = LinearRegression().fit(X_poly, y)
predictions['poly'] = lm_poly.predict(X_poly)

In [9]:
# use RFE to select top 2
model.select_rfe(X_poly, y, 2)

['bedrooms', 'square_feet']


In [10]:
X_rfe = train[['bathrooms', 'bedrooms']]

lm_rfe = LinearRegression().fit(X_rfe, y)
predictions['rfe'] = lm_rfe.predict(X_rfe)

In [11]:
# But given the correlations, I think sq feet is more important than bedrooms
X_corr = train[['bathrooms', 'square_feet']]

lm_corr = LinearRegression().fit(X_corr, y)
predictions['corr'] = lm_corr.predict(X_corr)

In [12]:
# make an ols model with all 3
ols_model = ols('value ~ bathrooms + bedrooms + square_feet', data=train).fit()
predictions['ols'] = ols_model.predict(X)

In [13]:
predictions

Unnamed: 0,actual,baseline,bath_lm,bed_lm,sf_lm,poly,rfe,corr,ols
0,0.006,0.353,0.497,0.688,1.174,1.044,0.405,1.166,1.044
1,0.896,0.353,-0.763,-0.293,-0.214,0.002,-0.679,-0.226,0.002
2,-0.604,0.353,0.917,0.688,0.986,0.901,0.872,0.987,0.901
3,0.463,0.353,0.917,0.198,0.262,0.441,1.031,0.273,0.441
4,0.091,0.353,-0.763,-0.293,-0.835,-0.673,-0.679,-0.839,-0.673
...,...,...,...,...,...,...,...,...,...
11986,-0.367,0.353,0.497,0.688,0.456,0.263,0.405,0.457,0.263
11987,-0.669,0.353,0.077,0.198,-0.528,-0.543,0.097,-0.522,-0.543
11988,0.176,0.353,0.077,0.198,-0.073,-0.048,0.097,-0.073,-0.048
11989,0.558,0.353,0.497,0.198,-0.059,0.029,0.564,-0.052,0.029


In [14]:
# now set up the evaluation functions
# I want the RMSE of each model

In [15]:
RMSE_bl = model.RMSE(predictions.actual, predictions.baseline)
RMSE_bath = model.RMSE(predictions.actual, predictions.bath_lm)
RMSE_bed = model.RMSE(predictions.actual, predictions.bed_lm)
RMSE_sf = model.RMSE(predictions.actual, predictions.sf_lm)
RMSE_poly = model.RMSE(predictions.actual, predictions.poly)
RMSE_rfe = model.RMSE(predictions.actual, predictions.rfe)
RMSE_corr = model.RMSE(predictions.actual, predictions['corr'])
RMSE_ols = model.RMSE(predictions.actual, predictions.ols)

In [16]:
eval = pd.DataFrame(np.array(['baseline', 'bath', 'bed', 'sq_feet', 'poly', 'rfe', 'corr', 'ols']), columns=['models'])
eval['model_errors'] = np.array([RMSE_bl, RMSE_bath, RMSE_bed, RMSE_sf, RMSE_poly, RMSE_rfe, RMSE_corr, RMSE_ols])

eval.sort_values(by='model_errors')

Unnamed: 0,models,model_errors
4,poly,1.332
7,ols,1.332
6,corr,1.352
3,sq_feet,1.352
5,rfe,1.439
1,bath,1.443
2,bed,1.613
0,baseline,1.678


In [17]:
# This means that the model made using all 3 values performs the best
# Also, all models performed better than the baseline