# <span style="color:rgb(255, 0, 255)">VOL 2.</span> 

# <span style="color:rgb(255, 0, 255)">This document contains the different tries we made regarding feature engineering and log transforming</span> 

Here we only have the code related to machine learning model tries. If you want to review all the steps we took please review the final notebook.

### <span style="color:rgb(255, 0, 255)">--- Import the necessary libraries</span>

In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import matplotlib.ticker as mk
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import os #we will use the function listdir to list files in a folder
import math #to apply absolute value

### <span style="color:rgb(255, 0, 255)">--- Function we use for modeling results</span>


In [2]:
def modeling(y, X, models=[], test_size=0.25):
    for model in models:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        model.fit(X_train, y_train )
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)
        # r2_adj = 1 - (1-r2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
        r2_adj =  1 - (1-model.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)
        RMSE = mean_squared_error(y_test, predictions, squared=False)
        MSE = mean_squared_error(y_test, predictions)
        MAE = mean_absolute_error(y_test, predictions)
        print(model, 'metrics are: '), print("R2 =", r2), print("R2 adjusted =", r2_adj), print("RMSE =", round(RMSE,2)), print("MSE =", round(MSE,2)), print("MAE =", round(MAE,2))
    return predictions, y_test, r2

### <span style="color:rgb(255, 0, 255)">--- Load the database</span>

In [3]:
df = pd.read_csv('df6_outliers_best.csv')
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,year_bought
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,2014
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,2014
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,2015
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,2014
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,2015


# 🆙 With a lot of feature engineering


#### It slightly improved the model

In [4]:
df["yr_diff_renovated"] = df["yr_renovated"] - df["yr_built"]
df["yr_diff_bought"] = df["year_bought"] - df["yr_built"]

In [5]:
df["sqrt_diff"] = df["sqft_lot"] - df["sqft_living"]
df["sqrt_diff_15"] = df["sqft_lot15"] - df["sqft_living15"]

In [6]:
df["yr_diff_renovated"]= df["yr_diff_renovated"].apply(lambda x: x if x>0 else 0)
df["yr_renovated_binary"] = df["yr_diff_renovated"].apply(lambda x: 1 if x>0 else 0)
df["basement_binary"] = df["sqft_basement"].apply(lambda x: 1 if x>0 else 0)

In [7]:
#encoding zipcode
def zip_to_binary(zipcode):
    if str(zipcode).startswith('980'):
        return 0
    elif str(zipcode).startswith('981'):
        return 1
    else:
        return None

df['zip_binary'] = df['zipcode'].apply(zip_to_binary)

df.to_csv('df_fea_eng.csv', index=False) # this is the point where we will start the following tries

In [8]:
X_num = df.drop( columns = ["price"], axis = 1)
transformer = MinMaxScaler().fit(X_num)
X_num_minmax = transformer.transform(X_num)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 28)

In [9]:
X = X_num_normalized.copy()
Y = df["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [10]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7233795715766621
R2 adjusted = 0.7190654375067649
RMSE = 155704.04
MSE = 24243748060.76
MAE = 105401.21
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7269789241684084
R2 adjusted = 0.7583170688640712
RMSE = 154687.72
MSE = 23928291252.63
MAE = 92648.44
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.8122993682166718
R2 adjusted = 0.8929474441947544
RMSE = 128259.82
MSE = 16450581230.53
MAE = 76736.98
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8717606650253712
R2 adjusted = 0.9174262993160112
RMSE = 106015.26
MSE = 11239235461.84
MAE = 64892.12


# 🆙 With log transformation


#### Applying log transformation in the columns that were more skewed slightly improved the model

In [11]:
df1 = pd.read_csv('df_fea_eng.csv')
df1.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,year_bought,yr_diff_renovated,yr_diff_bought,sqrt_diff,sqrt_diff_15,yr_renovated_binary,basement_binary,zip_binary
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,2014,0,59,4470,4310,0,0,1
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,2014,40,63,4672,5949,1,1,1
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,2015,0,82,9230,5342,0,0,0
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,2014,0,49,3040,3640,0,1,1
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,2015,0,28,6400,5703,0,0,0


In [12]:
# sqft_living, sqft_above, sqft_living15
df1["sqft_living_log"] = np.log((df1.sqft_living))
df1["sqft_above_log"] = np.log((df1.sqft_above))
df1["sqft_living15_log"] = np.log((df1.sqft_living15))

In [13]:
df1.to_csv('df1_fea_eng_log.csv', index=False) # this is the point where we will start the following tries

In [14]:
X_num = df1.drop( columns = ["price"], axis = 1)
transformer = MinMaxScaler().fit(X_num)
X_num_minmax = transformer.transform(X_num)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 31)

In [15]:
X = X_num_normalized.copy()
Y = df1["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [16]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.734757543931849
R2 adjusted = 0.7374227741378636
RMSE = 152468.2
MSE = 23246552384.39
MAE = 103301.55
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7414432431478215
R2 adjusted = 0.7744587734763055
RMSE = 150534.38
MSE = 22660599971.81
MAE = 89829.45
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.8160404597604574
R2 adjusted = 0.8937661929342473
RMSE = 126975.2
MSE = 16122702044.68
MAE = 76598.69
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8707967275418593
R2 adjusted = 0.9176000850909983
RMSE = 106412.96
MSE = 11323717499.66
MAE = 64792.99


# 🆙Treating numericals binary as categoricals and one hot encoding them


#### As for now we have treated some features that are binomial as numericals discrete. What if we treat them as categoricals and do a one-hot encoding?

#### Answer: it slightly improves the code

In [17]:
df2 = pd.read_csv('df1_fea_eng_log.csv')
df2.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,year_bought,yr_diff_renovated,yr_diff_bought,sqrt_diff,sqrt_diff_15,yr_renovated_binary,basement_binary,zip_binary,sqft_living_log,sqft_above_log,sqft_living15_log
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,2014,0,59,4470,4310,0,0,1,7.07327,7.07327,7.200425
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,2014,40,63,4672,5949,1,1,1,7.851661,7.682482,7.432484
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,2015,0,82,9230,5342,0,0,0,6.646391,6.646391,7.908387
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,2014,0,49,3040,3640,0,1,1,7.5807,6.956545,7.21524
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,2015,0,28,6400,5703,0,0,0,7.426549,7.426549,7.495542


In [18]:
# select the columns we want to encode
df2_binary = df2[["waterfront", "year_bought", "yr_renovated_binary", "basement_binary", "zip_binary"]]
df2_binary.head()

Unnamed: 0,waterfront,year_bought,yr_renovated_binary,basement_binary,zip_binary
0,0,2014,0,0,1
1,0,2014,1,1,1
2,0,2015,0,0,0
3,0,2014,0,1,1
4,0,2015,0,0,0


In [19]:
waterfront_enc = df2['waterfront']
waterfront_enc_dumm = pd.get_dummies(waterfront_enc, drop_first=False)
waterfront_enc_dumm

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
18618,1,0
18619,1,0
18620,1,0
18621,1,0


In [20]:
year_bought_enc = df2['year_bought']
year_bought_enc_dumm = pd.get_dummies(year_bought_enc, drop_first=False)
year_bought_enc_dumm

Unnamed: 0,2014,2015
0,1,0
1,1,0
2,0,1
3,1,0
4,0,1
...,...,...
18618,1,0
18619,0,1
18620,1,0
18621,0,1


In [21]:
yr_renovated_binary_enc = df2['yr_renovated_binary']
yr_renovated_binary_enc_dumm = pd.get_dummies(yr_renovated_binary_enc, drop_first=False)
yr_renovated_binary_enc_dumm

Unnamed: 0,0,1
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
...,...,...
18618,1,0
18619,1,0
18620,1,0
18621,1,0


In [22]:
basement_binary_enc = df2['basement_binary']
basement_binary_enc_dumm = pd.get_dummies(basement_binary_enc, drop_first=False)
basement_binary_enc_dumm

Unnamed: 0,0,1
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
18618,1,0
18619,1,0
18620,1,0
18621,1,0


In [23]:
zip_binary_enc = df2['zip_binary']
zip_binary_enc_dumm = pd.get_dummies(zip_binary_enc, drop_first=False)
zip_binary_enc_dumm

Unnamed: 0,0,1
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
18618,0,1
18619,0,1
18620,0,1
18621,1,0


In [24]:
df2_binary_encoded = pd.concat([waterfront_enc_dumm, year_bought_enc_dumm, yr_renovated_binary_enc_dumm, basement_binary_enc_dumm, zip_binary_enc_dumm], axis=1)
df2_binary_encoded.head()

Unnamed: 0,0,1,2014,2015,0.1,1.1,0.2,1.2,0.3,1.3
0,1,0,1,0,1,0,1,0,0,1
1,1,0,1,0,0,1,0,1,0,1
2,1,0,0,1,1,0,1,0,1,0
3,1,0,1,0,1,0,0,1,0,1
4,1,0,0,1,1,0,1,0,1,0


In [25]:
# Concat the one-hot encoded columns with the original dataframe
df2 = pd.concat([df2, df2_binary_encoded], axis=1)

In [26]:
#Drop the original categorical columns
df2 = df2.drop(df2_binary, axis=1)
df2.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,yr_diff_renovated,yr_diff_bought,sqrt_diff,sqrt_diff_15,sqft_living_log,sqft_above_log,sqft_living15_log,0,1,2014,2015,0.1,1.1,0.2,1.2,0.3,1.3
0,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,0,59,4470,4310,7.07327,7.07327,7.200425,1,0,1,0,1,0,1,0,0,1
1,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,40,63,4672,5949,7.851661,7.682482,7.432484,1,0,1,0,0,1,0,1,0,1
2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,0,82,9230,5342,6.646391,6.646391,7.908387,1,0,0,1,1,0,1,0,1,0
3,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,0,49,3040,3640,7.5807,6.956545,7.21524,1,0,1,0,1,0,0,1,0,1
4,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,0,28,6400,5703,7.426549,7.426549,7.495542,1,0,0,1,1,0,1,0,1,0


df2.to_csv('df2_fea_eng_log_enc.csv', index=False) # this is the point where we will start the following tries

In [27]:
X_num = df2.drop( columns = ["price"], axis = 1)
transformer = MinMaxScaler().fit(X_num)
X_num_minmax = transformer.transform(X_num)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 36)

In [28]:
X = X_num_normalized.copy()
Y = df2["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [29]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7352034238037304
R2 adjusted = 0.7374580725777847
RMSE = 152340.0
MSE = 23207474289.75
MAE = 103113.21
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7401444972439519
R2 adjusted = 0.7718073735003297
RMSE = 150911.98
MSE = 22774425507.65
MAE = 89975.35
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.8137236883783419
R2 adjusted = 0.8932197445835817
RMSE = 127772.26
MSE = 16325750033.66
MAE = 77071.41
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8742026013417905
R2 adjusted = 0.9184751648072851
RMSE = 105001.04
MSE = 11025217686.03
MAE = 64734.31


# 🟧Bucketing some numerical discrete features


#### Now let's try to reduce some discrete numerical features that look a little bit odd and see how this impacts in the model
#### ANSWER: It was worse

In [30]:
df3 = pd.read_csv('df2_fea_eng_log_enc.csv')
df3.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,yr_diff_renovated,yr_diff_bought,sqrt_diff,sqrt_diff_15,sqft_living_log,sqft_above_log,sqft_living15_log,0,1,2014,2015,0.1,1.1,0.2,1.2,0.3,1.3
0,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,0,59,4470,4310,7.07327,7.07327,7.200425,1,0,1,0,1,0,1,0,0,1
1,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,40,63,4672,5949,7.851661,7.682482,7.432484,1,0,1,0,0,1,0,1,0,1
2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,0,82,9230,5342,6.646391,6.646391,7.908387,1,0,0,1,1,0,1,0,1,0
3,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,0,49,3040,3640,7.5807,6.956545,7.21524,1,0,1,0,1,0,0,1,0,1
4,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,0,28,6400,5703,7.426549,7.426549,7.495542,1,0,0,1,1,0,1,0,1,0


In [31]:
#Inspired by Luis's Code
#Check unique values for each value
for column in df3.columns:
    print('─' * 10)
    print("This feature ", '\033[1m' + column + '\033[0m' ," has ", df[column].nunique(), " categories \n The single values are: ", df[column].unique(),"\n" )
    print("Here the detail: \n" , df[column].value_counts())
    print("\n\n")

──────────
This feature  [1mbedrooms[0m  has  12  categories 
 The single values are:  [ 3  2  4  5  1  6  7  8  9 11 10 33] 

Here the detail: 
 3     8591
4     5757
2     2535
5     1301
6      221
1      167
7       28
8       12
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64



──────────
This feature  [1mbathrooms[0m  has  26  categories 
 The single values are:  [1.   2.25 3.   2.   1.5  2.5  1.75 2.75 3.25 3.5  0.75 4.25 3.75 4.
 1.25 4.5  5.   0.5  5.25 4.75 6.   6.75 7.5  5.5  5.75 6.5 ] 

Here the detail: 
 2.50    4586
1.00    3549
1.75    2653
2.25    1748
2.00    1716
1.50    1321
2.75     982
3.00     624
3.50     576
3.25     448
3.75     114
4.00      90
4.50      69
4.25      50
0.75      47
4.75      15
1.25       9
5.00       8
5.25       6
0.50       4
5.50       3
6.00       1
6.75       1
7.50       1
5.75       1
6.50       1
Name: bathrooms, dtype: int64



──────────
This feature  [1msqft_living[0m  has  880  categories 
 The si

KeyError: 'sqft_living_log'

we are going for: bathrooms and floors

In [32]:
df3["bathrooms"].unique()

array([1.  , 2.25, 3.  , 2.  , 1.5 , 2.5 , 1.75, 2.75, 3.25, 3.5 , 0.75,
       4.25, 3.75, 4.  , 1.25, 4.5 , 5.  , 0.5 , 5.25, 4.75, 6.  , 6.75,
       7.5 , 5.5 , 5.75, 6.5 ])

In [33]:
# Create the mapping dictionary
bucket_map = {1.: 1, 2.25: 2, 3.: 3, 2.: 2, 1.5: 1, 2.5: 2, 1.75: 2, 2.75: 3, 3.25: 3, 3.5: 3, 0.75: 1, 4.25: 4, 3.75: 4, 4.: 4, 1.25: 1, 4.5: 4, 5.: 5, 0.5: 0, 5.25: 5, 4.75: 5, 6.: 6, 6.75: 7, 7.5: 7, 5.5: 5, 5.75: 6, 6.5: 6}

# Apply the mapping to the "bathrooms" column and store the result in a new column "bucket_bathrooms"
df3['bucket_bathrooms'] = [bucket_map[value] for value in df['bathrooms']]
df3['bucket_bathrooms'].isna().sum()

0

In [34]:
df3['bucket_bathrooms'].unique()

array([1, 2, 3, 4, 5, 0, 6, 7], dtype=int64)

In [35]:
# Create the mapping dictionary
floor_map = {1.0: 1, 2.0: 2, 1.5: 1, 3.0: 3, 2.5: 2, 3.5: 3}

# Apply the mapping to the "floors" column and store the result in a new column "bucket_floors"
df3['bucket_floors'] = [floor_map[value] for value in df['floors']]
df3['bucket_floors'].isna().sum()

0

In [36]:
df3['bucket_floors'].unique()

array([1, 2, 3], dtype=int64)

In [42]:
df3 = df3.drop(columns = ["bathrooms", "floors"], axis = 1)

In [43]:
df3.head()

Unnamed: 0,bedrooms,sqft_living,sqft_lot,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,yr_diff_renovated,yr_diff_bought,sqrt_diff,sqrt_diff_15,sqft_living_log,sqft_above_log,sqft_living15_log,0,1,2014,2015,0.1,1.1,0.2,1.2,0.3,1.3,bucket_bathrooms,bucket_floors
0,3,1180,5650,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,0,59,4470,4310,7.07327,7.07327,7.200425,1,0,1,0,1,0,1,0,0,1,1,1
1,3,2570,7242,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,40,63,4672,5949,7.851661,7.682482,7.432484,1,0,1,0,0,1,0,1,0,1,2,2
2,2,770,10000,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,0,82,9230,5342,6.646391,6.646391,7.908387,1,0,0,1,1,0,1,0,1,0,1,1
3,4,1960,5000,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,0,49,3040,3640,7.5807,6.956545,7.21524,1,0,1,0,1,0,0,1,0,1,3,1
4,3,1680,8080,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,0,28,6400,5703,7.426549,7.426549,7.495542,1,0,0,1,1,0,1,0,1,0,2,1


In [44]:
df3.to_csv('df2_fea_eng_log_enc_bucket.csv', index=False) # this is the point where we will start the following tries

In [45]:
X_num = df3.drop( columns = ["price"], axis = 1)
transformer = MinMaxScaler().fit(X_num)
X_num_minmax = transformer.transform(X_num)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 36)

In [46]:
X = X_num_normalized.copy()
Y = df3["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [47]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7349434779778048
R2 adjusted = 0.7375155690671634
RMSE = 152414.75
MSE = 23230256631.42
MAE = 103081.63
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7321510675437917
R2 adjusted = 0.7674642035730816
RMSE = 153215.5
MSE = 23474990888.51
MAE = 90856.84
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.8084525337200795
R2 adjusted = 0.8919767414086341
RMSE = 129567.47
MSE = 16787728009.23
MAE = 77350.89
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8723816729118441
R2 adjusted = 0.9176753370536769
RMSE = 105758.26
MSE = 11184808683.49
MAE = 64922.25
