# <span style="color:rgb(255, 0, 255)">VOL 3.</span> 

# <span style="color:rgb(255, 0, 255)">This document contains the different tries we made regarding feature importance and coeficient regression evaluation

Here we only have the code related to machine learning model tries. If you want to review all the steps we took please review the final notebook.

### <span style="color:rgb(255, 0, 255)">--- Import the necessary libraries</span>

In [6]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import matplotlib.ticker as mk
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import os #we will use the function listdir to list files in a folder
import math #to apply absolute value

### <span style="color:rgb(255, 0, 255)">--- Function we use for modeling results</span>


In [7]:
def modeling(y, X, models=[], test_size=0.25):
    for model in models:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        model.fit(X_train, y_train )
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)
        # r2_adj = 1 - (1-r2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
        r2_adj =  1 - (1-model.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)
        RMSE = mean_squared_error(y_test, predictions, squared=False)
        MSE = mean_squared_error(y_test, predictions)
        MAE = mean_absolute_error(y_test, predictions)
        print(model, 'metrics are: '), print("R2 =", r2), print("R2 adjusted =", r2_adj), print("RMSE =", round(RMSE,2)), print("MSE =", round(MSE,2)), print("MAE =", round(MAE,2))
    return predictions, y_test, r2

### <span style="color:rgb(255, 0, 255)">--- Load the database</span>

In [8]:
# opening the dataset we used in Vol II with the best results for the model. 
df = pd.read_csv('df2_fea_eng_log_enc.csv')
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,day_bought,month_bought,yr_diff_renovated,yr_diff_bought,sqrt_diff,sqrt_diff_15,sqft_living_log,sqft_above_log,sqft_living15_log,0,1,2014,2015,0.1,1.1,0.2,1.2,0.3,1.3
0,3,1.0,1180,5650,1.0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,13,10,0,59,4470,4310,7.07327,7.07327,7.200425,1,0,1,0,1,0,1,0,0,1
1,3,2.25,2570,7242,2.0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,9,12,40,63,4672,5949,7.851661,7.682482,7.432484,1,0,1,0,0,1,0,1,0,1
2,2,1.0,770,10000,1.0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,25,2,0,82,9230,5342,6.646391,6.646391,7.908387,1,0,0,1,1,0,1,0,1,0
3,4,3.0,1960,5000,1.0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,9,12,0,49,3040,3640,7.5807,6.956545,7.21524,1,0,1,0,1,0,0,1,0,1
4,3,2.0,1680,8080,1.0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,18,2,0,28,6400,5703,7.426549,7.426549,7.495542,1,0,0,1,1,0,1,0,1,0


In [9]:
X_num = df.drop( columns = ["price"], axis = 1)
transformer = MinMaxScaler().fit(X_num)
X_num_minmax = transformer.transform(X_num)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 36)

In [10]:
X = X_num_normalized.copy()
Y = df["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [11]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7352034238037304
R2 adjusted = 0.7374580725777847
RMSE = 152340.0
MSE = 23207474289.75
MAE = 103113.21
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7401444972439519
R2 adjusted = 0.7718073735003297
RMSE = 150911.98
MSE = 22774425507.65
MAE = 89975.35
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.8052608621158373
R2 adjusted = 0.8913280437049484
RMSE = 130642.47
MSE = 17067454574.28
MAE = 77516.84
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8729578545401561
R2 adjusted = 0.9178841206550782
RMSE = 105519.24
MSE = 11134310597.31
MAE = 64750.39


# Looking at coefficient values and feature importance 


In [12]:
# Coefficient values

reg = LinearRegression()
reg.fit(X_train, y_train)

# Get the coefficients and feature names
coef = reg.coef_
feature_names = df.columns

# Print the coefficients and feature names
for feature, coef in zip(feature_names, coef):
    print(f"{feature}: {coef:.2f}")
    
# high negative of positive coef: bedrooms,sqft_living, sqft_lot, floors, waterfront, view, grade, sqft_above, yr_built, yr_renovated, zipcode, lat, sqft_lot15, day_bought, year_bought, yr_diff_renovated, yr_diff_bought, sqrt_diff, sqrt_diff_15, yr_renovated_binary     

bedrooms: -346712.91
bathrooms: 269276.99
sqft_living: 9273051211467134.00
sqft_lot: 7653669994066192.00
floors: -3372.54
view: 166158.30
condition: 148419.17
grade: 1017437.82
sqft_above: -8101164635282898.00
sqft_basement: -6802720129114478.00
yr_built: 1631028301086405888.00
yr_renovated: 9297536.42
zipcode: -245375.82
lat: 329227.08
long: 33480.51
sqft_living15: -1982975443011711.75
sqft_lot15: 5598357749823496.00
price: -7846.57
day_bought: 3106.23
month_bought: -141752.75
yr_diff_renovated: 1645211155878705408.00
yr_diff_bought: -8009616410217674.00
sqrt_diff: -5961967173475863.00
sqrt_diff_15: -1991021.25
sqft_living_log: 593804.23
sqft_above_log: 136172.18
sqft_living15_log: -3502099702478090.50
0: -3502099701905529.50
1: 7091427396012665.00
2014: -7091427396012665.00
2015: 167513690321293.66
0.1: 167513681228607.19
1.1: 4777275583938763.00
0.2: 4777275583999388.00
1.2: 2086535770089702.50
0.3: 2086535770225827.50


### <span style="color:rgb(255, 0, 255)"> Feature importance


In [13]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Get the coefficient of determination (R^2)
r_squared = lr.score(X_train, y_train)
print(f"R^2 score: {r_squared:.2f}")

# Get the feature importance scores
importances = abs(lr.coef_)

# Print the feature importance scores
for feature, importance in zip(df.columns, importances):
    print(f"{feature}: {importance:.2f}")

    

R^2 score: 0.74
bedrooms: 346712.91
bathrooms: 269276.99
sqft_living: 9273051211467134.00
sqft_lot: 7653669994066192.00
floors: 3372.54
view: 166158.30
condition: 148419.17
grade: 1017437.82
sqft_above: 8101164635282898.00
sqft_basement: 6802720129114478.00
yr_built: 1631028301086405888.00
yr_renovated: 9297536.42
zipcode: 245375.82
lat: 329227.08
long: 33480.51
sqft_living15: 1982975443011711.75
sqft_lot15: 5598357749823496.00
price: 7846.57
day_bought: 3106.23
month_bought: 141752.75
yr_diff_renovated: 1645211155878705408.00
yr_diff_bought: 8009616410217674.00
sqrt_diff: 5961967173475863.00
sqrt_diff_15: 1991021.25
sqft_living_log: 593804.23
sqft_above_log: 136172.18
sqft_living15_log: 3502099702478090.50
0: 3502099701905529.50
1: 7091427396012665.00
2014: 7091427396012665.00
2015: 167513690321293.66
0.1: 167513681228607.19
1.1: 4777275583938763.00
0.2: 4777275583999388.00
1.2: 2086535770089702.50
0.3: 2086535770225827.50


In [15]:
# let's dive in feature importance of linnear regression model


X_feature = X
y_feature = Y

# create a linear regression object
reg = LinearRegression()

# fit the linear regression model on the data
reg.fit(X_feature, y_feature)

# print the coefficients and their corresponding feature names
coefficients = pd.DataFrame({'feature': X_feature.columns, 'coefficient': reg.coef_})
coefficients = coefficients.sort_values('coefficient', ascending=False)
print(coefficients)


              feature   coefficient
20     yr_diff_bought  1.882595e+18
10           yr_built  1.866366e+18
33                1.2  2.264575e+16
32                0.2  2.264575e+16
16         sqft_lot15  1.613061e+16
3            sqft_lot  1.578883e+16
2         sqft_living  5.471140e+15
28               2014  3.634332e+15
30                0.1  3.293716e+14
31                1.1  3.293715e+14
11       yr_renovated  8.632568e+06
7               grade  1.024342e+06
24     sqft_above_log  5.595535e+05
13                lat  3.285020e+05
1           bathrooms  2.239997e+05
5                view  1.767529e+05
6           condition  1.589769e+05
25  sqft_living15_log  1.427734e+05
14               long  2.050574e+04
18       month_bought  5.345845e+03
4              floors  3.568151e+03
17         day_bought -7.184408e+03
19  yr_diff_renovated -1.664068e+05
12            zipcode -2.411595e+05
0            bedrooms -3.722816e+05
23    sqft_living_log -1.848346e+06
27                  1 -7.852

### we ran the model several times without single feautures to see how this impacted the model

In [24]:
X_num1 = df.drop(columns = ["price","yr_diff_bought"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [31]:
X_num2 = df.drop(columns = ["price","yr_built"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [37]:
X_num3 = df.drop(columns = ["price","sqft_lot15"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [43]:
X_num4 = df.drop(columns = ["price","sqft_lot"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [52]:
X_num5 = df.drop(columns = ["price","grade"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [57]:
X_num6 = df.drop(columns = ["price", "sqft_above"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [64]:
X_num7 = df.drop(columns = ["price", "view"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [69]:
X_num8 = df.drop(columns = ["price", "condition"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [75]:
X_num9 = df.drop(columns = ["price", "sqft_living_log"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [80]:
X_num10 = df.drop(columns = ["price", "sqft_living15_log"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [90]:
X_num11 = df.drop(columns = ["price", "1.2", "0.2"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [95]:
X_num12 = df.drop(columns = ["price", "sqft_living"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [100]:
X_num13 = df.drop(columns = ["price", "day_bought"], axis = 1) # good FOR THE MODEL TO DO THIS

In [106]:
X_num14 = df.drop(columns = ["price", "1"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [111]:
X_num15 = df.drop(columns = ["price", "0"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [116]:
X_num16 = df.drop(columns = ["price", "sqft_living15"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [121]:
X_num17 = df.drop(columns = ["price", "sqft_basement"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [126]:
X_num18 = df.drop(columns = ["price", "2014"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [136]:
X_num19 = df.drop(columns = ["price", "0.2"], axis = 1) # good FOR THE MODEL TO DO THIS

In [141]:
X_num20 = df.drop(columns = ["price", "0.1"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [146]:
X_num21 = df.drop(columns = ["price", "1.1"], axis = 1) # good FOR THE MODEL TO DO THIS

In [151]:
X_num22 = df.drop(columns = ["price", "yr_renovated"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [160]:
X_num23 = df.drop(columns = ["price", "sqft_above_log"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [165]:
X_num24 = df.drop(columns = ["price", "lat"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [171]:
X_num25 = df.drop(columns = ["price", "bathrooms"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [177]:
X_num26 = df.drop(columns = ["price", "long"], axis = 1) # good FOR THE MODEL TO DO THIS

In [182]:
X_num27 = df.drop(columns = ["price", "month_bought"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [187]:
X_num28 = df.drop(columns = ["price", "floors"], axis = 1) # good FOR THE MODEL TO DO THIS

In [192]:
X_num29 = df.drop(columns = ["price", "yr_diff_renovated"], axis = 1) # BAD FOR THE MODEL TO DO THIS

In [197]:
X_num30 = df.drop(columns = ["price", "zipcode"], axis = 1) # good FOR THE MODEL TO DO THIS

In [202]:
X_num31 = df.drop(columns = ["price", "bedrooms"], axis = 1) # good FOR THE MODEL TO DO THIS

In [207]:
X_num32 = df.drop(columns = ["price", "1.3"], axis = 1) # good FOR THE MODEL TO DO THIS

In [212]:
X_num33 = df.drop(columns = ["price", "0.3"], axis = 1) # good FOR THE MODEL TO DO THIS

In [217]:
X_num34 = df.drop(columns = ["price", "2015"], axis = 1) # good FOR THE MODEL TO DO THIS

In [222]:
X_num35 = df.drop(columns = ["price", "sqrt_diff"], axis = 1) # good FOR THE MODEL TO DO THIS

In [227]:
X_num35 = df.drop(columns = ["price", "sqrt_diff_15"], axis = 1) # good FOR THE MODEL TO DO THIS

In [228]:
transformer = MinMaxScaler().fit(X_num35)
X_num_minmax = transformer.transform(X_num35)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num35.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 35)

In [229]:
X = X_num_normalized.copy()
Y = df["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [230]:
#Test model
predictions = model.predict(X_test)
predictions.shape

(4656,)

In [231]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7349353222440091
R2 adjusted = 0.737436161095298
RMSE = 152417.1
MSE = 23230971421.56
MAE = 103266.31
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7403220094104754
R2 adjusted = 0.771843498286468
RMSE = 150860.43
MSE = 22758867870.54
MAE = 89693.03
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.8123385827359519
R2 adjusted = 0.892887456488817
RMSE = 128246.42
MSE = 16447144366.04
MAE = 77009.67
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8723542421335018
R2 adjusted = 0.918108291512692
RMSE = 105769.62
MSE = 11187212789.67
MAE = 64905.27


#### The times that the model was affected negatively were when we removed grade, lat, yr renovated, sqft_lot15 and sqft_living_log

#### The times that the model was affected positively were when we removed: 1.2, 0.2, 1.1, long, floors, day_bought, zipcode, 0.3.

#### let's try now combination of this features to see how the combinations impact the model

In [234]:
X_num36 = df.drop(columns = ["price", "long", "zipcode"], axis = 1) # good FOR THE MODEL TO DO THIS

In [236]:
transformer = MinMaxScaler().fit(X_num36)
X_num_minmax = transformer.transform(X_num36)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num36.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 34)

In [237]:
X = X_num_normalized.copy()
Y = df["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [238]:
#Test model
predictions = model.predict(X_test)
predictions.shape

(4656,)

In [239]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7257096715744052
R2 adjusted = 0.7283813545014244
RMSE = 155046.87
MSE = 24039531916.56
MAE = 104631.29
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7289490853805931
R2 adjusted = 0.7633783965474891
RMSE = 154128.59
MSE = 23755621098.29
MAE = 92512.52
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.7501538630729417
R2 adjusted = 0.863625189836814
RMSE = 147976.95
MSE = 21897178137.34
MAE = 84765.17
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8414507279971577
R2 adjusted = 0.900252728790504
RMSE = 117879.93
MSE = 13895678737.7
MAE = 71788.96


#### this combination affected positively the R2 of the model, in LR it is R2: 0.725

***

In [240]:
X_num37 = df.drop(columns = ["price", "long", "zipcode", "1.2", "0.2", "1.1", "floors", "day_bought", "0.3"], axis = 1) # good FOR THE MODEL TO DO THIS

In [241]:
transformer = MinMaxScaler().fit(X_num37)
X_num_minmax = transformer.transform(X_num37)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num37.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 28)

In [242]:
X = X_num_normalized.copy()
Y = df["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [243]:
#Test model
predictions = model.predict(X_test)
predictions.shape

(4656,)

In [244]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7226427210168886
R2 adjusted = 0.7260127967992928
RMSE = 155911.28
MSE = 24308327598.26
MAE = 105341.29
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7719538243480916
R2 adjusted = 0.7983034655051086
RMSE = 141373.87
MSE = 19986571708.52
MAE = 84472.37
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.7488833656630871
R2 adjusted = 0.8628619665554511
RMSE = 148352.71
MSE = 22008527900.24
MAE = 84841.27
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8416212814501463
R2 adjusted = 0.899856086352938
RMSE = 117816.51
MSE = 13880730980.7
MAE = 71695.59


***

In [245]:
X_num38 = df.drop(columns = ["price", "long", "zipcode", "floors"], axis = 1) # good FOR THE MODEL TO DO THIS

In [246]:
transformer = MinMaxScaler().fit(X_num38)
X_num_minmax = transformer.transform(X_num38)
X_num_norm = pd.DataFrame(X_num_minmax,columns=X_num38.columns)
X_num_normalized = X_num_norm.copy()
X_num_normalized.shape

(18623, 33)

In [247]:
X = X_num_normalized.copy()
Y = df["price"]
#Separation between train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42) 
#Train model
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [248]:
#Test model
predictions = model.predict(X_test)
predictions.shape

(4656,)

In [249]:
predictions, y_test, r2 = modeling(Y, X, models=[LinearRegression(), KNeighborsRegressor(n_neighbors=9), DecisionTreeRegressor(max_depth=10),RandomForestRegressor(n_estimators=100, max_depth=10)], test_size=0.24)

LinearRegression() metrics are: 
R2 = 0.7252713088044236
R2 adjusted = 0.72825975369992
RMSE = 155170.72
MSE = 24077951192.44
MAE = 104767.52
KNeighborsRegressor(n_neighbors=9) metrics are: 
R2 = 0.7280052467577964
R2 adjusted = 0.7639046766197952
RMSE = 154396.7
MSE = 23838341618.65
MAE = 92797.74
DecisionTreeRegressor(max_depth=10) metrics are: 
R2 = 0.7515142767154278
R2 adjusted = 0.8632280514413312
RMSE = 147573.53
MSE = 21777947877.33
MAE = 84911.77
RandomForestRegressor(max_depth=10) metrics are: 
R2 = 0.8406862360227441
R2 adjusted = 0.9002418828735713
RMSE = 118163.79
MSE = 13962680842.09
MAE = 71710.78


#### at this point we conclude the model perfoms better without zipcode and long