# Let's fit a model on the housing price data!
1. Import packages
2. Load Data
3. Fit an LR (linear regression) model


In [120]:
# Import Packages
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [121]:
# Load Data
# Read the training CSV straight from its GitHub location into a DataFrame.
train_csv_url = 'https://raw.githubusercontent.com/jmpark0808/pl_mnist_example/main/train_hp_msci436.csv'
raw_data_train = pd.read_csv(train_csv_url)

# Show the last rows as a quick check that the load worked.
raw_data_train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [122]:
# Report the (rows, columns) dimensions of the raw frame.
raw_shape = raw_data_train.shape
print(raw_shape)

(1460, 81)


In [123]:
# Keep only the float64/int64 columns and replace missing entries with 0.
numeric_dtypes = ['float64', 'int64']
df = raw_data_train.select_dtypes(include = numeric_dtypes).fillna(0)

# Work on the plain array view of the numeric frame.
numeric_values = df.values

# Features: everything except the first column (the `Unnamed: 0` index column
# seen in the tail() output) and the last column.
# NOTE(review): `Id` is still part of X after this slice.
X = numeric_values[:, 1:-1]

# Target: the last numeric column (`SalePrice` in the tail() output).
y = numeric_values[:, -1]

In [124]:
# Fit LR Model
# Linear regression over every numeric feature column in X.
reg = LinearRegression()
reg.fit(X, y)

# Display the fitted weights, one per column of X.
reg.coef_

array([-1.67658280e+02,  9.20432778e+00,  3.93560877e-01,  1.73308290e+04,
        5.09930597e+03,  3.38003523e+02,  1.22935734e+02,  2.82422748e+01,
        9.54952379e+00,  1.13264693e-01, -5.57819889e-01,  9.10496860e+00,
        1.84499656e+01,  1.98234116e+01, -9.95987321e+00,  2.83135040e+01,
        8.53883551e+03,  1.74267903e+03,  3.21905106e+03, -1.93820153e+03,
       -1.02852842e+04, -1.56345069e+04,  4.98509713e+03,  4.09733222e+03,
       -1.45465069e+01,  1.56588891e+04,  4.92609426e+00,  2.59348714e+01,
       -6.24696356e+00,  1.16619154e+01,  2.09608022e+01,  5.77467144e+01,
       -3.28769779e+01, -4.73430117e-01, -4.66005790e+01, -7.16852175e+02])

In [125]:
# Absolute values of the coefficients
abs_coef = np.abs(reg.coef_)

# Column positions (into X) ordered by descending absolute coefficient.
# Deliberately NOT named `sorted`: binding that name would shadow Python's
# built-in sorted() for every later cell in this kernel session.
coef_order = np.argsort(abs_coef)[::-1]

# Ordered features in descending order of absolute coefficient. The full
# ordering is kept, so the ranking is not tied to a hard-coded feature count.
ordered_features = coef_order

# The features to keep are then picked by hand from this ordering
# (see the notes below).

We manually picked 22 features, working down the list of features in descending order of absolute coefficient weight. From the top 23 we removed GarageYrBlt, Id, and MiscVal because they are not relevant, which left 20 features. We originally planned to keep only these 20, but two lower-ranked features (2ndFlrSF and GarageCars) are also relevant, so we added them to the selection. The remaining features were dropped: TotalBsmtSF, Fireplaces, OpenPorchSF, MasVnrArea, MSSubClass, BsmtUnfSF, WoodDeckSF, BsmtFinSF2, PoolArea, LotFrontage, BsmtFinSF1.

We also added SalePrice as this is what we wanted to predict.

1. SalePrice
2. LotArea - Integer area
3. BedroomAbvGr - Integer
4. HalfBath - Integer
5. GrLivArea - Integer
6. OverallQual - scale of 1 to 10
7. KitchenAbvGr - Integer
8. TotRmsAbvGrd - Integer
9. BsmtHalfBath - 0,1,2
10. FullBath - Integer
11. BsmtFullBath - Integer
12. MoSold - Integer (1 to 12)
13. OverallCond - Scale 1 to 10
14. YearBuilt - (integer year)
15. 3SsnPorch - Integer
16. ScreenPorch - Integer
17. LowQualFinSF - Integer
18. YearRemodAdd - Integer year
19. GarageArea - Integer
20. EnclosedPorch - Integer
21. 1stFlrSF - integer
22. 2ndFlrSF - integer
23. GarageCars - Integer

Ordered list of all features in descending order of absolute coefficient value:
1. LotArea
2. GarageYrBlt
3. BedroomAbvGr
4. HalfBath
5. GrLivArea
6. OverallQual
7. KitchenAbvGr
8. TotRmsAbvGrd
9. BsmtHalfBath
10. FullBath
11. BsmtFullBath
12. MoSold
13. OverallCond
14. Id
15. YearBuilt
16. 3SsnPorch
17. MiscVal
18. ScreenPorch
19. LowQualFinSF
20. YearRemodAdd
21. GarageArea
22. EnclosedPorch
23. 1stFlrSF
24. TotalBsmtSF
25. Fireplaces
26. OpenPorchSF
27. 2ndFlrSF
28. MasVnrArea
29. MSSubClass
30. BsmtUnfSF
31. WoodDeckSF
32. GarageCars
33. BsmtFinSF2
34. PoolArea
35. LotFrontage
36. BsmtFinSF1



In [126]:
# The target column goes first, followed by the hand-picked predictors.
target_name = 'SalePrice'
selected_features = [
    'LotArea', 'BedroomAbvGr', 'HalfBath', 'GrLivArea', 'OverallQual',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'BsmtHalfBath', 'FullBath',
    'BsmtFullBath', 'MoSold', 'OverallCond', 'YearBuilt', '3SsnPorch',
    'ScreenPorch', 'LowQualFinSF', 'YearRemodAdd', 'GarageArea',
    'EnclosedPorch', '1stFlrSF', '2ndFlrSF', 'GarageCars',
]
column_names = [target_name, *selected_features]

# Restrict the numeric frame to the target plus the selected predictors.
df_new = df[column_names]


In [127]:
# Separate the predictors from the target column.
new_X = df_new.drop("SalePrice", axis = 1)
new_y = df_new["SalePrice"]

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# shuffle (and therefore the fitted model and its score) repeatable after
# Restart Kernel -> Run All; without it every run produces a different split.
x_train, x_test, y_train, y_test = train_test_split(
    new_X, new_y, test_size = 0.3, random_state = 42
)

In [128]:
# Train a linear regression on the training split of the selected features.
# fit() returns the estimator itself, so `model` is the fitted regressor.
estimator = LinearRegression()
model = estimator.fit(x_train, y_train)

In [129]:
# Predictions of the fitted model for the held-out rows.
y_pred = model.predict(x_test)
y_pred

array([121905.52377413, 167083.91609034, 194271.81290298,  77034.59302174,
       231788.63582076, 141615.34170288, 171399.60394077, 104617.37611892,
       157238.91046012, 153960.62620403, 292829.7326856 , 163566.52691017,
       141441.15792064, 131418.27844851, 194852.27446739, 236167.53503224,
       103230.78820971,  68841.19950651, 203580.44976735,  17263.66092543,
       217176.64250387,  57075.25661778, 232650.39835949, 135464.67650874,
       125132.09124727, 196258.03351082, 244909.21919799, 118221.7408017 ,
       267478.30485136, 282438.79553538, 208620.19452332, 139598.23318768,
       102812.8534369 , 282673.91758191, 239166.31522814, 201026.0814985 ,
       108325.43387945, 185221.35860179, 125059.11519491, 147687.92085185,
       182947.3795898 , 320337.32131844,  64195.56304239, 157242.22664527,
       304623.28713494, 211684.42062723, 114507.69030346, 111965.64401704,
       193959.99374567, 137555.32902054, 238298.29633779, 197246.76110023,
       103102.11536121, 2

In [130]:
# Score of the fitted model on the held-out split
# (LinearRegression.score reports the coefficient of determination, R^2).
test_score = model.score(x_test, y_test)
test_score

0.841750690908691

In [131]:
import pickle

# Persist the fitted model to disk. The `with` block closes the file handle as
# soon as the dump finishes, so the bytes are flushed and the descriptor is
# not left open for the lifetime of the kernel.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# model.sav from a source you trust.
with open('./model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)