# Let's fit a model on the housing price data!
1. Import packages
2. Load Data
3. Fit a LR model


In [11]:
# Import Packages
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [12]:
# Load Data
raw_data_train = pd.read_csv('https://raw.githubusercontent.com/jmpark0808/pl_mnist_example/main/train_hp_msci436.csv')
raw_data_train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [13]:
print(raw_data_train.shape)

(1460, 81)


In [14]:
df = raw_data_train.select_dtypes(include = ['float64', 'int64']).fillna(0)
X = df.values[:, 1:-1]
y = df.values[:, -1]

In [15]:
# Fit LR Model
reg = LinearRegression().fit(X, y)
reg.coef_

array([-1.67658280e+02,  9.20432778e+00,  3.93560877e-01,  1.73308290e+04,
        5.09930597e+03,  3.38003523e+02,  1.22935734e+02,  2.82422748e+01,
        9.54952379e+00,  1.13264693e-01, -5.57819889e-01,  9.10496860e+00,
        1.84499656e+01,  1.98234116e+01, -9.95987321e+00,  2.83135040e+01,
        8.53883551e+03,  1.74267903e+03,  3.21905106e+03, -1.93820153e+03,
       -1.02852842e+04, -1.56345069e+04,  4.98509713e+03,  4.09733222e+03,
       -1.45465069e+01,  1.56588891e+04,  4.92609426e+00,  2.59348714e+01,
       -6.24696356e+00,  1.16619154e+01,  2.09608022e+01,  5.77467144e+01,
       -3.28769779e+01, -4.73430117e-01, -4.66005790e+01, -7.16852175e+02])

In [16]:
coef_df = pd.DataFrame({
                    'Features': df.columns[1:-1],
                    'Coefficients': reg.coef_
                })
#print(coef_df)

import numpy as np

# Calculate absolute values of coefficients
coef_df['abs_Coefficient'] = np.abs(coef_df['Coefficients'])

# Sort DataFrame by absolute values of coefficients in descending order
coef_df = coef_df.sort_values('abs_Coefficient', ascending=False)

# If you want, you can drop the 'abs_Coefficient' column after sorting
coef_df = coef_df.drop(columns=['abs_Coefficient'])

print(coef_df)
top_20_feature_names = coef_df['Features'].head(20).values
print(top_20_feature_names)

         Features  Coefficients
3     OverallQual  17330.829045
25     GarageCars  15658.889052
21   KitchenAbvGr -15634.506924
20   BedroomAbvGr -10285.284159
16   BsmtFullBath   8538.835507
4     OverallCond   5099.305966
22   TotRmsAbvGrd   4985.097130
23     Fireplaces   4097.332223
18       FullBath   3219.051063
19       HalfBath  -1938.201533
17   BsmtHalfBath   1742.679026
35         YrSold   -716.852175
5       YearBuilt    338.003523
0      MSSubClass   -167.658280
6    YearRemodAdd    122.935734
31    ScreenPorch     57.746714
34         MoSold    -46.600579
32       PoolArea    -32.876978
15      GrLivArea     28.313504
7      MasVnrArea     28.242275
27     WoodDeckSF     25.934871
30      3SsnPorch     20.960802
13       2ndFlrSF     19.823412
12       1stFlrSF     18.449966
24    GarageYrBlt    -14.546507
29  EnclosedPorch     11.661915
14   LowQualFinSF     -9.959873
8      BsmtFinSF1      9.549524
1     LotFrontage      9.204328
11    TotalBsmtSF      9.104969
28    Op

# Pick top 20

Manually picked top 22 features that are logical based on the order of descending weight(removed GarageYrBlt, ID, MiscVal within the top 22 because they are not relevant.) We originally decided to pick top 20, but because there are 2 other features that are also relevant, we decided to add it into the top features. After selecting the top features, we removed the remaining features including TotalBsmtSF, Fireplaces, OpenPorchSF, 2ndFlrSF, MasVnrArea, MSSubClass, BsmtUnfSF, WoodDeckSF, GarageCars, BsmtFinSF2, PoolArea, LotFrontage, BsmtFinSF1.

We also added SalePrice as this is what we wanted to predict.

1. SalePrice
2. LotArea - Integer area
3. BedroomAbvGr - Integer
4. HalfBath - Integer
5. GrLivArea - Integer
6. OverallQual - scale of 1 to 10
7. KitchenAbvGr - Integer
8. TotRmsAbvGrd - Integer
9. BsmtHalfBath - 0,1,2
10. FullBath - Integer
11. BsmtFullBath - Integer
12. MoSold - Integer (1 to 12)
13. OverallCond - Scale 1 to 10
15. YearBuilt - (integer year)
16. 3SsnPorch - Integer
18. ScreenPorch - Integer
19. LowQualFinSF - Integer
20. YearRemodAdd - Integer year
21. GarageArea - Integer
22. EnclosedPorch - Integer
23. 1stFlrSF - integer
24. 2ndFlrSF - integer
25. GarageCars - Integer

Ordered list of all features in descending order of coefficient:
1. LotArea
2. GarageYrBlt
3. BedroomAbvGr
4. HalfBath
5. GrLivArea
6. OverallQual
7. KitchenAbvGr
8. TotRmsAbvGrd
9. BsmtHalfBath
10. FullBath
11. BsmtFullBath
12. MoSold
13. OverallCond
14. Id
15. YearBuilt
16. 3SsnPorch
17. MiscVal
18. ScreenPorch
19. LowQualFinSF
20. YearRemodAdd
21. GarageArea
22. EnclosedPorch
23. 1stFlrSF
24. TotalBsmtSF
25. Fireplaces
26. OpenPorchSF
27. 2ndFlrSF
28. MasVnrArea
29. MSSubClass
30. BsmtUnfSF
31. WoodDeckSF
32. GarageCars
33. BsmtFinSF2
34. PoolArea
35. LotFrontage
36. BsmtFinSF1



In [17]:
top_20 = np.append(top_20_feature_names,'SalePrice')

df_new = df[top_20]

In [18]:
new_X = df_new.drop("SalePrice", axis = 1)
new_y = df_new["SalePrice"]

x_train, x_test, y_train, y_test = train_test_split(new_X, new_y, test_size = 0.3)

model = LinearRegression().fit(x_train, y_train)
print(model.coef_)
new_coef_df = pd.DataFrame({
                    'Features': df_new.columns[:-1],
                    'Coefficients': model.coef_
                })
print(new_coef_df)

# Calculate absolute values of coefficients
new_coef_df['abs_Coefficient'] = np.abs(new_coef_df['Coefficients'])

# Sort DataFrame by absolute values of coefficients in descending order
new_coef_df = new_coef_df.sort_values('abs_Coefficient', ascending=False)

# If you want, you can drop the 'abs_Coefficient' column after sorting
new_coef_df = new_coef_df.drop(columns=['abs_Coefficient'])

print(new_coef_df)
new_top_20_feature_names = new_coef_df['Features'].values
print(new_top_20_feature_names)
#new way:
model.predict(x_test)


[ 1.75360960e+04  1.13316039e+04 -2.03827224e+04 -9.11490275e+03
  1.63956988e+04  4.17132575e+03  4.50635180e+03  4.69910168e+03
  6.11072741e+03 -5.00125842e+03  6.41702121e+03 -5.08922642e+02
  3.72236243e+02 -2.07497752e+02  1.37192197e+02  5.11952378e+01
 -3.03924153e+02  1.35772245e+01  5.33500445e+01  4.34822433e+01]
        Features  Coefficients
0    OverallQual  17536.095981
1     GarageCars  11331.603852
2   KitchenAbvGr -20382.722418
3   BedroomAbvGr  -9114.902754
4   BsmtFullBath  16395.698782
5    OverallCond   4171.325751
6   TotRmsAbvGrd   4506.351801
7     Fireplaces   4699.101675
8       FullBath   6110.727405
9       HalfBath  -5001.258418
10  BsmtHalfBath   6417.021211
11        YrSold   -508.922642
12     YearBuilt    372.236243
13    MSSubClass   -207.497752
14  YearRemodAdd    137.192197
15   ScreenPorch     51.195238
16        MoSold   -303.924153
17      PoolArea     13.577224
18     GrLivArea     53.350044
19    MasVnrArea     43.482243
        Features  Coeff

array([133606.12782303,  78291.3022121 , 116498.50366188, 152922.80301378,
       294660.26947504, 148579.37078491,  36311.35477467, 214149.01815355,
       310482.89712168, 118497.22859843, 147970.92576882, 219417.81692775,
       270193.87839368, 256072.03266719, 318937.39617474, 206603.78829812,
       123643.36979498, 103493.0661974 , 301610.65990401,  89512.54329152,
       203198.76068084, 115790.55321201, 197408.48504551, 184064.69542115,
       113147.64324298, 264671.77938506, 304883.86488296, 195146.86966483,
       119699.47715898,  94367.13192622, 249288.32070504, 221828.29584426,
       220561.38862911, 325624.65941747, 261834.48696649,  89700.24395012,
       185288.75153061, 182129.62622904, 225999.24686848, 223843.21342118,
       399240.80859967, 148615.56503158, 155640.92035634, 326205.70490014,
        82945.09502831, 124671.08226503, 260465.25485458, 244838.95584116,
       208502.3656751 , 141274.61233664, 115178.12858092, 389782.60534304,
       204180.25591693, 1

In [19]:
model.score(x_test, y_test)

0.8378431082514118

In [20]:
import pickle
pickle.dump(model, open('./model.sav', 'wb'))