# Let's fit a model on the housing price data!
1. Import packages
2. Load Data
3. Fit a linear regression (LR) model


In [1]:
# Import Packages
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the housing-price training set directly from its hosted CSV
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/jmpark0808/pl_mnist_example/main/train_hp_msci436.csv'
raw_data_train = pd.read_csv(TRAIN_CSV_URL)

# Show the last five rows as a quick look at the data
raw_data_train.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [3]:
# Sanity check on the loaded frame: prints (number of rows, number of columns)
print(raw_data_train.shape)

(1460, 81)


In [4]:
# Keep only the int64/float64 columns and replace every missing value with 0
df = (
    raw_data_train
    .select_dtypes(include=['float64', 'int64'])
    .fillna(0)
)

import pickle  # not used in this cell; left in place so the notebook namespace is unchanged

# Snapshot the numeric frame to disk (to_csv also writes the row index as the first CSV column)
df.to_csv('raw_data.csv')

# Features: every column except the first and the last. Target: the last column.
# NOTE(review): the slice is positional -- presumably the first numeric column is
# Id and the last is SalePrice; confirm if the input schema ever changes.
X, y = df.values[:, 1:-1], df.values[:, -1]

In [5]:
# Fit a linear regression on all numeric features (no train/test split at this stage)
reg = LinearRegression()
reg.fit(X, y)

# Learned weights, one per column of X
reg.coef_

array([-1.67658280e+02,  9.20432778e+00,  3.93560877e-01,  1.73308290e+04,
        5.09930597e+03,  3.38003523e+02,  1.22935734e+02,  2.82422748e+01,
        9.54952379e+00,  1.13264693e-01, -5.57819889e-01,  9.10496860e+00,
        1.84499656e+01,  1.98234116e+01, -9.95987321e+00,  2.83135040e+01,
        8.53883551e+03,  1.74267903e+03,  3.21905106e+03, -1.93820153e+03,
       -1.02852842e+04, -1.56345069e+04,  4.98509713e+03,  4.09733222e+03,
       -1.45465069e+01,  1.56588891e+04,  4.92609426e+00,  2.59348714e+01,
       -6.24696356e+00,  1.16619154e+01,  2.09608022e+01,  5.77467144e+01,
       -3.28769779e+01, -4.73430117e-01, -4.66005790e+01, -7.16852175e+02])

In [6]:
# Pair each feature name with the coefficient learned by `reg`.
# df.columns[1:-1] mirrors the column slice that was used to build X.
coef_df = pd.DataFrame({
    'Features': df.columns[1:-1],
    'Coefficients': reg.coef_,
})

# Rank features by coefficient magnitude, largest first. Sorting with a key
# avoids adding and then dropping a temporary 'abs_Coefficient' column.
# NOTE(review): the features are not standardized, so coefficient magnitude
# depends on each column's units -- this ranking is not a scale-free measure
# of importance.
coef_df = coef_df.sort_values('Coefficients', key=np.abs, ascending=False)

print(coef_df)

# Names of the 20 features with the largest absolute coefficients
top_20_feature_names = coef_df['Features'].head(20).values
print(top_20_feature_names)

         Features  Coefficients
3     OverallQual  17330.829045
25     GarageCars  15658.889052
21   KitchenAbvGr -15634.506924
20   BedroomAbvGr -10285.284159
16   BsmtFullBath   8538.835507
4     OverallCond   5099.305966
22   TotRmsAbvGrd   4985.097130
23     Fireplaces   4097.332223
18       FullBath   3219.051063
19       HalfBath  -1938.201533
17   BsmtHalfBath   1742.679026
35         YrSold   -716.852175
5       YearBuilt    338.003523
0      MSSubClass   -167.658280
6    YearRemodAdd    122.935734
31    ScreenPorch     57.746714
34         MoSold    -46.600579
32       PoolArea    -32.876978
15      GrLivArea     28.313504
7      MasVnrArea     28.242275
27     WoodDeckSF     25.934871
30      3SsnPorch     20.960802
13       2ndFlrSF     19.823412
12       1stFlrSF     18.449966
24    GarageYrBlt    -14.546507
29  EnclosedPorch     11.661915
14   LowQualFinSF     -9.959873
8      BsmtFinSF1      9.549524
1     LotFrontage      9.204328
11    TotalBsmtSF      9.104969
28    Op

In [7]:
# Column labels for the reduced frame: the 20 selected features followed by the target
top_20 = np.concatenate([top_20_feature_names, ['SalePrice']])

# Restrict the numeric frame to those columns
df_new = df.loc[:, top_20]

In [8]:
# Separate the selected features from the target
new_X = df_new.drop(columns="SalePrice")
new_y = df_new["SalePrice"]

# Hold out 30% of the rows for evaluation. The split is seeded so that the
# coefficients, predictions, score and saved model are reproducible on a
# fresh Restart & Run All (an unseeded split reshuffles on every run).
# NOTE(review): the 20 features were chosen from a model fit on all rows,
# including the ones held out here, so the held-out score is likely optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    new_X, new_y, test_size=0.3, random_state=42
)

# Refit the linear regression on the training split only
model = LinearRegression().fit(x_train, y_train)
print(model.coef_)

# Pair each feature with its refit coefficient; names are taken from new_X so
# they always line up with the columns the model was trained on
new_coef_df = pd.DataFrame({
    'Features': new_X.columns,
    'Coefficients': model.coef_,
})
print(new_coef_df)

# Rank features by coefficient magnitude, largest first
new_coef_df = new_coef_df.sort_values('Coefficients', key=np.abs, ascending=False)

print(new_coef_df)
new_top_20_feature_names = new_coef_df['Features'].values
print(new_top_20_feature_names)

# Predicted sale prices for the held-out rows (displayed as the cell output)
model.predict(x_test)


[ 17791.05138664  11547.13142981  -9060.60179408 -10553.92665578
  17458.81628021   4735.49952683   3154.43409478   5605.48788336
   6584.64118683  -2629.55487382   8246.16017462   -511.52068156
    334.4427455    -204.00131979    164.21128625     40.08856201
    -19.67932204    -45.18686777     54.56791329     37.14164108]
        Features  Coefficients
0    OverallQual  17791.051387
1     GarageCars  11547.131430
2   KitchenAbvGr  -9060.601794
3   BedroomAbvGr -10553.926656
4   BsmtFullBath  17458.816280
5    OverallCond   4735.499527
6   TotRmsAbvGrd   3154.434095
7     Fireplaces   5605.487883
8       FullBath   6584.641187
9       HalfBath  -2629.554874
10  BsmtHalfBath   8246.160175
11        YrSold   -511.520682
12     YearBuilt    334.442745
13    MSSubClass   -204.001320
14  YearRemodAdd    164.211286
15   ScreenPorch     40.088562
16        MoSold    -19.679322
17      PoolArea    -45.186868
18     GrLivArea     54.567913
19    MasVnrArea     37.141641
        Features  Coeff

array([221906.36041264,  90852.69141479, 130389.30701909, 289512.25948126,
       186289.85299205, 115499.81561742, 159457.69644755, 189046.60918224,
        88775.3319568 , 156590.37480594, 220548.97892347, 267082.73296079,
       253737.19107351, 108200.8501108 , 333411.85218395, 122349.7010836 ,
       222506.11059842, 166022.57795407, 197148.09098056, 112461.17653362,
       185575.49988416, 154278.9132805 ,  84521.61575403, 133361.17771299,
        99259.08022378,  63042.05861834, 166378.17571386, 212961.52979942,
        65708.49637927, 210466.65962044, 216123.94465846, 112580.6261747 ,
       152692.21563191, 235077.73837826, 210288.81250889, 112043.64687405,
       309121.12404349, 111271.52448199, 195826.72144825, 182213.51487328,
       254539.28751705, 196136.24787791,  58788.11792925, 147781.7516921 ,
       119863.88048916, 192528.61948681, 203658.36538109, 183464.23850087,
       212932.3153517 , 224383.47980788, 151812.99093359, 213129.11209001,
       186438.12721748, 1

In [9]:
# Evaluate on the held-out split. For LinearRegression, score() returns the
# coefficient of determination (R^2) of the predictions, not an error in price
# units. The value is shown as the cell output rather than printed.
model.score(x_test, y_test)

0.8505290842183517

In [10]:
import pickle

# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; a bare open() passed
# straight to dump() is never closed explicitly.
# NOTE: pickle files can execute arbitrary code when loaded -- only unpickle
# model.sav from a trusted source.
with open('./model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)