# Predict the Price of House

In [180]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [181]:
df=pd.read_csv('F:\Dataset\house_prediction.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [182]:
print(df.shape)

(1460, 81)


In [183]:
X = df.drop(columns=["Id","SalePrice"])  
y = df["SalePrice"]


In [184]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [185]:
print("Shape of you train and target data part-->",X.shape,y.shape,sep="\n")

Shape of you train and target data part-->
(1460, 79)
(1460,)


# removiing null columns


In [186]:
null_data = X.isnull().sum()/X.shape[0]*100
null_columns = null_data[null_data>20].keys()
print(null_columns)



Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


In [187]:
X = X.drop(columns=null_columns) 
print(X.shape)

(1460, 74)


# finding missing numerical data

In [188]:
X_num = X.select_dtypes(include=["int64","float64"])
print(X_num.isnull().sum().sum())
num_nn = [var for var in X_num.columns if X_num[var].isnull().sum()>0]
print("numerical data which is missing ",num_nn)

348
numerical data which is missing  ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


# finding missing object data

In [189]:
X_char = X.select_dtypes(include=["object"])
print(X_char.isnull().sum().sum())
char_nn = [var for var in X_char.columns if X_char[var].isnull().sum()>0]
print("object data which is missing:",char_nn)

520
object data which is missing: ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']


# filling values

In [190]:
num_nn = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
char_nn = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']


In [191]:
mean_impute = Pipeline(steps=[("imputer",SimpleImputer(strategy="mean"))])
mode_impute = Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent"))])

In [192]:
transform_1 = ColumnTransformer(transformers=
                              [("num_mean",mean_impute,num_nn),("char_mode",mode_impute,char_nn)])

In [193]:
update_null = transform_1.fit_transform(X)
df_update_null = pd.DataFrame(update_null,columns=['LotFrontage', 'MasVnrArea', 'GarageYrBlt','MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'])

In [194]:
X.update(df_update_null)

In [195]:
print("Shape of train Data",X.shape,"\nShape of target Data",y.shape,sep="\n")

Shape of train Data
(1460, 74)

Shape of target Data
(1460,)


# encoding

In [196]:
lb = LabelEncoder()
k = np.array([lb.fit_transform(X[var]) for var in X_char.keys()])
k.resize((1460,38))
np = pd.DataFrame(k,columns=X_char.keys())
print("Shape of lb_df is-->",np.shape)
X.update(np)
print("Shape of X is-->",X.shape)


Shape of lb_df is--> (1460, 38)
Shape of X is--> (1460, 74)


## Linear Regression - ML Model

In [197]:
from sklearn.linear_model import LinearRegression

In [198]:
lr=LinearRegression()

In [201]:
X_train,X_test, Y_train,Y_test = train_test_split(X,y,test_size=.3)

In [202]:
lr.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [203]:
lr.intercept_

631682.0709731069

In [204]:
lr.coef_

array([-2.15560252e+02, -2.84985831e+02, -8.68639547e+01,  1.77293895e-01,
       -4.64720981e+02, -9.78257268e+02, -8.18845488e+02, -4.81836329e+02,
        4.20839820e+02,  4.46858904e+02,  1.17266072e+03, -1.56542333e+01,
       -3.01803764e+02, -5.76860720e+02,  1.25899999e+03,  1.73772179e+04,
        4.62786907e+03,  3.38689121e+02,  1.24622004e+02,  8.85559753e+02,
        4.76576989e+02, -2.14620580e+02, -1.33586532e+02, -5.77266759e+02,
        2.92434659e+01,  1.74425933e+02, -9.82083141e+02, -4.76607443e+02,
       -2.90940649e+01, -3.74186005e+02,  3.56304861e+02, -3.07308501e+02,
        4.77020136e+00, -8.54634854e+02,  5.74111861e+00, -2.32083906e+00,
        8.19048090e+00,  1.00804643e+03, -3.20356950e+02, -2.29451660e+02,
        4.36456442e+01,  1.02645294e+01,  1.70878533e+01,  7.01313631e+00,
        3.43655190e+01,  1.21699715e+04,  6.71259422e+03,  6.52273339e+03,
       -4.81238405e+03, -9.19446204e+03, -6.04802629e+03,  4.62625942e+02,
        4.42436042e+03, -

In [205]:
lr.predict(X_test)

array([256876.23126357, 115237.80779718, 238161.29514321, 160452.36478361,
       191785.07851606, 102594.89753466, 218018.70559884, 192579.5506758 ,
       310604.44306512, 197422.80190468, 199121.27710141, 143333.38691458,
       124088.23427869, 264277.11146699, 194199.04342527, 126211.53636944,
       200525.05617521, 289324.44912318, 155940.46837866, 252730.50437561,
       228668.51870523, 265083.75907163, 124784.24561374, 148398.95720186,
        94950.59421562, 123045.18945105, 148670.71825934, 268660.74127644,
       230461.02289729, 129794.76638149, 211275.62207859, 226354.55286413,
       143193.83396411, 324533.01156017,  67424.26432041, 164899.34704031,
       126708.91327427,  89152.12474688, 137998.63042154, 125564.7685011 ,
       117042.76260168, 129230.47274262, 119128.13018396, 217332.03749776,
       152294.12035569, 152235.98707853, 136232.34893197, 251833.75061931,
       136308.06422852, 192829.30302634, 137773.49136078, 159529.5387325 ,
       138402.72132594, 1

In [206]:
Y_test

602     220000
243     120000
1196    219210
892     154500
573     170000
         ...  
1291    119500
627     153000
374     219500
154     125000
293     235000
Name: SalePrice, Length: 438, dtype: int64

In [207]:
lr.score(X_test,Y_test)*100


83.17687473866022