# **House Price Prediction Model**

In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [114]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [115]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [117]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [118]:
df.shape

(1460, 81)

In [119]:
X = df.drop(columns='SalePrice')
y = df['SalePrice']

In [120]:
numeric_columns = []
object_columns = []

for column in X.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        numeric_columns.append(column)
    elif pd.api.types.is_object_dtype(df[column]):
        object_columns.append(column)

print(len(numeric_columns), len(object_columns))

37 43


In [121]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [122]:
handle_numerical = Pipeline(steps=[
    ('impute_numerical',SimpleImputer(strategy='mean')),
    ('scaling_numerical',StandardScaler())
])

In [123]:
handle_categorical = Pipeline(steps=[
    ('handle_categorical',SimpleImputer(strategy='most_frequent')),
    ('encode_categorical',OrdinalEncoder())
])

In [124]:
preprocessing = ColumnTransformer(transformers=[
    ('numerical',handle_numerical,numeric_columns),
    ('categorical',handle_categorical,object_columns)
],remainder='passthrough')

In [125]:
model = LinearRegression()

In [126]:
pipe = make_pipeline(preprocessing,model)

In [127]:
pipe.fit(X,y)

In [128]:
X_test = pd.read_csv('test.csv')

In [129]:
X_test.shape

(1459, 80)

In [130]:
sub = pd.read_csv('sample_submission.csv')
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [131]:
y_pred = pipe.predict(X_test)

In [132]:
id_df = X[['Id']]

y_pred_df = pd.DataFrame({'SalePrice': y_pred})

y_pred_final = pd.concat([id_df, y_pred_df], axis=1)

In [133]:
pd.DataFrame(y_pred_final)

Unnamed: 0,Id,SalePrice
0,1,106009.953704
1,2,160385.953704
2,3,163629.953704
3,4,186541.953704
4,5,189305.953704
...,...,...
1455,1456,57105.953704
1456,1457,135697.953704
1457,1458,116270.953704
1458,1459,244917.953704


In [134]:
y_pred_final.to_csv('Output.csv', index=False)

In [137]:
df1 = pd.read_csv('sample_submission.csv')

df2 = pd.read_csv('Output.csv')

if not df1.iloc[:, 0].equals(df2.iloc[:, 0]):
    df2.iloc[:, 0] = df1.iloc[:, 0]

    df2.to_csv('Output.csv', index=False)