In [11]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import joblib  # For saving and loading models

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('dataset/train.csv')

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [5]:
X = df.drop(columns=['SalePrice'])

y = df['SalePrice']

In [6]:
# Identiy categorical and numerical columns

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist() 

In [7]:
numerical_cols

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [9]:
categorical_cols 

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [14]:
numerical_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='mean')), # Simple imputer for numerical data used for to impute the null values accordingly 
    ('Scaler', StandardScaler()) # Standard scaler for numerical data used for to scale the numerical data accordingly (Nomalize data)
])

In [15]:
categorical_pipeline = Pipeline([
    ('Imputer', SimpleImputer(strategy='most_frequent')), # most_frequent means mode
    ('ohe_encoder', OneHotEncoder())
])

In [18]:
# combine both pipelines using ColumnTransformer

preprocessor = ColumnTransformer([('numerical_cols', numerical_pipeline, numerical_cols),
                                ('categorical_cols', categorical_pipeline, categorical_cols)])

In [19]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor), # Preprocessor for the data
    ('model', LinearRegression()) # Model for the data
])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
model_pipeline.fit(X_train, y_train)

In [22]:
# Save the entire pipeline (preprocessor + model)

joblib.dump(model_pipeline, 'houre_price_model_pipeline.pkl')

['houre_price_model_pipeline.pkl']

In [32]:
# load the pipeline and test prediction with a sample data (Input)

loaded_pipeline = joblib.load('houre_price_model_pipeline.pkl')

In [None]:
sample_input = X.iloc[0].to_dict()
sample_input_df = pd.DataFrame([sample_input])
sample_input_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal


In [37]:
predicted_price = loaded_pipeline.predict(sample_input_df)[0]

print('Predicted House Price:', predicted_price)

Predicted House Price: 209231.45149153259


In [38]:
# import numpy as np
# import pandas as pd
# import joblib

# # Load the saved model pipeline
# model_pipeline = joblib.load("house_price_model_pipeline.pkl")



# # Convert input into a DataFrame
# sample_input_df = pd.DataFrame([sample_input])

# # Predict house price
# predicted_price = model_pipeline.predict(sample_input_df)[0]
# print("Predicted House Price: ${:,.2f}".format(predicted_price)