# **House Price Prediction Model**

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [41]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [42]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [44]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [45]:
df.shape

(1460, 81)

In [46]:
df['Age'] = df['YrSold'] - df['YearBuilt']
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['Interaction'] = df['GrLivArea'] * df['OverallQual']

In [47]:
df.drop(columns=['YrSold','YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'OverallQual'], inplace=True)

In [48]:
df.shape

(1460, 77)

In [49]:
def cap_outliers(df):
    for column in df.select_dtypes(include='number').columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

df = cap_outliers(df)

In [50]:
X = df.drop(columns=['Id', 'SalePrice'])
y = df['SalePrice']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Id','SalePrice']), 
                                                    df['SalePrice'], 
                                                    test_size=0.3, 
                                                    random_state=42)

In [52]:
numeric_columns = X_train.select_dtypes(include=['number']).columns.tolist()
object_columns = X_train.select_dtypes(include=['object']).columns.tolist()

In [53]:
handle_numerical = Pipeline(steps=[
    ('impute_numerical', KNNImputer(n_neighbors=3,weights='distance')),
    ('scaling_numerical', RobustScaler())
])

In [54]:
handle_categorical = Pipeline(steps=[
    ('handle_categorical', SimpleImputer(strategy='most_frequent')),
    ('encode_categorical', OneHotEncoder(handle_unknown='ignore'))
])

In [55]:
preprocessing = ColumnTransformer(transformers=[
    ('numerical', handle_numerical, numeric_columns),
    ('categorical', handle_categorical, object_columns)
], remainder='passthrough')

In [56]:
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor()
}

In [57]:
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessing), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} - MAE: {mae}, RMSE: {rmse}, R2 Score: {r2}')

Linear Regression - MAE: 14400.414312423067, RMSE: 20717.762170436486, R2 Score: 0.908152975484075


  model = cd_fast.sparse_enet_coordinate_descent(


Lasso - MAE: 14125.773468845979, RMSE: 20258.451264144544, R2 Score: 0.9121803126127527
Random Forest - MAE: 13433.983207762556, RMSE: 19029.781728854057, R2 Score: 0.9225097577103216
Gradient Boosting - MAE: 12923.610649817443, RMSE: 18443.07265882232, R2 Score: 0.9272143173934813
XGBoost - MAE: 13709.517239226598, RMSE: 19344.81966531815, R2 Score: 0.9199228190188146


In [58]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 6],
    'model__learning_rate': [0.01, 0.1]
}

In [59]:
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessing), ('model', XGBRegressor())])
grid_search = GridSearchCV(xgb_pipeline, param_grid, cv=3, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

In [60]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'Best XGBoost Model - MAE: {mae}, RMSE: {rmse}, R2 Score: {r2}')

Best XGBoost Model - MAE: 12691.887922731165, RMSE: 18303.00578469134, R2 Score: 0.9283156686187746


In [61]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(best_model, X, y, cv=kf, scoring='neg_mean_absolute_error')
print(f'Cross-validated MAE: {-cv_results.mean()}')

Cross-validated MAE: 13507.999004708903


In [62]:
gb_model = Pipeline(steps=[('preprocessor', preprocessing), ('model', GradientBoostingRegressor())])
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
print('Gradient Boosting - MAE:', mean_absolute_error(y_test, y_pred), 
    'RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)),
    'R2 Score:', r2_score(y_test, y_pred))

Gradient Boosting - MAE: 12917.324265050502 RMSE: 18445.511837831644 R2 Score: 0.9271950636549605


In [63]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [64]:
test_df['Age'] = test_df['YrSold'] - test_df['YearBuilt']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']
test_df['Interaction'] = test_df['GrLivArea'] * test_df['OverallQual']

In [65]:
test_df.drop(columns=['YrSold','YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'OverallQual'], inplace=True)

In [66]:
test_df = cap_outliers(test_df)

In [67]:
y_pred_submission = best_model.predict(test_df.drop(columns=['Id']))

In [68]:
test_df['Id'] = test_df['Id'].astype('int64')

In [69]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,Age,TotalSF,Interaction
0,1461,20.0,RH,80.0,11622.0,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,0.0,6.0,WD,Normal,49.0,1778.0,4480.0
1,1462,20.0,RL,81.0,14267.0,Pave,,IR1,Lvl,AllPub,...,,,Gar2,0.0,6.0,WD,Normal,52.0,2658.0,7974.0
2,1463,60.0,RL,74.0,13830.0,Pave,,IR1,Lvl,AllPub,...,,MnPrv,,0.0,3.0,WD,Normal,13.0,2557.0,8145.0
3,1464,60.0,RL,78.0,9978.0,Pave,,IR1,Lvl,AllPub,...,,,,0.0,6.0,WD,Normal,12.0,2530.0,9624.0
4,1465,120.0,RL,43.0,5005.0,Pave,,IR1,HLS,AllPub,...,,,,0.0,1.0,WD,Normal,18.0,2560.0,10240.0


In [70]:
submission_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_pred_submission})
submission_df.to_csv('submission.csv', index=False)