In [1]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np

# Importing Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Importing Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import GradientBoostingRegressor , AdaBoostRegressor
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

# Importing warnings and silencing them.
# NOTE: filterwarnings("ignore") suppresses every warning category for the whole
# session, including deprecation and convergence warnings raised by libraries.
import warnings
warnings.filterwarnings("ignore")

# Importing logging and configuring the root logger:
#   - level INFO and above is recorded
#   - each record is formatted as "[timestamp]: LEVEL: message"
#   - records are appended (filemode="a") to app.log in the working directory
import logging
logging.basicConfig(level=logging.INFO, 
                    format="[%(asctime)s]: %(levelname)s: %(message)s",
                    filename="app.log",
                    filemode="a")




In [2]:
# Importing the dataset
# Both CSV files are read directly from the project's GitHub repository.
D1 = 'https://raw.githubusercontent.com/anirudhajohare19/House_Prices_Prediction_MLModel/refs/heads/main/research/train.csv'
D2 = 'https://raw.githubusercontent.com/anirudhajohare19/House_Prices_Prediction_MLModel/refs/heads/main/research/test.csv'

# D1 -> Train, D2 -> Test (the generator is consumed in that order)
Train, Test = (pd.read_csv(url) for url in (D1, D2))

# frac=1 samples 100% of the rows, i.e. shows the whole training frame in shuffled order
Train.sample(frac=1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
153,154,20,RL,,13500,Pave,,Reg,Lvl,AllPub,...,0,,,,0,3,2008,WD,Normal,235000
763,764,60,RL,82.0,9430,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2009,WD,Normal,337000
923,924,120,RL,50.0,8012,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2008,WD,Normal,193000
1152,1153,20,RL,90.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2006,WD,Abnorml,230000
230,231,20,RL,73.0,8760,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,148000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,560,120,RL,,3196,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2006,WD,Normal,234000
166,167,20,RL,,10708,Pave,,IR1,Lvl,AllPub,...,0,,GdWo,,0,11,2009,COD,Normal,190000
1174,1175,70,RL,80.0,16560,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,239000
1173,1174,50,RL,138.0,18030,Pave,,IR1,Bnk,AllPub,...,0,,MnPrv,,0,3,2007,WD,Normal,200500


In [3]:
# Checking the shape of the training dataset as (rows, columns)
Train.shape

(1460, 81)

In [4]:
# Checking the column overview: dtype and non-null count of every column
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Checking for missing values:
# count the nulls in every column, keep only the columns that have at least one,
# and list them from most to least affected.
null_counts = Train.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)
missing

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
dtype: int64

In [6]:
# Dropping the columns that are almost entirely missing (see the null counts above).
# The same columns are removed from both the training and the testing dataset so
# that the two frames keep an identical feature layout.
sparse_cols = ['PoolQC', 'Alley', 'Fence', 'MiscFeature']

# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone is a no-op instead of raising KeyError.
Train.drop(columns=sparse_cols, errors='ignore', inplace=True)
Test.drop(columns=sparse_cols, errors='ignore', inplace=True)

### 1. Dropping the "PoolQC", "MiscFeature", "Alley" and "Fence" features because more than 80% of their values are missing

In [7]:
# Filling the missing values with "None"
# For these categorical columns a missing entry is replaced by the literal
# string "None"; the same column list is used for both datasets.
none_cols = ['FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'MasVnrType']

# Training dataset imputation first, then testing dataset imputation
for frame in (Train, Test):
    for col in none_cols:
        frame[col] = frame[col].fillna("None")


Categorical features where "NA" means "None".

Examples: 'GarageType', 'MasVnrType', 'GarageCond', 'FireplaceQu'


These are not true missing values—they mean the house doesn't have that feature.

In [8]:
# Fill numeric columns with zero (indicating absence)
# The same two columns are imputed in both datasets.
zero_fill_cols = ['GarageYrBlt', 'MasVnrArea']

# Training dataset imputation first, then testing dataset imputation
for frame in (Train, Test):
    for col in zero_fill_cols:
        frame[col] = frame[col].fillna(0)

### Numeric features where missing = 0 (feature doesn’t exist)

### Examples:

### GarageYrBlt (no garage)

### MasVnrArea (no masonry veneer)

In [9]:
# Impute with the median for numeric features where the value is genuinely missing.
# The median is computed once, from the TRAINING data only, and reused for the
# test data: both sets are then filled with the same value, and no statistic
# derived from the test set enters the preprocessing.
lot_frontage_median = Train['LotFrontage'].median()
Train['LotFrontage'] = Train['LotFrontage'].fillna(lot_frontage_median)
Test['LotFrontage'] = Test['LotFrontage'].fillna(lot_frontage_median)


In [10]:
# Checking Summary statistics and Outliers for numeric features
from collections import OrderedDict

# Descriptive statistics: one record per column whose dtype is not 'object'
stats = []
for col in Train.columns:
    series = Train[col]
    if series.dtype == 'object':
        continue

    # Quartiles and mode are computed once and reused below
    lower_q = series.quantile(0.25)
    upper_q = series.quantile(0.75)
    modes = series.mode()

    stats.append(OrderedDict([
        ('Feature', col),
        ('Minimum', series.min()),
        ('Maximum', series.max()),
        ('Mean', series.mean()),
        ('Mode', modes[0] if not modes.empty else None),
        ('25%', lower_q),
        ('75%', upper_q),
        ('IQR', upper_q - lower_q),
        ('Standard Deviation', series.std()),
        ('Skewness', series.skew()),
        ('Kurtosis', series.kurt()),
    ]))

# Convert to DataFrame
report = pd.DataFrame(stats)

# Outlier identification: a feature "has outliers" when at least one value lies
# beyond the 1.5 * IQR whiskers around the quartiles.
outlier_label = []
for col in report['Feature']:
    values = Train[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower_whisker = q1 - 1.5 * spread
    upper_whisker = q3 + 1.5 * spread
    beyond_whiskers = (values < lower_whisker) | (values > upper_whisker)
    outlier_label.append("Has Outliers" if beyond_whiskers.any() else "No Outliers")

report["Outlier Comment"] = outlier_label

# Checking Report
report

Unnamed: 0,Feature,Minimum,Maximum,Mean,Mode,25%,75%,IQR,Standard Deviation,Skewness,Kurtosis,Outlier Comment
0,Id,1.0,1460.0,730.5,1.0,365.75,1095.25,729.5,421.610009,0.0,-1.2,No Outliers
1,MSSubClass,20.0,190.0,56.89726,20.0,20.0,70.0,50.0,42.300571,1.407657,1.580188,Has Outliers
2,LotFrontage,21.0,313.0,69.863699,69.0,60.0,79.0,19.0,22.027677,2.409147,21.912954,Has Outliers
3,LotArea,1300.0,215245.0,10516.828082,7200.0,7553.5,11601.5,4048.0,9981.264932,12.207688,203.243271,Has Outliers
4,OverallQual,1.0,10.0,6.099315,5.0,5.0,7.0,2.0,1.382997,0.216944,0.096293,Has Outliers
5,OverallCond,1.0,9.0,5.575342,5.0,5.0,6.0,1.0,1.112799,0.693067,1.106413,Has Outliers
6,YearBuilt,1872.0,2010.0,1971.267808,2006.0,1954.0,2000.0,46.0,30.202904,-0.613461,-0.439552,Has Outliers
7,YearRemodAdd,1950.0,2010.0,1984.865753,1950.0,1967.0,2004.0,37.0,20.645407,-0.503562,-1.272245,No Outliers
8,MasVnrArea,0.0,1600.0,103.117123,0.0,0.0,164.25,164.25,180.731373,2.677616,10.141416,Has Outliers
9,BsmtFinSF1,0.0,5644.0,443.639726,0.0,0.0,712.25,712.25,456.098091,1.685503,11.118236,Has Outliers


In [11]:
# Replace Outliers with Median Strategy
# NOTE: this step is currently disabled. The loop below is wrapped in a
# triple-quoted string, so none of it is executed and Train is left unchanged;
# the cell only evaluates (and echoes) the string itself.

'''
for col in Train.select_dtypes(include='number').columns:
    Q1 = Train[col].quantile(0.25)
    Q3 = Train[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = (Train[col] < lower_bound) | (Train[col] > upper_bound)
    outlier_count = outliers.sum()

    if outlier_count > 0:
        replacement = Train[col].median()  
        Train.loc[outliers, col] = replacement
        print(f"Replaced {outlier_count} outliers in '{col}' with median.")
    else:
        print(f"No outliers found in '{col}'.")
'''

'\nfor col in Train.select_dtypes(include=\'number\').columns:\n    Q1 = Train[col].quantile(0.25)\n    Q3 = Train[col].quantile(0.75)\n    IQR = Q3 - Q1\n\n    lower_bound = Q1 - 1.5 * IQR\n    upper_bound = Q3 + 1.5 * IQR\n\n    # Identify outliers\n    outliers = (Train[col] < lower_bound) | (Train[col] > upper_bound)\n    outlier_count = outliers.sum()\n\n    if outlier_count > 0:\n        replacement = Train[col].median()  \n        Train.loc[outliers, col] = replacement\n        print(f"Replaced {outlier_count} outliers in \'{col}\' with median.")\n    else:\n        print(f"No outliers found in \'{col}\'.")\n'

In [12]:
# Using Label Encoding
from sklearn.preprocessing import LabelEncoder

# Stack the training features (target removed) on top of the test frame so that
# every categorical column is encoded with one shared mapping.
train_features = Train.drop(columns='SalePrice')
combined = pd.concat([train_features, Test], axis=0)

# Categorical columns are the ones stored with dtype 'object'
cat_cols = combined.select_dtypes(include='object').columns

# One encoder per column, kept in label_encoders so the mapping can be looked up later.
# NOTE(review): object columns that still contain NaN (e.g. 'Electrical', which has
# one missing value in Train and is not imputed above) are passed to the encoder
# as-is - confirm this is intended.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
    label_encoders[col] = le


In [13]:
# Reassign to original train/test:
# the first len(Train) rows of `combined` came from Train, the rest from Test.
# NOTE(review): 'Id' is still one of the feature columns at this point - confirm
# whether a row identifier should be used as a predictor.
n_train_rows = Train.shape[0]
X_train = combined.iloc[:n_train_rows]
X_test = combined.iloc[n_train_rows:]

# Target values for the training rows
y_train = Train['SalePrice']


#### X_train → fully preprocessed training features

#### y_train → target values

#### X_test → clean test data ready for prediction

In [14]:
# Train-Test Split

from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
split_options = dict(test_size=0.30, random_state=42)
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(X_train, y_train, **split_options)


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Fit a linear regression baseline on the training split
lr = LinearRegression().fit(X_train_split, y_train_split)

# Predict on validation set
y_pred = lr.predict(X_val_split)

# Evaluate: RMSE is in the units of SalePrice, R² is unitless
rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
r2 = r2_score(y_val_split, y_pred)

print(f"Linear Regression RMSE: {rmse:.2f}")
print(f"Linear Regression R² Score: {r2:.4f}")
 

Linear Regression RMSE: 32595.55
Linear Regression R² Score: 0.8477


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; random_state fixes the result across runs
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_split, y_train_split)

# Score on the validation split
y_pred_rf = rf.predict(X_val_split)
rmse_rf = np.sqrt(mean_squared_error(y_val_split, y_pred_rf))
r2 = r2_score(y_val_split, y_pred_rf)

print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Random Forest R² Score: {r2:.4f}")


Random Forest RMSE: 26428.13
Random Forest R² Score: 0.8999


In [17]:
# XGBoost model
from xgboost import XGBRegressor

# NOTE(review): the name `xgb` is also the alias of the xgboost module imported at
# the top of the notebook; from here on it refers to the fitted model instead.
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train_split, y_train_split)

# Score on the validation split
y_pred_xgb = xgb.predict(X_val_split)
squared_error_xgb = mean_squared_error(y_val_split, y_pred_xgb)
rmse_xgb = np.sqrt(squared_error_xgb)
r2 = r2_score(y_val_split, y_pred_xgb)

print(f"XGBoost R² Score: {r2:.4f}")
print(f"XGBoost RMSE: {rmse_xgb:.2f}")



XGBoost R² Score: 0.9004
XGBoost RMSE: 26365.29


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from xgboost import XGBRegressor

# Model Training (same configuration as the previous XGBoost cell)
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train_split, y_train_split)

# Model Prediction
y_pred_xgb = xgb.predict(X_val_split)

# Evaluation Metrics on the validation split
mae = mean_absolute_error(y_val_split, y_pred_xgb)
mse = mean_squared_error(y_val_split, y_pred_xgb)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_val_split, y_pred_xgb) * 100  # In percentage
r2 = r2_score(y_val_split, y_pred_xgb)

# Print Results: one line per metric, emitted in a single call
summary_lines = [
    "XGBoost Evaluation Metrics:",
    "----------------------------",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
    f"R² Score: {r2:.4f}",
]
print("\n".join(summary_lines))

XGBoost Evaluation Metrics:
----------------------------
Mean Absolute Error (MAE): 17606.08
Mean Squared Error (MSE): 695128384.00
Root Mean Squared Error (RMSE): 26365.29
Mean Absolute Percentage Error (MAPE): 10.32%
R² Score: 0.9004
