<a href="https://www.kaggle.com/code/brmil07/house-prices-prediction?scriptVersionId=160870622" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Housing Prices Prediction**
---
## **Description**
---
This notebook aims to predict the housing prices based on the datased provided by "Housing Prices Competition for Kaggle Learn Users"

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

More info: [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/competitions/home-data-for-ml-course)

In [1]:
import numpy as np 
import pandas as pd 

from scipy.stats import uniform, randint

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Read the data
df_train = pd.read_csv('../input/home-data-for-ml-course/train.csv', index_col='Id')
df_test = pd.read_csv('../input/home-data-for-ml-course/test.csv', index_col='Id')
x_train = df_train
x_test = df_test

# Remove rows with missing target, separate target from predictors
x_train.dropna(axis=0, subset=['SalePrice'], inplace=True)
y_train = x_train.SalePrice
x_train.drop(['SalePrice'], axis=1, inplace=True)

# **Exploratory Data Analysis**
---

In [2]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt    

In [3]:
x_train.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [4]:
x_train.shape

(1460, 79)

In [5]:
x_train.duplicated().sum()

0

In [6]:
x_train.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

In [7]:
x_train.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

In [8]:
columns_with_null = x_train.columns[x_train.isnull().any()]
print("Columns with null/missing values:")
print(columns_with_null)

Columns with null/missing values:
Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')


In [9]:
missing_val_count_by_column = (x_train.isnull().sum())
column_list = missing_val_count_by_column[missing_val_count_by_column > 0]
print(column_list)

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [10]:
x_train.nunique()

MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
                 ... 
MiscVal            21
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
Length: 79, dtype: int64

# **Data Pre-Processing**
---

In [11]:
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(x_train, y_train, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = x_test[my_cols].copy()

In [12]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# **Create a Pipeline**
---

In [13]:
# Define model
model = RandomForestRegressor(n_estimators=1000, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

mae = mean_absolute_error(y_valid, preds)
mse = mean_squared_error(y_valid, preds)
r2_percentage = r2_score(y_valid, preds) * 100

print('MAE:', mae)
print('MSE:', mse)
print('R-squared (%):', r2_percentage)

MAE: 17277.429907534246
MSE: 1103939079.217621
R-squared (%): 84.01443580483424


# **Explore other Classification Models**
---

In [14]:
# Create a list of available models
models = [
    RandomForestRegressor(n_estimators=100, random_state=0),
    AdaBoostRegressor(n_estimators=50, learning_rate=0.1),
    GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
    LinearRegression(),
    XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
    LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
    SVR(kernel='rbf', C=1.0, epsilon=0.1),
    KNeighborsRegressor(n_neighbors=5),
    Ridge(alpha=1.0),
    Lasso(alpha=1.0),
    DecisionTreeRegressor(max_depth=3),
    CatBoostRegressor(iterations=100, verbose=False)
]

model_name = ['RandomForestRegressor',
              'AdaBoostRegressor',
              'GradientBoostingRegressor', 
              'LinearRegression',
              'XGBRegressor',
              'LGBMRegressor', 
              'SVR',
              'KNeighborsRegressor', 
              'Ridge', 
              'Lasso',
              'DecisionTreeRegressor', 
              'CatBoostRegressor']

# Create a DataFrame to store results
results_df = pd.DataFrame(columns=['Model', 'MAE'])

In [15]:
# Loop through each model
for model, name in zip(models, model_name):
    # Bundle preprocessing and modeling code in a pipeline
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])

    # Preprocessing of training data, fit model
    clf.fit(X_train, y_train)

    # Preprocessing of validation data, get predictions
    preds = clf.predict(X_valid)

    # Calculate MAE
    mae = mean_absolute_error(y_valid, preds)

    # Append results to DataFrame
    results_df = results_df._append({'Model': name, 'MAE': mae}, ignore_index=True)

In [16]:
results_df['Rank'] = results_df['MAE'].rank(ascending=True, method='first').astype(int)
results_df = results_df.sort_values(by='Rank')

In [17]:
results_df

Unnamed: 0,Model,MAE,Rank
4,XGBRegressor,17054.506047,1
2,GradientBoostingRegressor,17123.721602,2
0,RandomForestRegressor,17614.819932,3
5,LGBMRegressor,17712.899045,4
11,CatBoostRegressor,18070.031077,5
8,Ridge,22419.783962,6
9,Lasso,23158.43335,7
3,LinearRegression,24072.48276,8
1,AdaBoostRegressor,26514.361794,9
10,DecisionTreeRegressor,29727.593444,10


In [18]:
# Perform cross-validation on the training data
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Display cross-validated scores
print('Cross-validated MAE scores:', -cv_scores)
print('Mean Cross-validated MAE:', -cv_scores.mean())

Cross-validated MAE scores: [14604.28308559 18564.68866041 17217.29945068 17327.8868361
 13499.00035437]
Mean Cross-validated MAE: 16242.631677426738


# **Improve the Performance of the Model**
---

In [19]:
# Find the best model object based on its name
best_model_index = model_name.index(results_df.iloc[0]['Model'])
best_model = models[best_model_index]

print(f"Best Model: {best_model}")

Best Model: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [20]:
# Define the parameter grid to search
param_dist = {'model__n_estimators': randint(50, 500),
              'model__learning_rate': uniform(0.01, 0.3),
              'model__max_depth': randint(3, 10),
              'model__min_samples_split': randint(1, 10),
              'model__min_samples_leaf': randint(1, 10),
              'model__subsample': uniform(0.6, 0.4)  # Adjust the range based on your preference
              # Add other hyperparameters to tune
             }

In [21]:
# Create a pipeline with the best model
best_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('model', best_model)])

# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(best_model_pipeline, 
                                   param_distributions=param_dist,
                                   n_iter=50, 
                                   cv=5, 
                                   scoring='neg_mean_absolute_error', 
                                   n_jobs=-1, 
                                   random_state=0)
random_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model_tuned = random_search.best_estimator_

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parame

In [22]:
# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'model__learning_rate': 0.051166126029158376, 'model__max_depth': 3, 'model__min_samples_leaf': 5, 'model__min_samples_split': 4, 'model__n_estimators': 357, 'model__subsample': 0.7533855576687591}


# **Create a Prediction for Test Dataset**
---

In [23]:
# Get predictions using the best model
preds_test = best_model_pipeline.predict(X_test)

In [24]:
# Use the best model to predict the test data
preds_test_tuned = best_model_tuned.predict(X_test)

In [25]:
# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index,'SalePrice': preds_test_tuned})
output

Unnamed: 0,Id,SalePrice
0,1461,117167.085938
1,1462,163785.703125
2,1463,187035.718750
3,1464,188975.968750
4,1465,190169.281250
...,...,...
1454,2915,83641.828125
1455,2916,81679.382812
1456,2917,168182.609375
1457,2918,116973.031250


In [26]:
output.to_csv('submission.csv', index=False)