<a href="https://www.kaggle.com/code/brmil07/house-prices-prediction?scriptVersionId=159495455" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Housing Prices Prediction**

## **Description**
This notebook aims to predict the housing prices based on the datased provided by "Housing Prices Competition for Kaggle Learn Users"

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

More info: [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/competitions/home-data-for-ml-course)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Read the data
df_train = pd.read_csv('../input/home-data-for-ml-course/train.csv', index_col='Id')
df_test = pd.read_csv('../input/home-data-for-ml-course/train.csv', index_col='Id')
x_train = df_train
x_test = df_test

# Remove rows with missing target, separate target from predictors
x_train.dropna(axis=0, subset=['SalePrice'], inplace=True)
y_train = x_train.SalePrice
x_train.drop(['SalePrice'], axis=1, inplace=True)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data Preparation and Analysis**

In [2]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt    

In [3]:
x_train.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [4]:
x_train.shape

(1460, 79)

In [5]:
x_train.duplicated().sum()

0

In [6]:
x_train.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

In [7]:
x_train.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

In [8]:
columns_with_null = x_train.columns[x_train.isnull().any()]
print("Columns with null/missing values:")
print(columns_with_null)

Columns with null/missing values:
Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')


In [9]:
missing_val_count_by_column = (x_train.isnull().sum())
column_list = missing_val_count_by_column[missing_val_count_by_column > 0]
print(column_list)

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [10]:
x_train.nunique()

MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
                 ... 
MiscVal            21
MoSold             12
YrSold              5
SaleType            9
SaleCondition       6
Length: 79, dtype: int64

# **Data Pre-Processing**

In [11]:
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(x_train, y_train, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = x_test[my_cols].copy()

In [12]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# **Create a Pipeline**

In [13]:
# Define model
model = RandomForestRegressor(n_estimators=1000, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

# Calculate MAE
mae = mean_absolute_error(y_valid, preds)
# Calculate R-squared and convert to percentage
r2_percentage = r2_score(y_valid, preds) * 100

print('MAE:', mae)
print('R-squared (%):', r2_percentage)

MAE: 17277.429907534246
R-squared (%): 84.01443580483424


# **Explore other Classification Models**

In [14]:
# Create a list of available models
models = [
    RandomForestRegressor(n_estimators=100, random_state=0),
    AdaBoostRegressor(n_estimators=50, learning_rate=0.1),
    GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
    LinearRegression(),
    XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
    LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3),
    SVR(kernel='rbf', C=1.0, epsilon=0.1),
    KNeighborsRegressor(n_neighbors=5),
    Ridge(alpha=1.0),
    Lasso(alpha=1.0),
    DecisionTreeRegressor(max_depth=3),
    CatBoostRegressor(iterations=100, verbose=False)
]

model_name = ['RandomForestRegressor','AdaBoostRegressor',
              'GradientBoostingRegressor', 'LinearRegression',
              'XGBRegressor','LGBMRegressor', 'SVR',
              'KNeighborsRegressor', 'Ridge', 'Lasso',
              'DecisionTreeRegressor', 'CatBoostRegressor'
             ]

# Create a DataFrame to store results
results_df = pd.DataFrame(columns=['Model', 'MAE'])

In [15]:
# Loop through each model
for model, name in zip(models, model_name):
    # Bundle preprocessing and modeling code in a pipeline
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)])

    # Preprocessing of training data, fit model
    clf.fit(X_train, y_train)

    # Preprocessing of validation data, get predictions
    preds = clf.predict(X_valid)

    # Calculate MAE
    mae = mean_absolute_error(y_valid, preds)

    # Append results to DataFrame
    results_df = results_df._append({'Model': name, 'MAE': mae}, ignore_index=True)

# Print the results DataFrame
print(results_df)

                        Model           MAE
0       RandomForestRegressor  17614.819932
1           AdaBoostRegressor  25682.753102
2   GradientBoostingRegressor  17048.409292
3            LinearRegression  24072.482760
4                XGBRegressor  17054.506047
5               LGBMRegressor  17712.899045
6                         SVR  55543.091171
7         KNeighborsRegressor  30342.865753
8                       Ridge  22419.783962
9                       Lasso  23158.433350
10      DecisionTreeRegressor  29727.593444
11          CatBoostRegressor  18070.031077


# **Create a Prediction for Test dataset**

In [16]:
# Preprocessing of test data, fit model
preds_test = clf.predict(X_test)

In [17]:
# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)