###  House Price Prediction App 

In [1]:
# Imports and Configurations
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

#from category_encoders import MEstimateEncoder,CatBoostEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge 
from scipy import stats
from scipy.stats import norm, skew

In [2]:
# function to load data
def load_data(path):
    """Load a housing CSV and return it ready for modelling.

    The file at `path` is read with its "ID" column as the index, then
    passed through `encode` (label-encodes object columns) and `impute`
    (fills missing values), in that order.
    """
    raw = pd.read_csv(path, index_col="ID")
    # Encoding runs before imputation, exactly as the two helpers are chained
    return impute(encode(raw))

In [3]:
def encode(df, encoders=None):
    """Label-encode every object-dtype column of `df` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns are replaced by integer codes.
    encoders : dict, optional
        Mapping of column name -> fitted LabelEncoder. When given, a column
        that already has an entry is transformed with that encoder instead
        of fitting a new one, and every newly fitted encoder is stored in
        the mapping. Passing the same dict for the training and the test
        frame guarantees both use one category -> code mapping. With the
        default (None) each call fits its own encoders, so the codes of two
        separately encoded frames only agree when their columns contain the
        same set of values.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with its object columns encoded.

    Raises
    ------
    ValueError
        Raised by LabelEncoder.transform when a reused encoder meets a
        label it was not fitted on.
    """
    # Collect the categorical (object-dtype) columns
    cat_columns = df.dtypes[df.dtypes == "object"].index.to_list()

    for c in cat_columns:
        values = list(df[c].values)
        if encoders is not None and c in encoders:
            # Reuse the mapping learned on an earlier frame
            label_encoder = encoders[c]
        else:
            label_encoder = LabelEncoder()
            label_encoder.fit(values)
            if encoders is not None:
                encoders[c] = label_encoder
        df[c] = label_encoder.transform(values)
    return df

In [4]:
def impute(df):
    """Fill missing values in `df` in place and return it.

    Numeric columns get 0; categorical columns get the label "None".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with no missing values left in its numeric
        and categorical columns.
    """
    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes("category"):
        column = df[name]
        # A categorical can only be filled with one of its declared
        # categories, so register "None" first; otherwise fillna raises.
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        df[name] = column.fillna("None")
    return df

In [5]:
# Training set: read, label-encoded and imputed by load_data
df_train = load_data('Housing_dataset_train.csv')

In [6]:
# Test set: goes through the same load_data pipeline as the training set.
# NOTE(review): load_data encodes each file independently, so the category
# codes of the two frames only line up if both files contain the same set of
# values in every text column - confirm.
df_test = load_data('Housing_dataset_test.csv')

In [7]:
# Peek at the values
display(df_train.head())
display(df_test.head())

# Display information about dtypes and missing values.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after each report.
df_train.info()
df_test.info()

Unnamed: 0_level_0,loc,title,bedroom,bathroom,parking_space,price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3583,19,7,2.0,2.0,1.0,1149999.565
2748,27,0,0.0,2.0,4.0,1672416.689
9261,12,10,7.0,5.0,0.0,3364799.814
2224,3,3,5.0,2.0,4.0,2410306.756
10300,21,8,0.0,5.0,6.0,2600700.898


Unnamed: 0_level_0,loc,title,bedroom,bathroom,parking_space
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
845,18,6,4,1,2
1924,1,0,2,2,4
10718,1,1,2,7,2
12076,23,5,9,5,2
12254,14,7,5,6,1


<class 'pandas.core.frame.DataFrame'>
Index: 14000 entries, 3583 to 8787
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   loc            14000 non-null  int32  
 1   title          14000 non-null  int32  
 2   bedroom        14000 non-null  float64
 3   bathroom       14000 non-null  float64
 4   parking_space  14000 non-null  float64
 5   price          14000 non-null  float64
dtypes: float64(4), int32(2)
memory usage: 656.2 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 6000 entries, 845 to 11736
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   loc            6000 non-null   int32
 1   title          6000 non-null   int32
 2   bedroom        6000 non-null   int64
 3   bathroom       6000 non-null   int64
 4   parking_space  6000 non-null   int64
dtypes: int32(2), int64(3)
memory usage: 234.4 KB
None


Establishing a baseline

In [8]:
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of `model` on (X, y).

    The target is log-transformed with np.log, the model is scored with
    negative mean squared error on that log target, and the square root of
    the mean fold error is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is not modified; categorical columns are replaced
        by their integer codes on an internal copy.
    y : array-like
        Target values; they must be strictly positive for np.log.
    model : estimator, optional
        Estimator handed to cross_val_score. Defaults to a new
        XGBRegressor() created for this call.

    Returns
    -------
    float
        Square root of the mean cross-validated squared error on log(y).
    """
    # Build the default per call: a default written in the signature is
    # instantiated once at definition time and shared by every later call.
    if model is None:
        model = XGBRegressor()
    # Work on a copy so the caller's frame keeps its categorical columns
    X = X.copy()
    # Label encoding for categoricals (suits tree models such as XGBoost)
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    # Metric for housing price prediction is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", error_score="raise",
    )
    # Scores are negated MSE values: flip the sign before taking the root
    return np.sqrt(-1 * score.mean())

In [9]:
# Split the training frame into target and predictors; df_train is left intact
y = df_train["price"].copy()
X = df_train.drop(columns=["price"])

# Score the untouched feature set to obtain the reference error
baseline_score = score_dataset(X, y)
print(f"Baseline score: {baseline_score:.5f} RMSLE")

Baseline score: 0.20134 RMSLE


This baseline score lets us check whether a set of features we assemble later actually improves the model.

### Feature Utility Scores
Mutual information is used to compute a utility score for each feature, giving an indication of how much potential the feature has. I used two helpers, `make_mi_scores` and `plot_mi_scores`:

In [10]:
def make_mi_scores(X, y):
    """Return the mutual-information score of every column of X against y.

    Object and category columns are factorized into integer codes on a copy
    of X, integer-typed columns are declared discrete, and the scores come
    back as a Series named "MI Scores", sorted from highest to lowest.
    """
    features = X.copy()
    for col in features.select_dtypes(["object", "category"]):
        codes, _uniques = features[col].factorize()
        features[col] = codes
    # Every integer-typed column is treated as a discrete feature
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw the scores as a horizontal bar chart on the current pyplot figure.

    Bars are ordered so the largest score ends up at the top, and each bar
    is labelled with the corresponding index entry of `scores`.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

In [11]:
# Rebuild the target and the predictor matrix from the training frame
y = df_train["price"].copy()
X = df_train.drop(columns=["price"])

# Rank the features by their mutual information with the price
mi_scores = make_mi_scores(X, y)
mi_scores

bedroom          0.585583
title            0.314402
bathroom         0.270872
parking_space    0.217323
loc              0.144495
Name: MI Scores, dtype: float64

import optuna

def objective(trial):
    """Optuna objective: score one sampled XGBoost configuration.

    Each hyperparameter is drawn from `trial`, an XGBRegressor is built from
    the sample, and the value returned is score_dataset's result on the
    notebook-level X and y.
    """
    search_space = {
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 8000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    }
    candidate = XGBRegressor(**search_space)
    return score_dataset(X, y, candidate)

# Seed the sampler so that re-running the search proposes the same sequence of
# trials; with an unseeded sampler the "best" parameters change on every run.
# TPESampler is the sampler create_study uses by default, so only the seed is new.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=20)
# Best hyperparameter set found across the 20 trials
xgb_params = study.best_params

In [12]:
# Train on every labelled row, predict for the rows of the test file
X_train, y_train = X, y
X_test = df_test

xgb = XGBRegressor(
    max_depth=3,
    learning_rate=0.009913885869059601,
    n_estimators=4419,
    min_child_weight=5,
    colsample_bytree=0.6625613662993546,
    subsample=0.6896304470672804,
    reg_alpha=3.37564216178497,
    reg_lambda=0.0032953751479159097,
)
# XGBoost minimises squared error while the competition metric is RMSLE, so the
# model is fitted on log(price) and its predictions are mapped back with exp.
xgb.fit(X_train, np.log(y_train))
predictions = np.exp(xgb.predict(X_test))

# One row per test ID, written in the submission layout
output = pd.DataFrame({'ID': X_test.index, 'price': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [13]:
import pickle

# Persist the fitted model. The context manager closes the file handle (and
# flushes the pickle to disk) even if dump() raises; a bare open() passed
# straight to pickle.dump is never closed explicitly.
with open('model_5.pkl', 'wb') as file:
    pickle.dump(xgb, file)
          

In [14]:
# Reload the model that was pickled to 'model_5.pkl' in the previous cell.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('model_5.pkl', 'rb') as file:
    # Despite its name, model_classifier holds the pickled XGBRegressor
    model_classifier = pickle.load(file)

In [30]:
# Single hand-made listing, with columns in the same order as the training features
r_test_dict = pd.DataFrame(
    {
        'loc': [19],
        'title': [5],
        'bedroom': [5],
        'bathroom': [5],
        'parking_space': [1],
    }
)

r_test_dict

Unnamed: 0,loc,title,bedroom,bathroom,parking_space
0,19,5,5,5,1


In [31]:
# Wrap the sample row in a DataFrame and predict its price with the reloaded
# model; the model outputs log(price), so np.exp maps it back to a price.
r_test = pd.DataFrame(r_test_dict)
predi =np.exp(model_classifier.predict(r_test))
predi

array([3446868.], dtype=float32)

In [32]:
# Same prediction as the previous cell, passing r_test_dict (already a
# DataFrame) to the model directly instead of the r_test wrapper.
predi =np.exp(model_classifier.predict(r_test_dict))
predi

array([3446868.], dtype=float32)

In [None]:
# First element of the prediction array, i.e. the price for the single sample row
predi[0]