In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Setting up matplotlib
%matplotlib inline
plt.rc("figure", dpi=100)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [2]:
file_path_train = "./Bangalore_house_prices/train.csv"
file_path_test = "./Bangalore_house_prices/test.csv"
df_train = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5132 entries, 0 to 5131
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   area_type       5132 non-null   object 
 1   availability    5132 non-null   object 
 2   location        5132 non-null   object 
 3   total_sqft      5132 non-null   float64
 4   bath            5132 non-null   int64  
 5   balcony         5132 non-null   int64  
 6   price           5132 non-null   float64
 7   latitude        5132 non-null   float64
 8   longitude       5132 non-null   float64
 9   bhk             5132 non-null   int64  
 10  price_per_sqft  5132 non-null   float64
dtypes: float64(5), int64(3), object(3)
memory usage: 441.2+ KB


In [3]:
# Dropping columns due to low MI scores
df_train.drop(["area_type", "availability", "balcony", "price_per_sqft", "latitude", "longitude"], axis=1, inplace=True)
df_test.drop(["area_type", "availability", "balcony", "price_per_sqft", "latitude", "longitude"], axis=1, inplace=True)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5132 entries, 0 to 5131
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    5132 non-null   object 
 1   total_sqft  5132 non-null   float64
 2   bath        5132 non-null   int64  
 3   price       5132 non-null   float64
 4   bhk         5132 non-null   int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 200.6+ KB


In [5]:
df_train = df_train[df_train.location != "other"]

In [6]:
df_train.tail()

Unnamed: 0,location,total_sqft,bath,price,bhk
4132,YESHWANTHPUR,672.0,1,36.85,1
4133,YESHWANTHPUR,1693.0,3,108.0,3
4134,YESHWANTHPUR,1713.0,3,110.0,3
4135,YESHWANTHPUR,674.0,1,36.85,1
4136,YESHWANTHPUR,673.0,1,36.85,1


In [7]:
df_train.location.nunique()

191

In [8]:
df_test = df_test[df_test.location != "other"]

In [9]:
df_test.tail()

Unnamed: 0,location,total_sqft,bath,price,bhk
982,YESHWANTHPUR,770.0,1.0,70.0,2.0
983,YESHWANTHPUR,668.0,1.0,36.85,1.0
984,YESHWANTHPUR,667.0,1.0,36.85,1.0
985,YESHWANTHPUR,1385.0,2.0,76.18,3.0
986,YESHWANTHPUR,1541.0,2.0,130.0,2.0


In [10]:
X_train = df_train.copy()
y_train = X_train.pop("price")

X_test = df_test.copy()
y_test = X_test.pop("price")

In [11]:
X_train.head()

Unnamed: 0,location,total_sqft,bath,bhk
0,1ST PHASE JP NAGAR,1875.0,3,3
1,1ST PHASE JP NAGAR,1590.0,3,3
2,1ST PHASE JP NAGAR,1566.0,2,2
3,1ST PHASE JP NAGAR,2065.0,4,3
4,1ST PHASE JP NAGAR,1394.0,2,2


In [12]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train.location.to_frame()))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test.location.to_frame()))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# One-hot encoding removed column names
# OH_cols_train.columns = OH_encoder.get_feature_names_out()

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop("location", axis=1)
num_X_test = X_test.drop("location", axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [13]:
OH_X_train.head()

Unnamed: 0,total_sqft,bath,bhk,0,1,2,3,4,5,6,...,181,182,183,184,185,186,187,188,189,190
0,1875.0,3,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1590.0,3,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1566.0,2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2065.0,4,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1394.0,2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
y_train.head()

0    167.0
1    131.0
2    180.0
3    210.0
4     85.0
Name: price, dtype: float64

## Creating Pipeline

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Getting numerical and categorical columns
numerical_cols = X_train.select_dtypes(exclude=["object"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

## Model Building

---

### Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

# Bundle preprocessing and modeling code in a pipeline
lr_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lr_model)
])

lr_pipe.fit(X_train, y_train)

In [17]:
pd.Series({
    "Train Score": lr_pipe.score(X_train, y_train),
    "Test Score": lr_pipe.score(X_test, y_test)
})

Train Score    0.864645
Test Score     0.790460
dtype: float64

In [18]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
lr_cv_score = cross_val_score(lr_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", lr_cv_score, "\n",
    "Cross Validation Mean Score \t", lr_cv_score.mean(), sep=""
)

Cross Validation Score 		[0.83154931 0.88158449 0.86728667 0.78621933 0.86571685]
Cross Validation Mean Score 	0.8464713287043215


---

### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor()

# Bundle preprocessing and modeling code in a pipeline
dt_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dt_model)
])

dt_pipe.fit(X_train, y_train)

In [20]:
pd.Series({
    "Train Score": dt_pipe.score(X_train, y_train),
    "Test Score": dt_pipe.score(X_test, y_test)
})

Train Score    0.997646
Test Score     0.890134
dtype: float64

In [21]:
dt_cv_score = cross_val_score(dt_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", dt_cv_score, "\n",
    "Cross Validation Mean Score \t", dt_cv_score.mean(), sep=""
)

Cross Validation Score 		[0.80136546 0.85213575 0.83153919 0.66637307 0.86501596]
Cross Validation Mean Score 	0.803285887189919


---

### Random Forest

In [22]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()

# Bundle preprocessing and modeling code in a pipeline
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

rf_pipe.fit(X_train, y_train)

In [23]:
pd.Series({
    "Train Score": rf_pipe.score(X_train, y_train),
    "Test Score": rf_pipe.score(X_test, y_test)
})

Train Score    0.962523
Test Score     0.873224
dtype: float64

In [24]:
rf_cv_score = cross_val_score(rf_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", rf_cv_score, "\n",
    "Cross Validation Mean Score \t", rf_cv_score.mean(), sep=""
)

Cross Validation Score 		[0.7781805  0.84256183 0.85730475 0.62747576 0.89454829]
Cross Validation Mean Score 	0.800014228853595


---

### XGBoost

In [25]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor()

# Bundle preprocessing and modeling code in a pipeline
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model)
])

bundle = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

bundle.fit(X_train)
eval_test = bundle.transform(X_test)

xgb_pipe.fit(X_train, y_train)

  from pandas import MultiIndex, Int64Index


In [26]:
pd.Series({
    "Train Score": xgb_pipe.score(X_train, y_train),
    "Test Score": xgb_pipe.score(X_test, y_test)
})

Train Score    0.975391
Test Score     0.931105
dtype: float64

In [27]:
xgb_cv_score = cross_val_score(xgb_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", xgb_cv_score, "\n",
    "Cross Validation Mean Score \t", xgb_cv_score.mean(), sep=""
)

Cross Validation Score 		[0.85987147 0.88608928 0.89543582 0.61334976 0.92192242]
Cross Validation Mean Score 	0.8353337495351223


In [28]:
from sklearn.model_selection import KFold, cross_val_score

def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    #
    # Label encoding is good for XGBoost and RandomForest, but one-hot
    # would be better for models like Lasso or Ridge. The `cat.codes`
    # attribute holds the category levels.
    # for colname in X.select_dtypes(["category"]):
    #     X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    # log_y = np.log(y)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_error",
    )
    score = -1 * score.mean()
    # score = np.sqrt(score)
    return score

In [29]:
xgb_params = {'learning_rate': 0.1956691968239028, 'n_estimators': 223}

In [30]:
# xgb_params = {'learning_rate': 0.15440923827862846, 'n_estimators': 174}

In [31]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(**xgb_params))
])

model.fit(X_train, y_train,
          model__early_stopping_rounds=100,
          model__eval_set=[(eval_test, y_test)],
          model__verbose=False)

In [32]:
model.score(X_test, y_test)

0.9342026947865936

---

## Exporting Model Pipeline as Pickle

In [33]:
with open("model.pkl", "wb") as f:
    pickle.dump(xgb_pipe, f)

---

In [34]:
df_train.head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1ST PHASE JP NAGAR,1875.0,3,167.0,3
1,1ST PHASE JP NAGAR,1590.0,3,131.0,3
2,1ST PHASE JP NAGAR,1566.0,2,180.0,2
3,1ST PHASE JP NAGAR,2065.0,4,210.0,3
4,1ST PHASE JP NAGAR,1394.0,2,85.0,2
5,1ST PHASE JP NAGAR,1077.0,2,93.0,2
6,1ST PHASE JP NAGAR,2077.0,3,175.0,3
7,1ST PHASE JP NAGAR,1394.0,2,100.0,2
8,1ST PHASE JP NAGAR,1180.0,2,88.5,2
9,1ST PHASE JP NAGAR,1200.0,2,86.0,2


In [41]:
house = ["1ST PHASE JP NAGAR", 1000, 2, 2]
hdf = pd.DataFrame({
    "location"   : house[0],
    "total_sqft" : house[1],
    "bath"       : house[2],
    "bhk"        : house[3]
}, index=[5])

In [42]:
lr_pipe.predict(hdf)

array([86.14556225])

In [43]:
dt_pipe.predict(hdf)

array([75.])

In [44]:
rf_pipe.predict(hdf)

array([64.13780476])

In [45]:
xgb_pipe.predict(hdf)

array([73.35842], dtype=float32)

In [46]:
hdf

Unnamed: 0,location,total_sqft,bath,bhk
5,1ST PHASE JP NAGAR,1000,2,2
