In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Setting up matplotlib
%matplotlib inline
plt.rc("figure", dpi=100)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [2]:
file_path_train = "./Bangalore_house_prices/train.csv"
file_path_test = "./Bangalore_house_prices/test.csv"
df_train = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5132 entries, 0 to 5131
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   area_type       5132 non-null   object 
 1   availability    5132 non-null   object 
 2   location        5132 non-null   object 
 3   total_sqft      5132 non-null   float64
 4   bath            5132 non-null   int64  
 5   balcony         5132 non-null   int64  
 6   price           5132 non-null   float64
 7   latitude        5132 non-null   float64
 8   longitude       5132 non-null   float64
 9   bhk             5132 non-null   int64  
 10  price_per_sqft  5132 non-null   float64
dtypes: float64(5), int64(3), object(3)
memory usage: 441.2+ KB


In [3]:
# Dropping columns due to low MI scores
df_train.drop(["area_type", "availability", "balcony", "price_per_sqft"], axis=1, inplace=True)
df_test.drop(["area_type", "availability", "balcony", "price_per_sqft"], axis=1, inplace=True)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5132 entries, 0 to 5131
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    5132 non-null   object 
 1   total_sqft  5132 non-null   float64
 2   bath        5132 non-null   int64  
 3   price       5132 non-null   float64
 4   latitude    5132 non-null   float64
 5   longitude   5132 non-null   float64
 6   bhk         5132 non-null   int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 280.8+ KB


In [5]:
df_train.location.nunique()

192

In [6]:
X_train = df_train.copy()
y_train = X_train.pop("price")

X_test = df_test.copy()
y_test = X_test.pop("price")

In [18]:
X_train.head(50)

Unnamed: 0,location,total_sqft,bath,latitude,longitude,bhk
0,1ST PHASE JP NAGAR,1875.0,3,12.9165,77.592246,3
1,1ST PHASE JP NAGAR,1590.0,3,12.9165,77.592246,3
2,1ST PHASE JP NAGAR,1566.0,2,12.9165,77.592246,2
3,1ST PHASE JP NAGAR,2065.0,4,12.9165,77.592246,3
4,1ST PHASE JP NAGAR,1394.0,2,12.9165,77.592246,2
5,1ST PHASE JP NAGAR,1077.0,2,12.9165,77.592246,2
6,1ST PHASE JP NAGAR,2077.0,3,12.9165,77.592246,3
7,1ST PHASE JP NAGAR,1394.0,2,12.9165,77.592246,2
8,1ST PHASE JP NAGAR,1180.0,2,12.9165,77.592246,2
9,1ST PHASE JP NAGAR,1200.0,2,12.9165,77.592246,2


In [8]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train.location.to_frame()))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test.location.to_frame()))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

# One-hot encoding removed column names
# OH_cols_train.columns = OH_encoder.get_feature_names_out()

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop("location", axis=1)
num_X_test = X_test.drop("location", axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [9]:
OH_X_train.head()

Unnamed: 0,total_sqft,bath,latitude,longitude,bhk,0,1,2,3,4,...,182,183,184,185,186,187,188,189,190,191
0,1875.0,3,12.9165,77.592246,3,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1590.0,3,12.9165,77.592246,3,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1566.0,2,12.9165,77.592246,2,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2065.0,4,12.9165,77.592246,3,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1394.0,2,12.9165,77.592246,2,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
y_train.head()

0    167.0
1    131.0
2    180.0
3    210.0
4     85.0
Name: price, dtype: float64

## Model Building

In [14]:
from xgboost import XGBRegressor

In [13]:
from sklearn.model_selection import KFold, cross_val_score

def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    #
    # Label encoding is good for XGBoost and RandomForest, but one-hot
    # would be better for models like Lasso or Ridge. The `cat.codes`
    # attribute holds the category levels.
    # for colname in X.select_dtypes(["category"]):
    #     X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    # log_y = np.log(y)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_absolute_error",
    )
    score = -1 * score.mean()
    # score = np.sqrt(score)
    return score

In [113]:
import optuna

def objective(trial):
    xgb_params = dict(
        learning_rate=trial.suggest_float("learning_rate", 5e-2, 2e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 300)
    )
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(OH_X_train, y_train, early_stopping_rounds=100, eval_set=[(OH_X_test, y_test)], verbose=False)
    return xgb.score(OH_X_test, y_test)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
xgb_params = study.best_params

[32m[I 2022-06-09 17:45:19,104][0m A new study created in memory with name: no-name-0ed1a5db-968e-48c6-b412-a3b2f5608e8c[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[32m[I 2022-06-09 17:45:20,377][0m Trial 0 finished with value: 0.9121014367653817 and parameters: {'learning_rate': 0.11665940797414798, 'n_estimators': 212}. Best is trial 0 with value: 0.9121014367653817.[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[32m[I 2022-06-09 17:45:21,556][0m Trial 1 finished with value: 0.9122238621506651 and parameters: {'learning_rate': 0.18078120278390894, 'n_estimators': 228}. Best is trial 1 with value: 0.9122238621506651.[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[32m[I 2022-06-09 17:45:23,069][0m Trial 2 finished with value: 0.9114211438278232 and parameters: {'learning_rate': 0.05563435151211955, 'n_estimators': 244}. Best is trial 1 with value: 0.9122238621506651.[0m
  elif isinstance(data.columns, (pd.

In [12]:
xgb_params

{'learning_rate': 0.15440923827862846, 'n_estimators': 174}

In [15]:
model = XGBRegressor(**xgb_params)
model.fit(OH_X_train, y_train, early_stopping_rounds=100, eval_set=[(OH_X_test, y_test)], verbose=False)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [16]:
model.score(OH_X_test, y_test)

0.9143093553768065

In [19]:
model2 = XGBRegressor()
model2.fit(OH_X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [20]:
model2.score(OH_X_test, y_test)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


0.9027624719086609

In [25]:
from sklearn.ensemble import RandomForestRegressor

model3 = RandomForestRegressor()
model3.fit(OH_X_train, y_train)



In [26]:
model3.score(OH_X_test, y_test)



0.8753790263731416

In [None]:
from sklearn.svm import SVR

model4 = SVR(kernel='linear')
model4.fit(OH_X_train, y_train)



In [28]:
model4.score(OH_X_test, y_test)



0.40209936535178536