<a href="https://colab.research.google.com/github/arpitJaiswal550/House-Price-Prediction-Challenge--By-MachineHack/blob/main/House_Price_Pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# Essentials
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_log_error

# Silence library warnings so cell output stays readable.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
import warnings
warnings.filterwarnings("ignore")

# Widen the pandas display limits: show every column, and allow up to
# 8000 rows / sequence items before output is truncated.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_seq_items', 8000),
    ('display.max_rows', 8000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Attach Google Drive to the Colab runtime; the dataset CSVs read below
# live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import io

# Training split: the feature columns plus TARGET(PRICE_IN_LACS).
TRAIN_CSV = '/content/drive/MyDrive/House Price Pred Dataset/Train.csv'
train = pd.read_csv(TRAIN_CSV)
print(train.shape)
train.head()

(29451, 12)


Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,55.0
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,60.5


In [4]:
# Test split: same feature columns as the training file, without the target.
TEST_CSV = '/content/drive/MyDrive/House Price Pred Dataset/Test.csv'
test = pd.read_csv(TEST_CSV)
print(test.shape)
test.head()

(68720, 11)


Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,Owner,0,0,1,BHK,545.17134,1,1,"Kamrej,Surat",21.262,73.0477
1,Dealer,1,1,2,BHK,800.0,0,0,"Panvel,Lalitpur",18.966114,73.148278
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.5922,88.484911
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.9883,75.5846
4,Owner,0,0,1,BHK,430.47783,1,1,"Mai Mandir,Nadiad",22.7,72.87


In [5]:
# Submission template; the predictions are written into this frame at the
# end of the notebook.
SAMPLE_SUBMISSION_CSV = '/content/drive/MyDrive/House Price Pred Dataset/sample_submission.csv'
sample = pd.read_csv(SAMPLE_SUBMISSION_CSV)
print(sample.shape)
sample.head()

(68720, 1)


Unnamed: 0,TARGET(PRICE_IN_LACS)
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


Preprocessing

In [7]:
# Model the target on the log(1 + price) scale; predictions are mapped back
# with np.expm1 when the submission is built.
# NOTE: this overwrites the column in place, so re-running the cell applies
# the transform a second time.
raw_price = train["TARGET(PRICE_IN_LACS)"]
train["TARGET(PRICE_IN_LACS)"] = np.log1p(raw_price)

In [8]:
# Stack train on top of test (fresh 0..n-1 index) so every feature
# transformation below is applied to both splits at once.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

(98171, 12)

In [9]:
# Replace the area with its natural log.
# NOTE(review): np.log maps an area of 0 to -inf — confirm the data has none.
area_sqft = df['SQUARE_FT']
df['SQUARE_FT'] = np.log(area_sqft)

In [10]:
# Encode the string-valued listing attributes as small integers.
# The result is assigned back rather than using `df[col].replace(...,
# inplace=True)`: the in-place form mutates a temporary Series, which emits a
# chained-assignment warning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.
df['POSTED_BY'] = df['POSTED_BY'].replace({'Owner': 0, 'Builder': 1, 'Dealer': 2})
df['BHK_OR_RK'] = df['BHK_OR_RK'].replace({'BHK': 1, 'RK': 0})

# Collapse selected room counts onto other values (16/17/18 -> 15, 31 -> 20,
# 13 -> 11) before the column is cast to int8.
df['BHK_NO.'] = df['BHK_NO.'].replace({16: 15, 17: 15, 18: 15, 31: 20, 13: 11})

In [11]:
# Narrow the encoded columns to compact dtypes: 0/1 flags become bool,
# the poster code and room count become 8-bit integers.
compact_dtypes = {
    'POSTED_BY': 'uint8',
    'UNDER_CONSTRUCTION': 'bool',
    'RERA': 'bool',
    'BHK_NO.': 'int8',
    'BHK_OR_RK': 'bool',
    'READY_TO_MOVE': 'bool',
    'RESALE': 'bool',
}
for column_name, dtype_name in compact_dtypes.items():
    df[column_name] = df[column_name].astype(dtype_name)

Feature Engineering

In [12]:
# City = the text after the last comma of ADDRESS (the whole string when
# there is no comma).
df['City'] = df['ADDRESS'].map(lambda address: address.rpartition(',')[2])

In [13]:
# Address = the text before the first comma of ADDRESS (the whole string
# when there is no comma).
df['Address'] = df['ADDRESS'].map(lambda address: address.partition(',')[0])

In [14]:
# The raw ADDRESS string has been split into City / Address; drop it.
df = df.drop(columns=['ADDRESS'])

Grouping features

In [15]:
# Broadcast per-room-count statistics of (log) SQUARE_FT back onto each row.
for column_name, statistic in (
    ('median_sqft_per_bhkno', 'median'),
    ('min_sqft_per_bhkno', 'min'),
    ('max_sqft_per_bhkno', 'max'),
):
    df[column_name] = df.groupby('BHK_NO.')['SQUARE_FT'].transform(statistic)

In [16]:
# Broadcast per-LONGITUDE statistics of (log) SQUARE_FT back onto each row.
# NOTE: despite its name, 'mean_sqft_per_location' holds the group *median*;
# the name is kept because later cells refer to it.
for column_name, statistic in (
    ('mean_sqft_per_location', 'median'),
    ('min_sqft_per_location', 'min'),
    ('max_sqft_per_location', 'max'),
):
    df[column_name] = df.groupby('LONGITUDE')['SQUARE_FT'].transform(statistic)

In [17]:
# Broadcast per-city statistics of (log) SQUARE_FT back onto each row.
for column_name, statistic in (
    ('median_sqft_per_city', 'median'),
    ('min_sqft_per_city', 'min'),
    ('max_sqft_per_city', 'max'),
):
    df[column_name] = df.groupby('City')['SQUARE_FT'].transform(statistic)

In [18]:
# Broadcast per-address statistics of (log) SQUARE_FT back onto each row.
for column_name, statistic in (
    ('median_sqft_per_address', 'median'),
    ('min_sqft_per_address', 'min'),
    ('max_sqft_per_address', 'max'),
):
    df[column_name] = df.groupby('Address')['SQUARE_FT'].transform(statistic)

In [19]:
# (Log) area divided by the room count, computed column-wise.
# A row-by-row `df.apply(..., axis=1)` builds a Python object per row just to
# divide two scalars; plain Series division yields the same float64 values
# without that per-row overhead.
# NOTE: SQUARE_FT has already been log-transformed at this point.
df['sqft_per_room'] = df['SQUARE_FT'] / df['BHK_NO.']

In [20]:
# Derive City on the original train/test frames too (text after the last
# comma of ADDRESS); the per-city target statistics below are built from
# `train`, not from the combined `df`.
for split_frame in (train, test):
    split_frame['City'] = split_frame['ADDRESS'].map(
        lambda address: address.rpartition(',')[2]
    )

In [21]:
# Distinct city names per split: `a` for train, `b` for test.
a = np.array(pd.unique(train['City']))
b = np.array(pd.unique(test['City']))

In [22]:
# Substitute-city table: maps a city name (key) to another city name (value).
# Later cells use it for cities listed in `replace_list` — the substitute's
# per-city price statistics from `train` are looked up in place of the
# original city's.
city_replace = {'Alappuzha' : 'Kochi','Amreli' : 'Rajkot','Azamgarh' : 'Varanasi','Barmer' : 'Jodhpur','Barnala' : 'Ludhiana',
                'Bellary': 'Anantapur','Bhilwara' : 'Ajmer','Bhusawal' : 'Dhule','Birbhum' : 'Bardhaman','Bulandshahr' : 'Hapur',
                'Burhanpur' : 'Jalgaon','Chittorgarh' : 'Udaipur','Contai' : 'Kharagpur','Dewas' : 'Indore','Dhar' : 'Indore',
                'Dhenkanal' : 'Cuttack','Dindigul' : 'Madurai','Firozabad' : 'Agra','Gangtok' : 'Darjeeling','Gorakhpur' : 'Varanasi','Gudivada' : 'Vijayawada',
                'Jaisalmer' : 'Jaipur','Jalpaiguri' : 'Siliguri','Jhajjar' : 'Rohtak','Jorhat' : 'Nagaon','Kaithal' : 'Patiala','Karimnagar' : 'Warangal',
                'Karur' : 'Salem','Katni' : 'Jabalpur','Kaushambi' : 'Satna','Khandwa' : 'Jalgaon','Kolar' : 'Vellore','Malegaon' : 'Dhule',
                'Mancherial' : 'Warangal','Mandsaur' : 'Ujjain','Morena' : 'Gwalior','Namakkal' : 'Salem','Palani' : 'Madurai','Panchmahal' : 'Godhra',
                'Patan' : 'Gandhinagar','Pathankot' : 'Jammu','Pollachi' : 'Coimbatore','Purulia' : 'Dhanbad','Raebareli' : 'Lucknow','Rajsamand' : 'Udaipur',
                'Ramgarh' : 'Nainital','Rampur' : 'Moradabad','Ratlam' : 'Ujjain','Rupnagar' : 'Chandigarh','Shimoga' : 'Mangalore','Sirsa' : 'Patiala',
                'Sivasagar' : 'Dibrugarh','Tezpur' : 'Guwahati','Theni' : 'Madurai','Thiruvarur' : 'Thanjavur','Tiruchengode' : 'Erode',
                'Tumkur' : 'Bangalore','Viramgam' : 'Gandhinagar','Wayanad' : 'Ooty'}

In [23]:
# Cities that occur in the test split but never in the train split
# (sorted, unique), i.e. cities with no target statistics of their own.
test_only_cities = np.setdiff1d(b, a)
replace_list = [city for city in test_only_cities]

In [24]:
# Ratio of the (log1p-transformed) target to the area, per training row.
# NOTE(review): `train['SQUARE_FT']` is still the raw area here — the log was
# applied to `df` only — confirm this mix of scales is intended.
train['price_per_sqft'] = train['TARGET(PRICE_IN_LACS)'].div(train['SQUARE_FT'])

# City -> statistic lookup tables built from the training rows only.
ratio_by_city = train.groupby('City')['price_per_sqft']
mean_price_per_sqft = ratio_by_city.mean().to_dict()
median_price_per_sqft = ratio_by_city.median().to_dict()
min_price_per_sqft = ratio_by_city.min().to_dict()
max_price_per_sqft = ratio_by_city.max().to_dict()

In [25]:
def price_per_sqft_imputer(x, dictionary):
    """Look up the per-city statistic for city ``x``.

    Parameters
    ----------
    x : city name.
    dictionary : mapping from city name to a statistic value.

    Returns
    -------
    ``-1`` when ``x`` is listed in the module-level ``replace_list`` (the
    ``city_replace`` substitution is deliberately not used here), otherwise
    ``dictionary[x]``.

    Raises
    ------
    KeyError
        If ``x`` is not in ``replace_list`` and is missing from ``dictionary``.
    """
    has_own_statistic = x not in replace_list
    return dictionary[x] if has_own_statistic else -1

In [26]:
# Attach the per-city price-per-sqft statistics to every row of `df`;
# cities in `replace_list` receive the sentinel returned by the imputer.
for column_name, lookup_table in (
    ('mean_price_per_sqft', mean_price_per_sqft),
    ('median_price_per_sqft', median_price_per_sqft),
    ('min_price_per_sqft', min_price_per_sqft),
    ('max_price_per_sqft', max_price_per_sqft),
):
    df[column_name] = df['City'].apply(price_per_sqft_imputer, args=(lookup_table,))

In [27]:
# City -> statistic of the (log1p) target, built from the training rows only.
target_by_city = train.groupby('City')['TARGET(PRICE_IN_LACS)']
median_price_per_city_dict = target_by_city.median().to_dict()
min_price_per_city_dict = target_by_city.min().to_dict()
max_price_per_city_dict = target_by_city.max().to_dict()

In [28]:
# Attach the per-city target statistics to every row. A city listed in
# `replace_list` is looked up under its substitute from `city_replace`;
# every other city is looked up under its own name.
for column_name, lookup_table in (
    ('med_price_per_city', median_price_per_city_dict),
    ('min_price_per_city', min_price_per_city_dict),
    ('max_price_per_city', max_price_per_city_dict),
):
    df[column_name] = df['City'].apply(
        lambda city, table=lookup_table: (
            table[city_replace[city]] if city in replace_list else table[city]
        )
    )

Preprocessing new features

In [29]:
# Continuous feature columns to store as 32-bit floats.
col_ls = [
    'SQUARE_FT', 'LONGITUDE', 'LATITUDE',
    'median_sqft_per_bhkno', 'min_sqft_per_bhkno', 'max_sqft_per_bhkno',
    'mean_sqft_per_location', 'min_sqft_per_location', 'max_sqft_per_location',
    'median_sqft_per_city', 'min_sqft_per_city', 'max_sqft_per_city',
    'median_sqft_per_address', 'min_sqft_per_address', 'max_sqft_per_address',
    'sqft_per_room',
    'mean_price_per_sqft', 'median_price_per_sqft',
    'min_price_per_sqft', 'max_price_per_sqft',
    'med_price_per_city', 'min_price_per_city', 'max_price_per_city',
]

# Cast all listed columns in one call; column order is unchanged.
df = df.astype(dict.fromkeys(col_ls, 'float32'))

In [31]:
# Integer-encode the two remaining string columns. The single encoder is
# re-fitted for each column, so afterwards `le` only remembers the classes
# of the last one ('Address').
le = LabelEncoder()
for text_column in ['City', 'Address']:
    df[text_column] = le.fit_transform(df[text_column])

In [32]:
# Undo the earlier concat: the first len(train) rows are the training split,
# the remainder (re-indexed from 0) is the test split.
n_train_rows = train.shape[0]
train_proc = df.iloc[:n_train_rows]
test_proc = df.iloc[n_train_rows:].reset_index(drop=True)

In [33]:
target = 'TARGET(PRICE_IN_LACS)'

# Every column of `df` except the target is a model input
# (column order is preserved).
features = [column_name for column_name in df.columns if column_name != target]

In [39]:
# Training design matrix
X = train_proc.loc[:, features]

# Training target (log1p scale)
train_labels = train_proc.loc[:, target]

# Design matrix for the rows to be predicted
X_test = test_proc.loc[:, features]

In [51]:
# 8-fold cross-validation splitter, shuffled with a fixed seed so the folds
# are the same on every run.
kf = KFold(n_splits=8, shuffle=True, random_state=42)

In [45]:
# Error metrics
def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``.

    The notebook passes log1p-transformed targets and predictions, which is
    why the result is reported as an RMSLE; no log is taken here.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=X):
    """Cross-validated RMSE of ``model`` against ``train_labels``.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    X : feature matrix; defaults to the ``X`` that existed when this cell ran.

    Returns
    -------
    Array with one RMSE per fold of the module-level ``kf`` splitter.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, train_labels, scoring="neg_mean_squared_error", cv=kf
    )
    # The scorer reports negated MSE; flip the sign before the square root.
    return np.sqrt(-neg_mse_per_fold)

In [58]:
# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(objective='regression',
                       num_leaves=6,
                       learning_rate=0.01,
                       n_estimators=1000,
                       max_bin=200,
                       bagging_fraction=0.8,
                       bagging_freq=4,
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

# XGBoost Regressor
# 'reg:squarederror' is the current name of the squared-error objective; the
# previous spelling 'reg:linear' is deprecated in XGBoost.
# NOTE(review): both `seed` and `random_state` are passed with different
# values, and `nthread` is the legacy spelling of `n_jobs`; they are left
# untouched here so the fitted model does not change.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=1000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=1000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=800,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# Stack all of the models above, with the random forest as meta-regressor.
# The base-model inputs are also fed to the meta-regressor
# (use_features_in_secondary=True). StackingCVRegressor shuffles its internal
# folds, so it is seeded like every other estimator in this cell to keep the
# stacked predictions reproducible across runs.
stack_gen = StackingCVRegressor(regressors=(rf, lightgbm, gbr, xgboost),
                                meta_regressor=rf,
                                use_features_in_secondary=True,
                                random_state=42)

In [52]:
# Model name -> (mean CV RMSE, std of CV RMSE)
scores = {}

score = cv_rmse(lightgbm)
fold_mean, fold_std = score.mean(), score.std()
print(f"lightgbm: {fold_mean:.4f} ({fold_std:.4f})")
scores['lgb'] = (fold_mean, fold_std)

lightgbm: 0.3534 (0.0102)


In [53]:
# Cross-validated RMSE of the XGBoost model.
score = cv_rmse(xgboost)
fold_mean, fold_std = score.mean(), score.std()
print(f"xgboost: {fold_mean:.4f} ({fold_std:.4f})")
scores['xgb'] = (fold_mean, fold_std)

xgboost: 0.3192 (0.0088)


In [55]:
# Cross-validated RMSE of the random forest.
score = cv_rmse(rf)
fold_mean, fold_std = score.mean(), score.std()
print(f"rf: {fold_mean:.4f} ({fold_std:.4f})")
scores['rf'] = (fold_mean, fold_std)

rf: 0.2976 (0.0106)


In [56]:
# Cross-validated RMSE of the gradient boosting model.
score = cv_rmse(gbr)
fold_mean, fold_std = score.mean(), score.std()
print(f"gbr: {fold_mean:.4f} ({fold_std:.4f})")
scores['gbr'] = (fold_mean, fold_std)

gbr: 0.3295 (0.0108)


In [59]:
# Fit the stacked ensemble on the full training set, passing plain arrays.
print('stack_gen')
train_matrix = np.array(X)
train_target = np.array(train_labels)
stack_gen_model = stack_gen.fit(train_matrix, train_target)

stack_gen


In [60]:
# Fit LightGBM on the full training set.
print('lightgbm')
lgb_model_full_data = lightgbm.fit(X=X, y=train_labels)

lightgbm


In [61]:
# Fit XGBoost on the full training set.
print('xgboost')
xgb_model_full_data = xgboost.fit(X=X, y=train_labels)

xgboost


In [62]:
# Fit the random forest on the full training set.
print('RandomForest')
rf_model_full_data = rf.fit(X=X, y=train_labels)

RandomForest


In [63]:
# Fit gradient boosting on the full training set.
print('GradientBoosting')
gbr_model_full_data = gbr.fit(X=X, y=train_labels)

GradientBoosting


In [64]:
# Blend the fitted models so the final prediction does not hinge on one of them
def blended_predictions(X):
    """Weighted average of the five fitted models' predictions for ``X``.

    Weights: gradient boosting 0.15, XGBoost 0.2, LightGBM 0.1, random
    forest 0.25, stacked ensemble 0.3 (they sum to 1). The stacked model is
    given ``np.array(X)``; the others receive ``X`` as passed. Predictions
    stay on the scale the models were trained on.
    """
    gbr_part = 0.15 * gbr_model_full_data.predict(X)
    xgb_part = 0.2 * xgb_model_full_data.predict(X)
    lgb_part = 0.1 * lgb_model_full_data.predict(X)
    rf_part = 0.25 * rf_model_full_data.predict(X)
    stack_part = 0.3 * stack_gen_model.predict(np.array(X))
    # Summed left to right in this fixed order.
    return gbr_part + xgb_part + lgb_part + rf_part + stack_part

In [65]:
# Score the blended model on the data it was trained on (an in-sample
# figure, not a validation score).
train_blend = blended_predictions(X)
blended_score = rmsle(train_labels, train_blend)
scores['blended'] = (blended_score, 0)
print('RMSLE score on train data:', blended_score, sep='\n')

RMSLE score on train data:
0.2489789795914987


In [67]:
# Predict the test rows, undo the log1p target transform with expm1, and
# store the prices in a temporary column labelled 1 (renamed in a later cell).
blended_log_price = blended_predictions(X_test)
sample[1] = np.expm1(blended_log_price)

In [68]:
# Remove the column that came with the submission template.
del sample['TARGET(PRICE_IN_LACS)']

In [69]:
# Give the column labelled 1 the target name.
sample = sample.rename(columns={1: 'TARGET(PRICE_IN_LACS)'})

In [70]:
# Display the submission frame as a final visual check.
sample

Unnamed: 0,TARGET(PRICE_IN_LACS)
0,13.932577
1,74.622303
2,65.176656
3,38.699031
4,13.084200
...,...
68715,108.346942
68716,118.759741
68717,3459.430781
68718,83.176350


In [71]:
from google.colab import files

# Write only the prediction column. sample_submission.csv was read as a
# single-column frame, so the pandas row index must not be written out as an
# extra unnamed column.
sample.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>