In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
sns.set()

In [3]:
from sklearn.linear_model import LinearRegression
from statsmodels.regression.linear_model import OLS

In [4]:
train = pd.read_csv('Fina_Train.csv')

In [5]:
test = pd.read_csv('Fina_Test.csv')

In [6]:
y = train['TARGET(PRICE_IN_LACS)']
x = train.drop(['TARGET(PRICE_IN_LACS)','ADDRESS','CITY'],axis=1)


In [7]:
test.isna().sum()

POSTED_BY             0
UNDER_CONSTRUCTION    0
RERA                  0
BHK_NO.               0
BHK_OR_RK             0
SQUARE_FT             0
READY_TO_MOVE         0
RESALE                0
ADDRESS               0
LONGITUDE             0
LATITUDE              0
CITY                  0
CITY_2                0
ZONE                  0
dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [9]:
total_col = ['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK',
       'SQUARE_FT', 'READY_TO_MOVE', 'RESALE', 'ADDRESS', 'LONGITUDE',
       'LATITUDE', 'CITY', 'CITY_2', 'ZONE']

In [10]:
cat_col = ['POSTED_BY','CITY_2','ZONE']

In [11]:
encoder = OneHotEncoder()
encoded_cat = encoder.fit_transform(x[cat_col])

In [12]:
col_name = encoder.get_feature_names(cat_col)


In [13]:
temp =pd.DataFrame(data=encoded_cat.toarray(),columns=col_name)

In [14]:
bhk_or_rak = {'BHK':1,
             'RK':0}

In [15]:
x['BHK_OR_RK'] = x['BHK_OR_RK'].replace(bhk_or_rak)

In [16]:
final_x = x.merge(temp,on=x.index).copy()

In [17]:
final_x = final_x.drop(['key_0','POSTED_BY','CITY_2','ZONE','UNDER_CONSTRUCTION','ZONE_0', 'ZONE_1', 'ZONE_2', 'ZONE_3',
       'ZONE_4', 'ZONE_5', 'ZONE_6', 'ZONE_7', 'ZONE_8','POSTED_BY_Builder','CITY_2_Agra'],axis=1)

In [18]:
final_x.columns

Index(['RERA', 'BHK_NO.', 'BHK_OR_RK', 'SQUARE_FT', 'READY_TO_MOVE', 'RESALE',
       'LONGITUDE', 'LATITUDE', 'POSTED_BY_Dealer', 'POSTED_BY_Owner',
       ...
       'CITY_2_Udaipur', 'CITY_2_Udupi', 'CITY_2_Vadodara', 'CITY_2_Valsad',
       'CITY_2_Vapi', 'CITY_2_Varanasi', 'CITY_2_Vijayawada',
       'CITY_2_Visakhapatnam', 'CITY_2_Vizianagaram', 'CITY_2_Wardha'],
      dtype='object', length=136)

## Split

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [20]:
'''scaler = StandardScaler().fit(final_x)
final_x = scaler.transform(final_x)'''

'scaler = StandardScaler().fit(final_x)\nfinal_x = scaler.transform(final_x)'

In [21]:
x_train,x_test,y_train,y_test = train_test_split(final_x,y,test_size=0.2,random_state=37)

In [22]:
x_train.shape,x_test.shape

((21390, 136), (5348, 136))

In [23]:
from sklearn.metrics import mean_squared_log_error

In [24]:
algo=[]
train_accuracy=[]
test_accuracy=[]

## using OLS

In [25]:
reg = OLS(y_train,x_train)
reg = reg.fit()
print(reg.summary())
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(reg.predict(x_train)))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(reg.predict(x_test)))*100
#algo.append('OLS')
#train_accuracy.append(train_acc)
#test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


                                  OLS Regression Results                                  
Dep. Variable:     TARGET(PRICE_IN_LACS)   R-squared (uncentered):                   0.992
Model:                               OLS   Adj. R-squared (uncentered):              0.992
Method:                    Least Squares   F-statistic:                          2.014e+04
Date:                   Sat, 17 Oct 2020   Prob (F-statistic):                        0.00
Time:                           15:21:57   Log-Likelihood:                         -9198.3
No. Observations:                  21390   AIC:                                  1.867e+04
Df Residuals:                      21254   BIC:                                  1.975e+04
Df Model:                            136                                                  
Covariance Type:               nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-

## using  sk-learn Linear Regression

In [26]:
reg = LinearRegression(normalize=False)
score = reg.fit(x_train,y_train)
pred = score.predict(x_train)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_test)))*100
algo.append('Linear Regression')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


Train 13.368698719027178
Test 13.415593340294041


##  Poly Regression

In [27]:
from sklearn.preprocessing import PolynomialFeatures

In [28]:
from sklearn.decomposition import PCA

In [43]:
pca = PCA(0.99)
pca.fit(x_train,y=y_train)
pca.n_components_

81

In [44]:
temp =pca.transform(x_train)
temp2 = pca.transform(x_test)

In [45]:
x_poly_gen = PolynomialFeatures(degree=2)
x_poly_train = x_poly_gen.fit_transform(temp)
x_poly_train.shape

(21390, 3403)

#### sklearn

In [32]:
reg = LinearRegression(normalize=False)
score = reg.fit(x_poly_train,y_train)
pred = score.predict(x_poly_train)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_poly_gen.transform(temp2))))*100
algo.append('Poly Regression')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


Train 11.780416683759324
Test 11.65288601520986


#### OLS

In [46]:
reg = OLS(y_train,temp)
score = reg.fit()
print(score.summary())
pred = score.predict(temp)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(temp2)))*100
algo.append('OLS Poly Regression')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


                                  OLS Regression Results                                  
Dep. Variable:     TARGET(PRICE_IN_LACS)   R-squared (uncentered):                   0.023
Model:                               OLS   Adj. R-squared (uncentered):              0.019
Method:                    Least Squares   F-statistic:                              6.101
Date:                   Sat, 17 Oct 2020   Prob (F-statistic):                    1.09e-59
Time:                           15:49:14   Log-Likelihood:                         -60999.
No. Observations:                  21390   AIC:                                  1.222e+05
Df Residuals:                      21309   BIC:                                  1.228e+05
Df Model:                             81                                                  
Covariance Type:               nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [35]:
x_train

Unnamed: 0,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,POSTED_BY_Dealer,POSTED_BY_Owner,...,CITY_2_Udaipur,CITY_2_Udupi,CITY_2_Vadodara,CITY_2_Valsad,CITY_2_Vapi,CITY_2_Varanasi,CITY_2_Vijayawada,CITY_2_Visakhapatnam,CITY_2_Vizianagaram,CITY_2_Wardha
1699,1,1,1,6.745424,0,0,1.095326,4.322057,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3242,0,2,1,6.998866,1,1,1.132881,4.293195,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13460,1,2,1,7.337828,1,1,1.165152,4.362065,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6879,0,2,1,6.984876,1,1,1.191602,4.328431,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22919,0,2,1,6.872195,1,1,1.192203,4.327824,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4118,1,3,1,7.203528,0,1,1.083389,4.288715,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
988,0,3,1,7.142939,1,1,1.208999,4.349110,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9036,1,3,1,7.417581,1,1,0.937708,4.353961,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20843,1,2,1,6.770823,0,1,1.084346,4.289964,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### SVM

In [47]:
from sklearn.svm import SVR

In [48]:
reg = SVR(verbose=True)
score = reg.fit(temp,y_train)
pred = score.predict(temp)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(temp2)))*100
algo.append('SVM')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


[LibSVM]Train 10.74056859384048
Test 11.545263485289903


### Decision Tree

In [50]:
from sklearn.tree import DecisionTreeRegressor

In [54]:
reg = DecisionTreeRegressor()
score = reg.fit(x_train,y_train)
pred = score.predict(x_train)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_test)))*100
algo.append('DTR')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


Train 0.06378469623864416
Test 18.724526163789832


### Random Forest

In [87]:
from sklearn.ensemble import RandomForestRegressor

In [133]:
reg = RandomForestRegressor(n_estimators=100,verbose=1,n_jobs=3,max_depth=6)
score = reg.fit(x_train,y_train)
pred = score.predict(x_train)
train_acc = 100-mean_squared_log_error(np.exp(y),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_test)))*100
algo.append('Random Forest ')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.7s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    3.7s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


Train 85.56589557158826
Test 85.85674847456711


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


### XGBoost

In [89]:
from xgboost import XGBRegressor

In [156]:
reg = XGBRegressor(n_estimators=1000,objective='reg:squaredlogerror',verbosity=1,tree_method='exact',n_jobs=-1,max_feature=)
score = reg.fit(x_train,y_train)
pred = score.predict(x_train)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_test)))*100
algo.append('XGBoost ')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


ValueError: Found input variables with inconsistent numbers of samples: [21390, 26738]

In [157]:
train_acc = 100-mean_squared_log_error(np.exp(y),np.exp(pred))*100
#test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_test)))*100
algo.append('XGBoost ')
train_accuracy.append(train_acc)
#test_accuracy.append(test_acc)
print('Train',train_acc)
#print('Test',test_acc)


Train 95.99959795464352


#### Grid Search

In [123]:
from sklearn.model_selection import GridSearchCV

In [124]:
xgb1 = XGBRegressor()
parameters = { #when use hyperthread, xgboost may become slower
              'num_estimator' :[10,100,500,100],
              'objective':['reg:squaredlogerror'],
              'learning_rate': [0.4,.3, 0.5, .7], #so called `eta` value
              'max_depth': [5, 6, 7,8,9,10],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = -1,
                        verbose=True)

xgb_grid.fit(x_train,
         y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  3.1min finished


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0.8207349566278259
{'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 7, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:squaredlogerror', 'silent': 1, 'subsample': 0.7}


In [127]:
pred = xgb_grid.predict(x_train)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(xgb_grid.predict(x_test)))*100
algo.append('XGBoost ')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)

Train 92.43347584823377
Test 91.88844906073962


### XGBRF

In [70]:
from xgboost import XGBRFRegressor

In [84]:
reg = XGBRFRegressor(n_estimators=100,n_jobs=2,verbosity =1)
score = reg.fit(x_train,y_train)
pred = score.predict(x_train)
train_acc = 100-mean_squared_log_error(np.exp(y_train),np.exp(pred))*100
test_acc = 100-mean_squared_log_error(np.exp(y_test),np.exp(score.predict(x_test)))*100
algo.append('XGRF Boost ')
train_accuracy.append(train_acc)
test_accuracy.append(test_acc)
print('Train',train_acc)
print('Test',test_acc)


Train 86.02867738449031
Test 86.28407912757598


## Cross Validation

In [123]:
from sklearn.model_selection import KFold,cross_validate

In [124]:
scv = KFold(n_splits=5,shuffle=True,random_state=37)

In [125]:
score = cross_validate(reg,final_x,y,scoring='neg_mean_squared_log_error',cv=scv,n_jobs=3,verbose=2)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   5 | elapsed:   20.3s remaining:   13.5s
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   20.9s finished


In [126]:
-1*score['test_score']

array([0.00513358, 0.0052364 , 0.00491934, 0.00532357, 0.00539321])

In [None]:
plt.figure(figsize=(16,8))
sns.lineplot(np.exp(y_train),np.exp(pred),alpha=0.2)

## Prediction

In [158]:
test = pd.read_csv('Fina_Test.csv')
test = test.drop(['ADDRESS','CITY'],axis=1)

In [159]:
encoded_cat = encoder.transform(test[cat_col])

In [160]:
temp =pd.DataFrame(data=encoded_cat.toarray(),columns=col_name)

In [161]:
test['BHK_OR_RK'] = test['BHK_OR_RK'].replace(bhk_or_rak)

In [162]:
test

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,CITY_2,ZONE
0,Owner,0,0,1,1,6.301100,1,1,1.117408,4.291113,Surat,7
1,Dealer,1,1,2,1,6.684612,0,0,1.079312,4.292489,Lalitpur,0
2,Dealer,0,0,2,1,7.136560,1,1,1.137065,4.482832,Kolkata,3
3,Dealer,0,0,3,1,7.244463,1,1,1.192529,4.325253,Jaipur,1
4,Owner,0,0,1,1,6.064896,1,1,1.138591,4.288677,Town,7
...,...,...,...,...,...,...,...,...,...,...,...,...
68715,Dealer,0,1,2,1,6.752919,1,1,1.083116,4.289957,Maharashtra,0
68716,Dealer,0,1,3,1,7.742466,1,1,1.230516,4.340502,Mohali,1
68717,Dealer,1,1,1,1,10.415197,0,0,1.083858,4.290298,Maharashtra,0
68718,Dealer,0,0,2,1,7.067924,1,1,1.070758,4.303277,Pune,0


In [163]:
final_test = test.merge(temp,on=test.index).copy()

In [164]:
final_test = final_test.drop(['key_0','POSTED_BY','CITY_2','ZONE','UNDER_CONSTRUCTION','ZONE_0', 'ZONE_1', 'ZONE_2', 'ZONE_3',
       'ZONE_4', 'ZONE_5', 'ZONE_6', 'ZONE_7', 'ZONE_8','POSTED_BY_Builder','CITY_2_Agra'],axis=1)

In [165]:
final_test.shape

(68720, 136)

In [166]:
final_test

Unnamed: 0,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE,POSTED_BY_Dealer,POSTED_BY_Owner,...,CITY_2_Udaipur,CITY_2_Udupi,CITY_2_Vadodara,CITY_2_Valsad,CITY_2_Vapi,CITY_2_Varanasi,CITY_2_Vijayawada,CITY_2_Visakhapatnam,CITY_2_Vizianagaram,CITY_2_Wardha
0,0,1,1,6.301100,1,1,1.117408,4.291113,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,1,6.684612,0,0,1.079312,4.292489,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,1,7.136560,1,1,1.137065,4.482832,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,3,1,7.244463,1,1,1.192529,4.325253,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,1,6.064896,1,1,1.138591,4.288677,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68715,1,2,1,6.752919,1,1,1.083116,4.289957,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68716,1,3,1,7.742466,1,1,1.230516,4.340502,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68717,1,1,1,10.415197,0,0,1.083858,4.290298,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68718,0,2,1,7.067924,1,1,1.070758,4.303277,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
final_test['TARGET(PRICE_IN_LACS)'] = np.exp(reg.predict(final_test))

In [168]:
final_test['TARGET(PRICE_IN_LACS)']

0         15.283757
1         67.585739
2         59.023472
3         30.895584
4         17.949753
            ...    
68715    100.361580
68716    121.485878
68717    114.449562
68718     78.290550
68719    513.742981
Name: TARGET(PRICE_IN_LACS), Length: 68720, dtype: float32

In [169]:
final_test['TARGET(PRICE_IN_LACS)'].to_csv('Submission 5.csv',index=False)