In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [4]:
df.shape

(13320, 9)

In [5]:
df['location'].value_counts()

Whitefield                   540
Sarjapur  Road               399
Electronic City              302
Kanakpura Road               273
Thanisandra                  234
                            ... 
Electronic City Phase 1,       1
K R C kothanur                 1
Jay an agar 4 T Block          1
3rd Block HBR Layout           1
Banashankari Stage I           1
Name: location, Length: 1305, dtype: int64

In [6]:
df['society'].value_counts()

GrrvaGr    80
PrarePa    76
Sryalan    59
Prtates    59
GMown E    56
           ..
Srarkav     1
DSalehn     1
Srowsrj     1
Saandtt     1
Saavear     1
Name: society, Length: 2688, dtype: int64

In [7]:
len(df['society'].unique())

2689

In [8]:
len(df['location'].unique())

1306

In [9]:
df1=df.drop(['price'],axis=1)

In [10]:
df1['total_sqft'] = df1['total_sqft'].apply(lambda x: x.split(' - ')[1] if (len(x.split(' - ')))>1 else x.split(' - ')[0]) 
df1['total_sqft'] = df1['total_sqft'].apply(lambda x: re.findall(r'\d+',x)[0])
df1['total_sqft'] = df1['total_sqft'].astype(np.float64)

In [11]:
numerical_features= [feature for feature in df1.columns if df1[feature].dtypes!='O']
discrete_features = [feature for feature in numerical_features if len(df1[feature].unique())<25]
continous_features = [feature for feature in numerical_features if feature not in discrete_features]
categorical_features = [feature for feature in df1.columns if feature not in numerical_features]
print("numerical features:",numerical_features)
print("discrete features:",discrete_features)
print("continous features:",continous_features)
print("categorical features:",categorical_features)

numerical features: ['total_sqft', 'bath', 'balcony']
discrete features: ['bath', 'balcony']
continous features: ['total_sqft']
categorical features: ['area_type', 'availability', 'location', 'size', 'society']


In [12]:
df1['location']=df1['location'].fillna(value=df1['location'].value_counts().idxmax())
df1['size']=df1['size'].fillna(value=df1['size'].value_counts().idxmax())
df1['society']=df1['society'].fillna(value=df1['society'].value_counts().idxmax())

In [13]:
for feature in discrete_features:
    df1[feature]=df1[feature].fillna(value=df1[feature].value_counts().idxmax())

In [14]:
df1.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for feature in categorical_features:
    df1[feature]=le.fit_transform(df1[feature])

In [16]:
from sklearn.feature_selection import mutual_info_regression

In [17]:
mutual_info=mutual_info_regression(df1,df['price'])
mutual_info

array([0.12952567, 0.10809246, 0.3696511 , 0.42187842, 0.37088014,
       0.89820808, 0.40329357, 0.09096636])

In [18]:
mutual_info=pd.Series(mutual_info)
mutual_info.index=df1.columns
mutual_info.sort_values(ascending=False)

total_sqft      0.898208
size            0.421878
bath            0.403294
society         0.370880
location        0.369651
area_type       0.129526
availability    0.108092
balcony         0.090966
dtype: float64

In [19]:
features=mutual_info.nlargest(5).index
features

Index(['total_sqft', 'size', 'bath', 'society', 'location'], dtype='object')

In [20]:
df1[features].head()

Unnamed: 0,total_sqft,size,bath,society,location
0,1056.0,13,2.0,464,419
1,2600.0,19,5.0,2439,317
2,1440.0,16,2.0,806,1179
3,1521.0,16,3.0,2186,757
4,1200.0,13,2.0,806,716


In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(df1[features],df['price'],test_size=0.1,shuffle=True)

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()

In [23]:
rf.fit(X_train,Y_train)

RandomForestRegressor()

In [25]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
preds=rf.predict(X_test)
print(mean_absolute_error(preds,Y_test))
print(np.sqrt(mean_squared_error(preds,Y_test)))
print(- np.sqrt(np.square(np.log10(preds +1) - np.log10(Y_test +1)).mean()))

34.6673426692698
97.60630256165474
-0.15721713105718566


In [27]:
preds=rf.predict(X_train)
print(mean_absolute_error(preds,Y_train))
print(np.sqrt(mean_squared_error(preds,Y_train)))
print(- np.sqrt(np.square(np.log10(preds +1) - np.log10(Y_train +1)).mean()))

12.813727743667286
37.0587800272615
-0.06436535224127594


In [28]:
n_estimators=[int(x) for x in np.linspace(start=100,stop=1200,num=12)]
max_features=['auto','sqrt']
max_depth=[int(x) for x in np.linspace(start=5,stop=30,num=6)]
min_samples_split=[2,5,10,15,100]
min_samples_leaf=[1,2,5,10]

In [29]:
grid={'n_estimators':n_estimators,'max_features':max_features,'max_depth':max_depth,'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}
print(grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [30]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions=grid,scoring='neg_mean_squared_error',n_iter=10,cv = 5,random_state=42,verbose=2,n_jobs = 1)

In [31]:
rf_random.fit(X_train,Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   5.5s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.4s remaining:    0.0s


[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   5.5s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   5.8s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   6.0s
[CV] n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10 
[CV]  n_estimators=900, min_samples_split=5, min_samples_leaf=5, max_features=sqrt, max_depth=10, total=   6.0s
[CV] n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15 
[CV]  n_estimators=1100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=15, total=  10.0s
[CV] n_estimators=1100, min_samples_split=10, mi

[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.8s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.3s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   2.9s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.2s
[CV] n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=700, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   3.1s
[CV] n_estimators=700, min_samples_split=15, min_sam

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.6min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [32]:
rf_random.best_score_

-9086.309462057232

In [35]:
preds=rf_random.predict(X_test)
print(mean_absolute_error(preds,Y_test))
print(np.sqrt(mean_squared_error(preds,Y_test)))
print(- np.sqrt(np.square(np.log10(preds +1) - np.log10(Y_test +1)).mean()))

34.860232023830505
92.27971289045504
-0.1540060690855065


In [36]:
import xgboost as xgb
xgr=xgb.XGBRegressor()
xgr.fit(X_train,Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
import math
preds=xgr.predict(X_test)
print(mean_absolute_error(preds,Y_test))
print(math.sqrt(mean_squared_error(Y_test,preds)))

32.8463187268976
90.67771842623661


In [38]:
min_child_weight=[1,2,3,4,5,6]
max_depth=[int(x) for x in np.linspace(start=5,stop=30,num=6)]
gamma=[i/10.0 for i in range(0,5)]
subsample=[i/10.0 for i in range(6,10)]
colsample_bytree=[i/10.0 for i in range(6,10)]
reg_alpha=[1e-5, 1e-2, 0.1, 1, 100,0, 0.001, 0.005,0.05]
learning_rate=[0.05,0.025,0.075,0.25,0.5,0.75,1]


In [39]:
grid={'min_child_weight':min_child_weight,'max_depth':max_depth,'gamma':gamma,'subsample':subsample,'colsample_bytree':colsample_bytree,'reg_alpha':reg_alpha,'learning_rate':learning_rate}
print(grid)

{'min_child_weight': [1, 2, 3, 4, 5, 6], 'max_depth': [5, 10, 15, 20, 25, 30], 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4], 'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'reg_alpha': [1e-05, 0.01, 0.1, 1, 100, 0, 0.001, 0.005, 0.05], 'learning_rate': [0.05, 0.025, 0.075, 0.25, 0.5, 0.75, 1]}


In [40]:
xgr_random = RandomizedSearchCV(estimator = xgr, param_distributions=grid,scoring='neg_mean_squared_error',n_iter=10,cv = 5,random_state=42,verbose=2,n_jobs = 1)

In [41]:
xgr_random.fit(X_train,Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8, total=   0.4s
[CV] subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8, total=   0.4s
[CV] subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8 
[CV]  subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8, total=   0.4s
[CV] subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8 
[CV]  subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8, total=   0.4s
[CV] subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8 
[CV]  subsample=0.8, reg_alpha=0.001, min_child_weight=4, max_depth=5, learning_rate=0.25, gamma=0.3, colsample_bytree=0.8, total=   0.4s
[CV] subsample=0.9, reg_alpha=0, min_child_weight=6, max_depth=10, learning_rat

[CV]  subsample=0.8, reg_alpha=0, min_child_weight=1, max_depth=15, learning_rate=0.05, gamma=0.1, colsample_bytree=0.7, total=   1.3s
[CV] subsample=0.8, reg_alpha=0, min_child_weight=1, max_depth=15, learning_rate=0.05, gamma=0.1, colsample_bytree=0.7 
[CV]  subsample=0.8, reg_alpha=0, min_child_weight=1, max_depth=15, learning_rate=0.05, gamma=0.1, colsample_bytree=0.7, total=   1.4s
[CV] subsample=0.7, reg_alpha=0.05, min_child_weight=5, max_depth=30, learning_rate=0.05, gamma=0.0, colsample_bytree=0.9 
[CV]  subsample=0.7, reg_alpha=0.05, min_child_weight=5, max_depth=30, learning_rate=0.05, gamma=0.0, colsample_bytree=0.9, total=   2.1s
[CV] subsample=0.7, reg_alpha=0.05, min_child_weight=5, max_depth=30, learning_rate=0.05, gamma=0.0, colsample_bytree=0.9 
[CV]  subsample=0.7, reg_alpha=0.05, min_child_weight=5, max_depth=30, learning_rate=0.05, gamma=0.0, colsample_bytree=0.9, total=   2.1s
[CV] subsample=0.7, reg_alpha=0.05, min_child_weight=5, max_depth=30, learning_rate=0.05

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.0min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=100, n_jobs=0,
                                          num_par...
                   n_jobs=1,
                   param_distributions={'colsample_bytree': [0.6, 0.7, 0.8,
                                          

In [42]:
preds=xgr_random.predict(X_test)
print(mean_absolute_error(preds,Y_test))
print(math.sqrt(mean_squared_error(Y_test,preds)))
print(- np.sqrt(np.square(np.log10(preds +1) - np.log10(Y_test +1)).mean()))

32.34402824814255
86.614431308321
-0.1474811545998762


In [43]:
preds=xgr_random.predict(X_train)
print(mean_absolute_error(preds,Y_train))
print(math.sqrt(mean_squared_error(Y_train,preds)))

22.738854803636464
64.75819215212213


In [44]:
preds=rf_random.predict(X_train)
print(mean_absolute_error(preds,Y_train))
print(math.sqrt(mean_squared_error(Y_train,preds)))

27.569704558646617
72.52501618469651


In [45]:
preds=xgr.predict(X_train)
print(mean_absolute_error(preds,Y_train))
print(math.sqrt(mean_squared_error(Y_train,preds)))

18.605360060756432
34.0470297019639


In [46]:
df_test=pd.read_csv('test.csv')

In [47]:
len(df_test['society'].unique())

595

In [48]:
len(df['location'].unique())

1306

In [49]:
df_test.isnull().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

In [50]:
for feature in discrete_features:
    df_test[feature]=df_test[feature].fillna(value=df_test[feature].value_counts().idxmax())

In [51]:
for feature in categorical_features:
    df_test[feature]=df_test[feature].fillna(value=df_test[feature].value_counts().idxmax())

In [52]:
le=LabelEncoder()

for feature in categorical_features:
    df_test[feature]=le.fit_transform(df_test[feature])

In [53]:
df_test.isnull().sum()

area_type          0
availability       0
location           0
size               0
society            0
total_sqft         0
bath               0
balcony            0
price           1480
dtype: int64

In [55]:
df_test['total_sqft'] = df_test['total_sqft'].apply(lambda x: x.split(' - ')[1] if (len(x.split(' - ')))>1 else x.split(' - ')[0]) 
df_test['total_sqft'] = df_test['total_sqft'].apply(lambda x: re.findall(r'\d+',x)[0])
df_test['total_sqft'] = df_test['total_sqft'].astype(np.float64)

In [56]:
preds=xgr_random.predict(df_test[features])

In [57]:
df_test['price']=preds

In [58]:
df_test.to_csv('submission.csv',index=False)