In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.svm import SVR 

import seaborn as sns

In [80]:
df = pd.read_csv('./house_data_set_cleaned_2.csv')

In [81]:
df.head()

Unnamed: 0.1,Unnamed: 0,Location,Price,Beds,Baths,House Size,Land Size
0,0,Piliyandala,12500000,4,2,1750.0,6.5
1,1,Kottawa,29500000,4,4,3510.0,9.0
2,2,Malabe,19000000,4,3,2850.0,7.5
3,3,Kottawa,35000000,7,4,1700.0,15.0
4,4,Boralesgamuwa,29500000,5,4,3500.0,8.0


<h1>Convert Location using One Hot Encoder

In [82]:
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['Location'].values.reshape(-1,1))
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)


  y = column_or_1d(y, warn=True)


In [84]:
ohe_df = pd.DataFrame(onehot_encoded)
# data = pd.concat([df, ohe_df], axis=1).drop(['Location'], axis=1)
data = pd.concat([df, ohe_df], axis=1)

In [85]:
data.columns

Index(['Unnamed: 0',   'Location',      'Price',       'Beds',      'Baths',
       'House Size',  'Land Size',            0,            1,            2,
                  3,            4,            5,            6,            7,
                  8,            9,           10,           11,           12,
                 13,           14,           15,           16,           17,
                 18,           19,           20,           21,           22,
                 23,           24,           25,           26,           27,
                 28,           29,           30,           31,           32,
                 33,           34,           35,           36,           37,
                 38,           39],
      dtype='object')

<h1>Split data set into 3 parts based on Price

In [86]:
data['h_l_ratio'] = data['House Size'].apply(lambda x: x * 0.0036730945821854912)/ data['Land Size']
data['Bed Size'] = data['House Size'] / data['Beds']
data['beds_bath_ratio'] = data['Beds']/data['Baths']

In [87]:
data = data.drop(['Unnamed: 0'], axis=1)

In [88]:
df_range_1 = data.query('Price < 2.0e7')
df_range_2 = data.query('Price >= 2.0e7 & Price < 4.0e7')
df_range_3 = data.query('Price > 4.0e7' )

In [89]:
df_range_1.shape

(3162, 49)

In [90]:
df_range_2.shape

(3164, 49)

In [91]:
df_range_3.shape

(1211, 49)

<h1>Important Features combinations

In [92]:
features_important =[
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Baths', 'Land Size', ],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Baths','House Size', 'h_l_ratio'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Baths', 'Land Size', 'h_l_ratio'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Baths', 'Land Size', 'Bed Size'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'House Size', 'Land Size', 'beds_bath_ratio'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'House Size', 'h_l_ratio', 'beds_bath_ratio'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Land Size', 'h_l_ratio', 'beds_bath_ratio'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
     0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Baths', 'Land Size', 'h_l_ratio', 'beds_bath_ratio'],
    [0,            1,            2,            3,
                  4,            5,            6,            7,            8,
                  9,           10,           11,           12,           13,
                 14,           15,           16,           17,           18,
                 19,           20,           21,           22,           23,
                 24,           25,           26,           27,           28,
                 29,           30,           31,           32,           33,
                 34,           35,           36,           37,           38,
                 39, 'Beds', 'Baths', 'h_l_ratio', 'Bed Size', 'beds_bath_ratio']  
]

In [93]:
def build_rf_models(imp_features, df_range):
    rf_results = []
    for index, features in enumerate(imp_features):
        model = RandomForestRegressor(n_jobs=-1)
        max_scores_param = []
        X_train, X_test, Y_train, Y_test = train_test_split(
            df_range[features],
            df_range['Price'],
            test_size=0.33,
            random_state=42
        )
        estimators = np.arange(10, 400, 10)
        scores = []
        rmse = 0.0
        rmses = []
        for n in estimators:
            model.set_params(n_estimators=n)
            model.fit(X_train, Y_train)
            pred_y = model.predict(X_test)
            rmse = np.sqrt(((pred_y - Y_test) ** 2).mean())
            score = model.score(X_test, Y_test)
        scores.append(score)
        rmses.append(rmse)
    max_scores_param.append((max(scores), estimators[scores.index(max(scores))], rmses[scores.index(max(scores))] ))
    print((max(scores), estimators[scores.index(max(scores))], rmses[scores.index(max(scores))] ))
    rf_results.append((features, max_scores_param))
    print('#######################################################################################################################')
    return rf_results


In [30]:
build_rf_models(features_important, df_range_1)

(0.6491302323421233, 10, 2608312.736248724)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.6491302323421233, 10, 2608312.736248724)])]

In [31]:
build_rf_models(features_important, df_range_2)

(0.6088976720226134, 10, 3260146.0450336677)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.6088976720226134, 10, 3260146.0450336677)])]

In [32]:
build_rf_models(features_important, df_range_3)

(0.20298034018456146, 10, 56732410.44885345)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.20298034018456146, 10, 56732410.44885345)])]

In [15]:
def cluster(df_to_cluster, n_cluster):
    km = KMeans(
        n_clusters=n_cluster
    )
    km.fit(df_to_cluster)
    y_km = km.predict(df_to_cluster)
    return y_km

<h1>Clustering into 2

In [100]:
y_km_1 = cluster(df_range_1.drop(['Price', 'Location'],axis=1), 2)

In [101]:
df_range_1.loc[df_range_1[y_km_1 == 1].index].shape

(1, 49)

In [102]:
df_range_1.loc[df_range_1[y_km_1 == 0].index].shape

(3161, 49)

<h1>Remove That cluster

In [103]:
df_range_1 = df_range_1.drop(df_range_1[y_km_1 == 1].index, axis=0)

In [104]:
y_km_new_1 = np.delete(y_km_1,np.where(y_km_1==1)[0][0])

In [105]:
df_range_1.shape

(3161, 49)

<h1>Try RF with cluster(biggest cluster)

In [84]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_new_1 == 0].index])

KeyboardInterrupt: 

<h1>Try 3 clusters

In [106]:
y_km_1_3 = cluster(df_range_1.drop(['Price', 'Location'],axis=1), 3)

In [107]:
df_range_1.loc[df_range_1[y_km_1_3 == 0].index].shape

(1976, 49)

In [108]:
df_range_1.loc[df_range_1[y_km_1_3 == 1].index].shape

(8, 49)

In [109]:
df_range_1.loc[df_range_1[y_km_1_3 == 2].index].shape

(1177, 49)

In [89]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_3 == 0].index])

(0.5060636840604082, 10, 2753429.6719166567)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.5060636840604082, 10, 2753429.6719166567)])]

In [90]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_3 == 1].index])

(0.5476407582962461, 10, 2450137.1348725795)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.5476407582962461, 10, 2450137.1348725795)])]

In [113]:
df_range_1 = df_range_1.drop(df_range_1[y_km_1_3 == 2].index, axis=0)

In [114]:
build_rf_models(features_important, df_range_1)

(0.6155303908802869, 10, 2744476.286028711)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.6155303908802869, 10, 2744476.286028711)])]

<h1>Cluster Again

In [115]:
df_range_1.shape

(3153, 48)

In [116]:
y_km_1_2 = cluster(df_range_1.drop(['Price'],axis=1), 3)

In [117]:
df_range_1.loc[df_range_1[y_km_1_2 == 0].index].shape

(1239, 48)

In [118]:
df_range_1.loc[df_range_1[y_km_1_2 == 1].index].shape

(621, 48)

In [119]:
df_range_1.loc[df_range_1[y_km_1_2 == 2].index].shape

(1293, 48)

In [120]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_2 == 0].index])


(0.355852627114135, 10, 2858238.118940638)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.355852627114135, 10, 2858238.118940638)])]

In [121]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_2 == 1].index])


(0.386383373026031, 10, 2740678.143230195)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.386383373026031, 10, 2740678.143230195)])]

In [None]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_2 == 2].index])


<h1>Try clustering with price

In [111]:
y_km_1_2_price =  cluster(df_range_1.drop(['Location'], axis=1),2)

In [18]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_2_price == 0].index])

(0.46724281406089374, 10, 1493555.142991163)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.46724281406089374, 10, 1493555.142991163)])]

<h1> RMSE is above model = 14lahks

In [19]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_2_price == 1].index])

(0.48050917125367987, 10, 1937938.0007581837)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.48050917125367987, 10, 1937938.0007581837)])]

<h1>RMSE is 19laks

In [112]:
df_range_1[y_km_1_2_price == 1].shape

(1417, 49)

In [113]:
df_range_1[y_km_1_2_price == 0].shape

(1744, 49)

<h1>Try range1 with 3 cluster

In [115]:
y_km_1_3_price =  cluster(df_range_1.drop(['Location'], axis=1),3)

In [116]:
df_range_1[y_km_1_3_price == 2].shape

(823, 49)

In [120]:
df_range_1[y_km_1_3_price == 2].to_csv("range_1_cluster_1.csv")

In [117]:
df_range_1[y_km_1_3_price == 1].shape

(1031, 49)

In [121]:
df_range_1[y_km_1_3_price == 1].to_csv('range_1_cluster_2.csv')

In [118]:
df_range_1[y_km_1_3_price == 0].shape

(1307, 49)

In [122]:
df_range_1[y_km_1_3_price == 0].to_csv('range_1_cluster_3.csv')

In [22]:
y_km_1_3_price =  cluster(df_range_1,3)

In [24]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_3_price == 0].index])

(0.4118259958079, 10, 1125959.8193299184)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.4118259958079, 10, 1125959.8193299184)])]

In [25]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_3_price == 1].index])

(0.25858722050023886, 10, 1533336.4736452827)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.25858722050023886, 10, 1533336.4736452827)])]

In [26]:
build_rf_models(features_important, df_range_1.loc[df_range_1[y_km_1_3_price == 2].index])

(0.28458706561231417, 10, 1149891.8270019589)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.28458706561231417, 10, 1149891.8270019589)])]

<h1>Try 2 cluster with range 2

In [32]:
y_km_2_2_price =  cluster(df_range_2,2)

In [33]:
build_rf_models(features_important, df_range_2.loc[df_range_2[y_km_2_2_price == 0].index])

(0.358036319108728, 10, 2529170.880785753)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.358036319108728, 10, 2529170.880785753)])]

In [34]:
build_rf_models(features_important, df_range_2.loc[df_range_2[y_km_2_2_price == 1].index])

(0.46176673631190013, 10, 1732973.915464965)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.46176673631190013, 10, 1732973.915464965)])]

<h1>Try 3 clustes with range 2

In [124]:
y_km_2_3_price =  cluster(df_range_2.drop(['Location'], axis=1),3)

In [125]:
df_range_2[y_km_2_3_price == 0].shape

(1354, 49)

In [128]:
df_range_2[y_km_2_3_price == 0].to_csv('range_2_cluster_1.csv')

In [126]:
df_range_2[y_km_2_3_price == 1].shape

(1129, 49)

In [129]:
df_range_2[y_km_2_3_price == 1].to_csv('range_2_cluter_2.csv')

In [130]:
df_range_2[y_km_2_3_price == 2].shape

(681, 49)

In [36]:
build_rf_models(features_important, df_range_2.loc[df_range_2[y_km_2_3_price == 0].index])

(0.18415187163218527, 10, 1898621.3289477802)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.18415187163218527, 10, 1898621.3289477802)])]

In [131]:
df_range_2[y_km_2_3_price == 2].to_csv('range_2_cluter_3.csv')

In [37]:
build_rf_models(features_important, df_range_2.loc[df_range_2[y_km_2_3_price == 1].index])

(0.5169615133408777, 10, 1173997.9524873185)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.5169615133408777, 10, 1173997.9524873185)])]

In [38]:
build_rf_models(features_important, df_range_2.loc[df_range_2[y_km_2_3_price == 2].index])

(0.34703784495592416, 10, 1139008.8395743656)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.34703784495592416, 10, 1139008.8395743656)])]

<h1>Try 2 cluster with range 2

In [133]:
y_km_3_2_price =  cluster(df_range_3.drop(['Location'], axis=1),2)

In [134]:
df_range_3.to_csv('range_3.csv')

In [41]:
build_rf_models(features_important, df_range_3.loc[df_range_3[y_km_3_2_price == 0].index])

(0.5279554295566109, 10, 17732298.68724528)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.5279554295566109, 10, 17732298.68724528)])]

In [42]:
build_rf_models(features_important, df_range_3.loc[df_range_3[y_km_3_2_price == 1].index])

(-0.1312317684926203, 10, 121059489.95565465)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(-0.1312317684926203, 10, 121059489.95565465)])]

In [44]:
y_km_3_3_price =  cluster(df_range_3,3)

In [45]:
build_rf_models(features_important, df_range_3.loc[df_range_3[y_km_3_3_price == 0].index])

(0.4458267019311328, 10, 17330890.6955009)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.4458267019311328, 10, 17330890.6955009)])]

In [46]:
build_rf_models(features_important, df_range_3.loc[df_range_3[y_km_3_3_price == 1].index])

(0.07108333371317843, 10, 45766671.81792696)
#######################################################################################################################


[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(0.07108333371317843, 10, 45766671.81792696)])]

In [47]:
build_rf_models(features_important, df_range_3.loc[df_range_3[y_km_3_3_price == 2].index])



(nan, 10, 146923076.92307687)
#######################################################################################################################




[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   'Beds',
   'Baths',
   'h_l_ratio',
   'Bed Size',
   'beds_bath_ratio'],
  [(nan, 10, 146923076.92307687)])]

<h1>Best clusters to continue

<ul>
    <li>Range 1
        <ul>
            <li>Price Range : Below 2 crore </li>
            <li>Choosen no of clusters: 3 </li>
            <li>RMSE cluster 1 (1307 instances): 11 lakhs </li>
            <li>RMSE cluster 2 (1031)          : 15 lakhs</li>
            <li>RMSE cluster 3 (823)           : 11 lakhs</li>
        </ul>
    </li>
    <li>Range 2
        <ul>
           <li>Price Range :  2 crore to  4 crore </li>
           <li>Choosen no of clusters: 3 </li>
           <li>RMSE cluster 1  (1354 instances) : 19lakhs </li>
            <li>RMSE cluster 2 (1129)           : 11lakhs</li>
            <li>RMSE cluster 3 (681)            : 11 lakhs</li>
        </ul>
    </li>
    <li>Range 3
         <ul>
           <li>Price Range :  Above 4 crore </li>
           <li>RMSE:Not Decided ideal number of clusters due to high RMSE </li>
           
        </ul>
    </li>

 </ul>

In [None]:
def to_locations(df, cols):
    

In [None]:
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(df['Location'].values.reshape(-1,1))
# # binary encode
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

In [54]:
onehot_encoded[0:, ].shape

(7609, 40)

In [55]:
cols = list(range(0,40))

In [78]:
np.array([[np.argmax(data[cols].values[0, :])]])[0][0]

35

In [77]:
label_encoder.inverse_transform([np.argmax(onehot_encoded[7609, :])])

IndexError: index 7609 is out of bounds for axis 0 with size 7609

In [136]:
np.save('location_label_mapping.npy', label_encoder.classes_)

In [137]:
import pickle
with open('one_hot_encoder', 'wb') as file:
  pickle.dump(onehot_encoder, file)

In [140]:
loaded_one_hot_encoder = None;
with open('one_hot_encoder', 'rb') as file:
    loaded_one_hot_encoder = pickle.load(file)
    print(loaded_one_hot_encoder)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)


In [143]:
check = loaded_one_hot_encoder.fit_transform(integer_encoded)

In [149]:
np.argmax(check[300])

38

In [148]:
np.argmax(onehot_encoded[300])

38