In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn import linear_model

import seaborn as sns
import pickle

<h1>Load data sets

In [2]:
df_range_3 =  pd.read_csv('range_3.csv')
df_range_1_cluster_1 = pd.read_csv('range_1_cluster_1.csv')
df_range_1_cluster_2 = pd.read_csv('range_1_cluster_2.csv')
df_range_1_cluster_3 = pd.read_csv('range_1_cluster_3.csv')
df_range_2_cluster_1 = pd.read_csv('range_2_cluster_1.csv')
df_range_2_cluster_2 = pd.read_csv('range_2_cluter_2.csv')
df_range_2_cluster_3 = pd.read_csv('range_2_cluter_3.csv')

In [3]:
important_features = ['Beds','Baths','h_l_ratio','Bed Size','beds_bath_ratio', 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39]
important_features = [ str(f) for f in important_features]

In [19]:
len(important_features)

45

In [4]:
params_grid = { 
    'n_estimators': list(range(10, 100, 3)),
    'max_features': [1, 2, 3, 4, 5, 6,7, 8, 9, 10, 13, 14, 15,'auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['mse', 'mae']
}

In [5]:
def tune_params(df, params, file_name, cols):
    X_train, X_test, Y_train, Y_test = train_test_split(
        df[cols],
        df['Price'],
        test_size=0.33,
        random_state=42
    )
    grid_search = GridSearchCV(param_grid=params_grid, estimator=RandomForestRegressor(), cv=5, n_jobs=-1, scoring='r2')
    grid_search.fit(X_train,Y_train)
    #Save grid_search obj
    with open(file_name, 'wb') as file:
      pickle.dump(grid_search, file)
    print(grid_search.best_params_)
    return grid_search

In [None]:
tune_params(df_range_3,params_grid, 'range_3', important_features).best_score_

In [None]:
print(tune_params(df_range_1_cluster_1,params_grid, 'range_1_cluster_1', important_features).best_score_)
print(tune_params(df_range_1_cluster_2,params_grid, 'range_1_cluster_2', important_features).best_score_)
print(tune_params(df_range_1_cluster_3,params_grid, 'range_1_cluster_3', important_features).best_score_)
print(tune_params(df_range_2_cluster_1,params_grid, 'range_2_cluster_1', important_features).best_score_)
print(tune_params(df_range_2_cluster_2,params_grid, 'range_2_cluster_2', important_features).best_score_)
print(tune_params(df_range_2_cluster_3,params_grid, 'range_2_cluster_3', important_features).best_score_)

In [7]:
rf_11 = None;
rf_12 = None;
rf_13 = None
rf_21 = None
rf_22 = None
rf_23 = None

In [9]:
def load_model(n_range, n_cluster, model_obj):
    with open('./models/range_'+ str(n_range) + '_cluster_'+str(n_cluster), 'rb') as file:
        model_obj = pickle.load(file)
    return model_obj;
    

In [26]:
rf_11 = load_model(1, 1, rf_11).best_estimator_
rf_12 = load_model(1, 2, rf_12).best_estimator_
rf_13 = load_model(1, 3, rf_13).best_estimator_
rf_21 = load_model(2, 1, rf_21).best_estimator_
rf_22 = load_model(2, 2, rf_22).best_estimator_
rf_23 = load_model(2, 3, rf_23).best_estimator_

In [38]:
rf_11.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 8,
 'max_features': 10,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 16,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [39]:
params_dict = {
    'rf_11': rf_11.get_params(),
    'rf_12': rf_12.get_params(),
    'rf_13': rf_13.get_params(),
    'rf_21': rf_21.get_params(),
    'rf_22': rf_22.get_params(),
    'rf_23': rf_23.get_params()
}

In [109]:
class HousePredictionModel:
    def __init__(self, data, params_dict):
        self.data = data;
        self.param_11 = params_dict['rf_11']
        self.param_12 = params_dict['rf_12']
        self.param_13 = params_dict['rf_13']
        self.param_21 = params_dict['rf_21']
        self.param_22 = params_dict['rf_22']
        self.param_23 = params_dict['rf_23']
        self.rf_11 = None;
        self.rf_12 = None;
        self.rf_13 = None
        self.rf_21 = None;
        self.rf_22 = None;
        self.rf_23 = None;
        
    def predict(self, x_test):
        #rf_11
        pred_11 = self.rf_11.predict(x_test)
        #rf_12
        pred_12 = self.rf_12.predict(x_test)
        #rf_13
        pred_13 = self.rf_13.predict(x_test)
        #rf_21
        pred_21 = self.rf_21.predict(x_test)
        #rf_22
        pred_22 = self.rf_22.predict(x_test)
        #rff_23
        pred_23 = self.rf_23.predict(x_test)
        return {
            'rf_11': pred_11,
            'rf_12': pred_12,
            'rf_13': pred_13,
            'rf_21': pred_21,
            'rf_22': pred_22,
            'rf_23': pred_23
        }
        
        
    def train(self):
        #rf_11
        if (self.rf_11 is None):
            self.rf_11 = RandomForestRegressor(**self.param_11)
            self.rf_11.fit(self.data['rf_11'][0], self.data['rf_11'][2])
            pred_11 = self.rf_11.predict(self.data['rf_11'][1])
            rmse_11 = np.sqrt(((pred_11 - self.data['rf_11'][3]) ** 2).mean())
            r2_11 = self.rf_11.score(self.data['rf_11'][1], self.data['rf_11'][3])
             #print metrics of models
            print('-----------------------------------------------------------------------------------')
            print('rf_1_1');
            print('-----------------------------------------------------------------------------------')
            print('rmse: ' + str(rmse_11))
            print('r2: ' + str(r2_11))
            with open('rf_11', 'wb') as file:
                pickle.dump(rf_11, file)
        else:
            with open('rf_11', 'rb') as file:
                self.rf_11 = pickle.load(file)
        #rf_12
        if (self.rf_12 is None):
            self.rf_12 = RandomForestRegressor(**self.param_12)
            self.rf_12.fit(self.data['rf_12'][0], self.data['rf_12'][2])
            pred_12 = self.rf_12.predict(self.data['rf_12'][1])
            rmse_12 = np.sqrt(((pred_12 - self.data['rf_12'][3]) ** 2).mean())
            r2_12 = self.rf_12.score(self.data['rf_12'][1], self.data['rf_12'][3])
            print('-----------------------------------------------------------------------------------')
            print('rf_1_2');
            print('-----------------------------------------------------------------------------------')
            print('rmse: ' + str(rmse_12))
            print('r2: ' + str(r2_12))
            with open('rf_12', 'wb') as file:
                pickle.dump(rf_12, file)
        else:
            with open('rf_12', 'rb') as file:
                self.rf_12 = pickle.load(file)
        
        #rf_13
        if(self.rf_13 is None):
            self.rf_13 = RandomForestRegressor(**self.param_13)
            self.rf_13.fit(self.data['rf_13'][0], self.data['rf_13'][2])
            pred_13 = self.rf_13.predict(self.data['rf_13'][1])
            rmse_13 = np.sqrt(((pred_13 - self.data['rf_13'][3]) ** 2).mean())
            r2_13 = self.rf_13.score(self.data['rf_13'][1], self.data['rf_13'][3])
            print('-----------------------------------------------------------------------------------')
            print('rf_1_3');
            print('-----------------------------------------------------------------------------------')
            print('rmse: ' + str(rmse_13))
            print('r2: ' + str(r2_13))
            with open('rf_13', 'wb') as file:
                pickle.dump(rf_13, file)
        else:
            with open('rf_13', 'rb') as file:
                self.rf_13 = pickle.load(file)
        
        #rf_21
        if(self.rf_21 is None):
            self.rf_21 = RandomForestRegressor(**self.param_21)
            self.rf_21.fit(self.data['rf_21'][0], self.data['rf_21'][2])
            pred_21 = self.rf_21.predict(self.data['rf_21'][1])
            rmse_21 = np.sqrt(((pred_21 - self.data['rf_21'][3]) ** 2).mean())
            r2_21 = self.rf_21.score(self.data['rf_21'][1], self.data['rf_21'][3])
            print('-----------------------------------------------------------------------------------')
            print('rf_2_1');
            print('-----------------------------------------------------------------------------------')
            print('rmse: ' + str(rmse_21))
            print('r2: ' + str(r2_21))
            with open('rf_21', 'wb') as file:
                pickle.dump(rf_21, file)
        else:
            with open('rf_21', 'rb') as file:
                self.rf_21 = pickle.load(file) 
        #rf_22
        if(self.rf_22 is None):
            self.rf_22 = RandomForestRegressor(**self.param_22)
            self.rf_22.fit(self.data['rf_22'][0], self.data['rf_22'][2])
            pred_22 = self.rf_22.predict(self.data['rf_22'][1])
            rmse_22 = np.sqrt(((pred_22 - self.data['rf_22'][3]) ** 2).mean())
            r2_22 = self.rf_22.score(self.data['rf_22'][1], self.data['rf_22'][3])
            print('-----------------------------------------------------------------------------------')
            print('rf_2_2');
            print('-----------------------------------------------------------------------------------')
            print('rmse: ' + str(rmse_22))
            print('r2: ' + str(r2_22))
            with open('rf_22', 'wb') as file:
                pickle.dump(rf_22, file)
        else:
            with open('rf_22', 'rb') as file:
                self.rf_22 = pickle.load(file) 
        #rf_23
        if(self.rf_23 is None):
            self.rf_23 = RandomForestRegressor(**self.param_23)
            self.rf_23.fit(self.data['rf_23'][0], self.data['rf_23'][2])
            pred_23 = self.rf_23.predict(self.data['rf_23'][1])
            rmse_23 = np.sqrt(((pred_23 - self.data['rf_23'][3]) ** 2).mean())
            r2_23 = self.rf_23.score(self.data['rf_23'][1], self.data['rf_23'][3])
            print('-----------------------------------------------------------------------------------')
            print('rf_2_3');
            print('-----------------------------------------------------------------------------------')
            print('rmse: ' + str(rmse_23))
            print('r2: ' + str(r2_23))
            with open('rf_23', 'wb') as file:
                pickle.dump(rf_23, file)
        else:
            with open('rf_23', 'rb') as file:
                self.rf_23 = pickle.load(file) 
       
        
       
        
        
      

In [110]:
important_features = ['Beds','Baths','h_l_ratio','Bed Size','beds_bath_ratio', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39']


In [111]:
features_dict = {
    'rf_11': important_features,
    'rf_12': important_features,
    'rf_13': important_features,
    'rf_21': important_features,
    'rf_22': important_features,
    'rf_23': important_features
}

In [112]:
data = {
    'rf_11': (train_test_split(
            df_range_1_cluster_1[features_dict['rf_11']],
            df_range_1_cluster_1['Price'],
            test_size=0.33,
            random_state=42
        )),
    'rf_12': (train_test_split(
            df_range_1_cluster_2[features_dict['rf_12']],
            df_range_1_cluster_2['Price'],
            test_size=0.33,
            random_state=42
        )),
    'rf_13': (train_test_split(
            df_range_1_cluster_3[features_dict['rf_13']],
            df_range_1_cluster_3['Price'],
            test_size=0.33,
            random_state=42
        )),
    'rf_21': (train_test_split(
            df_range_2_cluster_1[features_dict['rf_21']],
            df_range_2_cluster_1['Price'],
            test_size=0.33,
            random_state=42
        )),
    'rf_22': (train_test_split(
            df_range_2_cluster_2[features_dict['rf_22']],
            df_range_2_cluster_2['Price'],
            test_size=0.33,
            random_state=42
        )),
    'rf_23': (train_test_split(
            df_range_2_cluster_3[features_dict['rf_23']],
            df_range_2_cluster_3['Price'],
            test_size=0.33,
            random_state=42
        ))
}

In [113]:
data['rf_11'][0]

Unnamed: 0,Beds,Baths,h_l_ratio,Bed Size,beds_bath_ratio,0,1,2,3,4,...,30,31,32,33,34,35,36,37,38,39
264,3,2,0.366610,349.333333,1.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
506,2,1,0.367309,600.000000,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
572,3,2,0.500877,500.000000,1.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
346,2,1,0.459137,750.000000,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
296,2,1,1.000003,912.040000,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2,1,0.821549,335.500000,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,3,1,0.795837,650.000000,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,3,1,0.237826,600.000000,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,3,1,0.440771,400.000000,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
model = HousePredictionModel(
    params_dict=params_dict,
    data=data
)

In [115]:
model.train()

-----------------------------------------------------------------------------------
rf_1_1
-----------------------------------------------------------------------------------
rmse: 1498063.6749245995
r2: 0.2923057264056126
-----------------------------------------------------------------------------------
rf_1_2
-----------------------------------------------------------------------------------
rmse: 1188645.6798589274
r2: 0.23555253310205138
-----------------------------------------------------------------------------------
rf_1_3
-----------------------------------------------------------------------------------
rmse: 1150491.0605184173
r2: 0.3911600182064555
-----------------------------------------------------------------------------------
rf_2_1
-----------------------------------------------------------------------------------
rmse: 1165943.7306660283
r2: 0.31579064987745376
-----------------------------------------------------------------------------------
rf_2_2
---------------

In [116]:
model.train()

In [117]:
model.predict(data['rf_11'][3])

ValueError: Expected 2D array, got 1D array instead:
array=[6000000. 9000000. 8700000. 9200000. 5500000. 7500000. 7900000. 6800000.
 8800000. 9200000. 5600000. 4400000. 8900000. 8000000. 7900000. 8500000.
 7000000. 3000000. 6800000. 8000000. 7000000. 3500000. 7500000. 4000000.
 5700000. 7500000. 6500000. 9000000. 4100000. 8800000. 8200000. 8700000.
 7500000. 7000000. 6500000. 9000000. 7200000. 7900000. 6900000. 7500000.
 8300000. 7800000. 8300000. 4700000. 6100000. 5600000. 9000000. 3600000.
 8000000. 6000000. 6500000. 2700000. 7500000. 8000000. 8000000. 7800000.
 7000000. 6500000. 5700000. 7000000. 6500000. 6000000. 8500000. 3000000.
 7500000. 3500000. 7300000. 9000000. 8500000. 8000000. 2500000. 5200000.
 3950000. 1500000. 5500000. 5800000. 9300000. 6900000. 5800000. 4200000.
 3600000. 9000000. 7400000. 8000000. 3500000. 7200000. 6900000. 7800000.
 6800000. 7000000. 6000000. 6800000. 5800000. 6000000. 2500000. 8500000.
 8000000. 8500000. 8700000. 4500000. 7000000. 3000000. 4000000. 5500000.
 7500000. 9000000. 8800000. 8600000. 7950000. 8400000. 5000000. 7800000.
 5000000. 7000000. 6700000. 5500000. 8500000. 9000000. 4300000. 5300000.
 6300000. 8500000. 6500000. 7200000. 7800000. 6800000. 8000000. 9000000.
 8700000. 8800000. 4000000. 7900000. 3600000. 3300000. 6800000. 8500000.
 9000000. 7500000. 5000000. 6500000. 8000000. 8000000. 4500000. 8000000.
 8500000. 8500000. 7900000. 4500000. 7425000. 5900000. 8500000. 5500000.
 7900000. 5700000. 8000000. 6500000. 8500000. 9000000. 5200000. 7000000.
 8000000. 8000000. 5300000. 3600000. 8500000. 5600000. 4000000. 8500000.
 8700000. 7000000. 6200000. 6900000. 9000000. 6500000. 6600000. 7000000.
 7800000. 9000000. 8200000. 6850000. 4500000. 8000000. 7800000. 4900000.
 5500000. 9000000. 8000000. 7900000. 5700000. 6700000. 5500000. 7500000.
 3500000. 6300000. 8500000. 2600000. 5000000. 6500000. 6500000. 7500000.
 4500000. 8000000. 8300000. 8500000. 4800000. 5500000. 2700000. 5900000.
 7900000. 5900000. 6800000. 9000000. 6400000. 8600000. 8300000. 2800000.
 4000000. 8000000. 5200000. 9000000. 7000000. 5800000. 7900000. 8000000.
 9000000. 7200000. 8900000. 3700000. 8500000. 1500000. 6500000. 3800000.
 8800000. 8000000. 8750000. 6000000. 5500000. 9000000. 8500000. 6900000.
 5800000. 8000000. 7500000. 6600000. 7800000. 3950000. 4850000. 9000000.
 6600000. 7800000. 8500000. 9000000. 8500000. 7800000. 9000000. 5800000.
 2900000. 6800000. 7900000. 6800000. 7900000. 8800000. 8800000. 8000000.
 4600000. 8000000. 7500000. 7500000. 4700000. 8500000. 6500000. 5500000.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.