In [2]:
#new collected data
# instances: 7577, features: 6

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

df = pd.read_csv("../house_data_set_cleaned_3.csv")

X = df.drop('price', axis = 1)
y = df['price']

model_params = {
    'lr': {
        'model': LinearRegression(),
        'params': {}
    },
    'rf': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [10, 20, 50, 70, 100],
            'max_features': [1, 2, 3, 4, 5],
            'max_depth': [1, 2, 5, 10, 15, 20, 25]
        }
    },
    'knn': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [5, 10, 15, 25, 30, 50, 100]
        }
    }
}

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df.to_csv('result_7577.csv', index=False)
df

Unnamed: 0,model,best_score,best_params
0,lr,0.408505,{}
1,rf,0.710708,"{'max_depth': 15, 'max_features': 3, 'n_estima..."
2,knn,0.518124,{'n_neighbors': 5}


In [3]:
# one hot encoded locations

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv("../house_data_colombo_one_hot_encoded_locations.csv")

X = df.drop('price', axis = 1)
y = df['price']

model_params = {
    'lr': {
        'model': LinearRegression(),
        'params': {}
    },
    'rf': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [10, 20, 50, 70, 100],
            'max_features': [1, 2, 3, 4, 5],
            'max_depth': [1, 2, 5, 10, 15, 20, 25]
        }
    },
    'knn': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [5, 10, 15, 25, 30, 50, 100]
        }
    }
}

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

print(scores)
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,lr,0.598509,{}
1,rf,0.728939,"{'max_depth': 25, 'max_features': 4, 'n_estima..."
2,knn,0.435482,{'n_neighbors': 5}


In [4]:
# removed outliers by clustering

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv("../house_data_colombo_cluster1.csv")

X = df.drop('price', axis = 1)
y = df['price']

model_params = {
    'lr': {
        'model': LinearRegression(),
        'params': {}
    },
    'rf': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [10, 20, 50, 70, 100],
            'max_features': [1, 2, 3, 4, 5],
            'max_depth': [1, 2, 5, 10, 15, 20, 25]
        }
    },
    'knn': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [5, 10, 15, 25, 30, 50, 100]
        }
    }
}

scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

print(scores)
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

[{'model': 'lr', 'best_score': 0.6918110913798157, 'best_params': {}}, {'model': 'rf', 'best_score': 0.7883936083347778, 'best_params': {'max_depth': 25, 'max_features': 4, 'n_estimators': 100}}, {'model': 'knn', 'best_score': 0.590387794056465, 'best_params': {'n_neighbors': 5}}]


Unnamed: 0,model,best_score,best_params
0,lr,0.691811,{}
1,rf,0.788394,"{'max_depth': 25, 'max_features': 4, 'n_estima..."
2,knn,0.590388,{'n_neighbors': 5}


In [9]:
# result with tuned parameters

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
df = pd.read_csv("../house_data_colombo_cluster1.csv")


X = df.drop('price', axis = 1)
y = df['price']

score_lr = cross_val_score(LinearRegression(), X, y, cv=10)
score_rf = cross_val_score(RandomForestRegressor(max_depth=25, max_features=4, n_estimators=100), X, y, cv=10)
score_knn = cross_val_score(KNeighborsRegressor(n_neighbors=5), X, y, cv=10)

print('LR: ', np.mean(score_lr))
print('RF: ', np.mean(score_rf))
print('KNN: ', np.mean(score_knn))

LR:  0.6918110913798157
RF:  0.7873726662643744
KNN:  0.590387794056465
