In [8]:
# previous semester
# with outliers
# instances: 535, columns: 6 (5 predictors + target 'price')
# without parameter tuning

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Fixed seed: the random forest is the only stochastic model in this
# comparison, and without a seed its cross-validated score changes on
# every run. Scores recorded before the seed was added may differ slightly.
RANDOM_STATE = 42

# Load the data set; 'price' is the regression target and every other
# column is used as a predictor.
df = pd.read_csv("../data_set_535_ano_sankee_six.csv")

print(df.shape)

X = df.drop('price', axis = 1)
y = df['price']

# Baseline run: every model keeps its default hyper-parameters, so each
# parameter grid is empty and GridSearchCV evaluates a single candidate.
model_params = {
    'lr': {
        'model': LinearRegression(),
        'params': {}
    },
    'rf': {
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {}
    },
    'knn': {
        'model': KNeighborsRegressor(),
        'params': {}
    }
}

scores = []

for model_name, mp in model_params.items():
    # best_score_ is the mean 5-fold cross-validated score of the best
    # candidate, using the estimator's default scorer.
    # NOTE(review): an integer cv on a regressor means unshuffled KFold; if
    # the CSV rows are ordered (e.g. by price) consider
    # KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE) -- confirm.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# From here on `df` holds the results table, not the raw data; the name is
# kept so later cells that read `df` keep working.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

(535, 6)


Unnamed: 0,model,best_score,best_params
0,lr,-0.017959,{}
1,rf,0.404056,{}
2,knn,0.245178,{}


In [13]:
# previous semester
# with outliers
# instances: 535, columns: 6 (5 predictors + target 'price')
# with parameter tuning

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Fixed seed: without it the random forest scores vary between runs, so the
# parameters selected by the grid search (and the CSV written below) are
# not reproducible. Results recorded before the seed was added may differ.
RANDOM_STATE = 42

# Load the data set; 'price' is the regression target and every other
# column is used as a predictor.
df = pd.read_csv("../data_set_535_ano_sankee_six.csv")

print(df.shape)

X = df.drop('price', axis = 1)
y = df['price']

# Model name -> estimator plus the hyper-parameter grid searched for it.
# Linear regression has nothing to tune, so its grid stays empty.
model_params = {
    'lr': {
        'model': LinearRegression(),
        'params': {}
    },
    'rf': {
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [10, 20, 50, 70, 100],
            # NOTE(review): only a single value is searched for
            # max_features -- confirm this is intentional.
            'max_features': [1],
            'max_depth': [1, 2, 5, 10, 15, 20, 25]
        }
    },
    'knn': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [5, 10, 15, 25, 30, 50, 100]
        }
    }
}

scores = []

for model_name, mp in model_params.items():
    # Exhaustive search over the grid; best_score_ is the mean 5-fold
    # cross-validated score of the best parameter combination, using the
    # estimator's default scorer.
    # NOTE(review): an integer cv on a regressor means unshuffled KFold; if
    # the CSV rows are ordered (e.g. by price) consider
    # KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE) -- confirm.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# From here on `df` holds the results table, not the raw data; the name is
# kept so later cells that read `df` keep working.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
# Persist the comparison so it can be reused outside the notebook.
df.to_csv('first_semister_result_with_outliers.csv', index = False)
df

(535, 6)


Unnamed: 0,model,best_score,best_params
0,lr,-0.017959,{}
1,rf,0.468971,"{'max_depth': 15, 'max_features': 1, 'n_estima..."
2,knn,0.26646,{'n_neighbors': 50}


In [14]:
# Re-score each model with fixed hyper-parameters using plain 5-fold
# cross-validation. Relies on the imports made in the cells above.

# Fixed seed: without it the random forest score changes on every run and
# does not match the grid-search score for the same parameters.
RANDOM_STATE = 42

# Reload the raw data, because `df` was rebound to the results table above.
df = pd.read_csv("../data_set_535_ano_sankee_six.csv")

print(df.shape)

X = df.drop('price', axis = 1)
y = df['price']

score_lr = cross_val_score(LinearRegression(), X, y, cv=5)
# Hyper-parameters are hard-coded here.
# NOTE(review): presumably copied from the tuned grid-search output; the
# recorded best_params display is truncated, so confirm n_estimators=70.
score_rf = cross_val_score(
    RandomForestRegressor(
        max_depth=15,
        max_features=1,
        n_estimators=70,
        random_state=RANDOM_STATE,
    ),
    X,
    y,
    cv=5,
)
score_knn = cross_val_score(KNeighborsRegressor(n_neighbors=50), X, y, cv=5)

# Mean score across the five folds for each model.
print('LR: ', np.mean(score_lr))
print('RF: ', np.mean(score_rf))
print('KNN: ', np.mean(score_knn))

(535, 6)
LR:  -0.017958616178734017
RF:  0.4743602089514427
KNN:  0.26646025708900745
