In [1]:
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [10]:
def check_r2(X, y, random_state=None):
    """Print and return hold-out r2 scores of per-cluster random forests.

    The rows are split once into a 70% training part and a 30% hold-out part.
    For every cluster count ``n`` from 10 to 20 inclusive, KMeans is fitted on
    the training part, one RandomForestRegressor is fitted per cluster on that
    cluster's training rows, and every hold-out row is predicted by the forest
    of the cluster KMeans assigns it to. The r2 over all hold-out rows is
    printed for each ``n``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Rows are matched to ``y`` by position, not by index.
    y : pandas.DataFrame
        Targets; must contain a ``'Price'`` column and have as many rows as
        ``X``.
    random_state : int or None, default None
        Seed forwarded to the split, to KMeans and to every forest. ``None``
        keeps the original unseeded behaviour.

    Returns
    -------
    dict
        Maps each cluster count ``n`` to its hold-out r2 score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )
    # Plain arrays so the boolean cluster masks below select by position.
    price_train = y_train['Price'].to_numpy()
    price_test = y_test['Price'].to_numpy()

    scores = {}
    for n in range(10, 21):
        kmeans = KMeans(n_clusters=n, n_init=5, random_state=random_state)
        labels_train = kmeans.fit_predict(X_train)
        labels_test = kmeans.predict(X_test)

        # Every hold-out row belongs to exactly one cluster, so each slot is
        # written exactly once by the loop below.
        price_pred = np.empty(len(X_test))
        for idx in range(n):
            in_train = labels_train == idx
            in_test = labels_test == idx
            # A cluster may receive no hold-out rows; predicting on a zero-row
            # frame raises, and there is nothing to score for it anyway.
            if not in_test.any():
                continue

            clf = RandomForestRegressor(
                n_estimators=200, max_depth=6, max_features=5,
                random_state=random_state,
            )
            clf.fit(X_train.loc[in_train], price_train[in_train])
            price_pred[in_test] = clf.predict(X_test.loc[in_test])

        r2 = r2_score(price_test, price_pred)
        scores[n] = r2
        print(f'For {n} clusters, r2 score: {r2}')

    return scores

In [3]:
# Prepared feature set: read both splits and key the rows by their 'Id' column.
train = pd.read_csv('data_housing_model/train_housing_prepared').set_index('Id')
test = pd.read_csv('data_housing_model/test_housing_prepared').set_index('Id')

In [5]:
# "standardized" variant of the prepared features, rows keyed by 'Id'.
train_standardized = pd.read_csv(
    'data_housing_model/train_standardized_housing_prepared'
).set_index('Id')
test_standardized = pd.read_csv(
    'data_housing_model/test_standardized_housing_prepared'
).set_index('Id')

In [6]:
# "normalized" variant of the prepared features, rows keyed by 'Id'.
train_normalized = pd.read_csv(
    'data_housing_model/train_normalized_housing_prepared'
).set_index('Id')
test_normalized = pd.read_csv(
    'data_housing_model/test_normalized_housing_prepared'
).set_index('Id')

In [7]:
# "scaled" variant of the prepared features, rows keyed by 'Id'.
train_scaled = pd.read_csv(
    'data_housing_model/train_scaled_housing_prepared'
).set_index('Id')
test_scaled = pd.read_csv(
    'data_housing_model/test_scaled_housing_prepared'
).set_index('Id')

In [8]:
# Targets: the 'Id' column is dropped, so y keeps the default positional index
# and is matched to the feature frames by row position.
y = pd.read_csv('data_housing_model/housing_y').drop(columns='Id')

In [11]:
# Cluster-count sweep (10 to 20) on the `train` feature set.
check_r2(X=train, y=y)

For 10 clusters, r2 score: 0.6855340387855915
For 11 clusters, r2 score: 0.6864916507232922
For 12 clusters, r2 score: 0.685760317541176
For 13 clusters, r2 score: 0.6894060839702139
For 14 clusters, r2 score: 0.6880529104629021
For 15 clusters, r2 score: 0.6907914896872297
For 16 clusters, r2 score: 0.6865978399431192
For 17 clusters, r2 score: 0.6891165146879302
For 18 clusters, r2 score: 0.6861665667724139
For 19 clusters, r2 score: 0.6915061302858141
For 20 clusters, r2 score: 0.6901737015337474


In [12]:
# Cluster-count sweep (10 to 20) on the `train_standardized` feature set.
check_r2(X=train_standardized, y=y)

For 10 clusters, r2 score: 0.713668029191733
For 11 clusters, r2 score: 0.70885545694975
For 12 clusters, r2 score: 0.7072120481768964
For 13 clusters, r2 score: 0.7076116445086065
For 14 clusters, r2 score: 0.7108469207800681
For 15 clusters, r2 score: 0.7074216137057353
For 16 clusters, r2 score: 0.7090752580847535
For 17 clusters, r2 score: 0.7087502992037831
For 18 clusters, r2 score: 0.7049437557765876
For 19 clusters, r2 score: 0.7102714729608868
For 20 clusters, r2 score: 0.7060010472294858


In [13]:
# Cluster-count sweep (10 to 20) on the `train_normalized` feature set.
check_r2(X=train_normalized, y=y)

For 10 clusters, r2 score: 0.713410511986543
For 11 clusters, r2 score: 0.7186357375921921
For 12 clusters, r2 score: 0.7093319872753192
For 13 clusters, r2 score: 0.7136609792701064
For 14 clusters, r2 score: 0.719050402040391
For 15 clusters, r2 score: 0.715777866310706
For 16 clusters, r2 score: 0.7166220680956289
For 17 clusters, r2 score: 0.7175259512053153
For 18 clusters, r2 score: 0.7142802165217135
For 19 clusters, r2 score: 0.7151302639315928
For 20 clusters, r2 score: 0.7170946131599676


In [14]:
# Cluster-count sweep (10 to 20) on the `train_scaled` feature set.
check_r2(X=train_scaled, y=y)

For 10 clusters, r2 score: 0.6826633348485136
For 11 clusters, r2 score: 0.6798148298507443
For 12 clusters, r2 score: 0.6828782020622726
For 13 clusters, r2 score: 0.6781218984289983
For 14 clusters, r2 score: 0.6822168260198892
For 15 clusters, r2 score: 0.6882487083959709
For 16 clusters, r2 score: 0.683282050048366
For 17 clusters, r2 score: 0.6781866438719182
For 18 clusters, r2 score: 0.6759673179742034
For 19 clusters, r2 score: 0.6825932866341644
For 20 clusters, r2 score: 0.6851030349804308


In [18]:
def predict_test(train, test, y, n_clusters, name, random_state=None):
    """Predict 'Price' for ``test`` with one random forest per KMeans cluster.

    KMeans is fitted on ``train``; each cluster gets its own
    RandomForestRegressor fitted on that cluster's training rows, and every
    row of ``test`` is predicted by the forest of the cluster KMeans assigns
    it to.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features. Rows are matched to ``y`` by position.
    test : pandas.DataFrame
        Features to predict; its index values are reported as ``'Id'``.
        Assumed to have the same columns as ``train``.
    y : pandas.DataFrame
        Training targets; must contain a ``'Price'`` column and have as many
        rows as ``train``.
    n_clusters : int
        Number of KMeans clusters, and therefore of forests.
    name : str
        Label of the feature set. Accepted for interface compatibility; it is
        not used in the computation.
    random_state : int or None, default None
        Seed forwarded to KMeans and to every forest. ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Id'`` (the index values of ``test``, in the row order of
        ``test``) and ``'Price'`` (the predictions).
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=5, random_state=random_state)
    labels_train = kmeans.fit_predict(train)
    labels_test = kmeans.predict(test)

    # Plain array so the boolean cluster masks below select by position.
    price_train = y['Price'].to_numpy()
    # Filled in place per cluster; this keeps the row order of `test` and
    # avoids routing the ids through a float array and a merge.
    price_pred = np.full(len(test), np.nan)

    for idx in range(n_clusters):
        in_train = labels_train == idx
        in_test = labels_test == idx
        # A cluster may receive no test rows; predicting on a zero-row frame
        # raises, so such clusters are skipped.
        if not in_test.any():
            continue

        clf = RandomForestRegressor(
            n_estimators=200, max_depth=6, max_features=5,
            random_state=random_state,
        )
        clf.fit(train.loc[in_train], price_train[in_train])
        price_pred[in_test] = clf.predict(test.loc[in_test])

    result = pd.DataFrame({'Id': test.index.to_numpy(), 'Price': price_pred})
    return result

In [19]:
# n_clusters=19 had the highest r2 in the recorded check_r2(train, y) output.
predict_test(train=train, test=test, y=y, n_clusters=19, name='train')

In [20]:
# n_clusters=10 had the highest r2 in the recorded check_r2(train_standardized, y) output.
predict_test(
    train=train_standardized,
    test=test_standardized,
    y=y,
    n_clusters=10,
    name='train_standardized',
)

In [21]:
# n_clusters=14 had the highest r2 in the recorded check_r2(train_normalized, y) output.
predict_test(
    train=train_normalized,
    test=test_normalized,
    y=y,
    n_clusters=14,
    name='train_normalized',
)

In [22]:
# n_clusters=15 had the highest r2 in the recorded check_r2(train_scaled, y) output.
predict_test(
    train=train_scaled,
    test=test_scaled,
    y=y,
    n_clusters=15,
    name='train_scaled',
)