In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import util_michael as util_m
from util import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost as xgb

In [3]:
train = pd.read_csv('./data/cleaner_train.csv', na_filter=False) 
test = pd.read_csv('./data/cleaner_test.csv', na_filter=False)
Y = train['SalePrice']
train.drop(columns=['SalePrice'], inplace=True)
train.shape, test.shape, Y.shape

((1448, 126), (1459, 126), (1448,))

In [4]:
X = np.array(train.drop(columns=['Id']))
Xt = np.array(test.drop(columns=['Id']))
Y = np.log(np.array(Y))

In [5]:
Xscale, Xtscale = util_m.apply_scale(X, Xt)

## PCA

In [6]:
from sklearn.decomposition import PCA
def _eval_components(Z: np.array, comps: set=None):
    if comps is None: 
        comp_from = 1
        comp_to = Z.shape[1]
    else:
        comp_from = comps[0]
        comp_to = comps[1]
    print('Num of components\tCumulative sum')
    for i in range(comp_from, comp_to, 1):
        pca = PCA(n_components=i).fit(Z)
        # print('Variance ratio = ', pca.explained_variance_ratio_)
        print(pca.n_components_, '\t\t\t', sum(pca.explained_variance_ratio_))

_eval_components(Xscale, (83, 94))

Num of components	Cumulative sum
83 			 0.976789994711532
84 			 0.978573532144838
85 			 0.9802498162536578
86 			 0.9818462094488191
87 			 0.9832744775468437
88 			 0.984673723698439
89 			 0.986008417589342
90 			 0.9873179764733694
91 			 0.9885890267903121
92 			 0.9896829015386821
93 			 0.9906913181947847


In [7]:
pca = PCA(n_components=84).fit(Xscale)
Xpca = pca.transform(Xscale)
Xtpca = pca.transform(Xtscale)

# Modelling

In [8]:
def evaluate_regressor(model, X, Y, name=None, nruns=200, other_metric=None):
    r2, mse, extra = [], [], []
    for j in range(nruns):
        xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
        model.fit(xtrain, ytrain)
        YP = model.predict(xtest)
        r2.append(r2_score(YP, ytest))
        mse.append(mean_squared_error(YP, ytest))
        if other_metric!=None:
            keep_positives = YP >= 0
            extra.append(other_metric['call'](YP[keep_positives], ytest[keep_positives]))
    print("Runs:\t\t", nruns)
    print("Mean R2:\t", np.mean(r2), "\nSTD R2:\t\t", np.std(r2))
    print("Mean MSE:\t", np.mean(mse), "\nSTD MSE:\t", np.std(mse))
    if other_metric!=None: print(other_metric['name']+":\t\t", np.mean(extra))
    plt.hist(r2)
    plt.title("R2 Histogram - "+name)
    plt.xlim(0, 1)
    
def show_grid_results(grid_search, all=True):
    print('Best parameters:\n', grid_search.best_params_, '\n', grid_search.best_score_, '\n')
    if all:
        cvres = grid_search.cv_results_
        for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
            print(mean_score, params)

## RandomForest

In [9]:
from sklearn.ensemble import RandomForestRegressor
param_grid = [
    {
        'bootstrap': [True], 'criterion': ['mse'], 'max_features': ['auto'], 
        'n_estimators':['warn'], 'max_leaf_nodes': [None],
#         'max_depth': None,
    }
]
model = RandomForestRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(Xscale, Y)
show_grid_results(grid_search)

Best parameters:
 {'bootstrap': True, 'criterion': 'mse', 'max_features': 'auto', 'max_leaf_nodes': None, 'n_estimators': 'warn'} 
 -0.02014909054683932 

-0.02014909054683932 {'bootstrap': True, 'criterion': 'mse', 'max_features': 'auto', 'max_leaf_nodes': None, 'n_estimators': 'warn'}


In [10]:
best_model = grid_search.best_estimator_

# Predict the test data

In [11]:
Yt = np.power(np.e, best_model.predict(Xtscale))

In [14]:
output = pd.concat([test['Id'], pd.Series(Yt)], axis=1)
output.columns=['Id', 'SalePrice']
output.shape

(1459, 2)

In [15]:
output.head()

Unnamed: 0,Id,SalePrice
0,1461,128317.437381
1,1462,150205.614372
2,1463,186796.264185
3,1464,190228.334889
4,1465,198984.399836


In [17]:
import datetime
x = datetime.datetime.now()
_result_file = 'results/result_'+x.strftime("%b-%d %H %M %S")+'.csv'
output.to_csv(_result_file, index=False)

    06/29 First try
        Kaggle Score 0.15143 with 126 columns using RandomForest (without intensive search of best params/modeling)
       