# Kaggle Competition
- https://www.kaggle.com/competitions/2019-2nd-ml-month-with-kakr/overview
- Note : [Link](https://www.notion.so/parkjaeyoung/Kaggle-4147f4c9dd0b43e284d697c1cb6d7875?pvs=4)

# Library 및 Data Load

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

import xgboost as xgb
from xgboost import XGBRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor


from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline


In [3]:
# Let pandas display up to 100 rows before it truncates the output.
# The option only needs to be set once; a single set_option call is enough.
pd.set_option("display.max_rows", 100)

-----------

# Data Load 

In [4]:
# Feature tables; the first CSV column becomes the index of each frame.
# `x` is what the models are fitted on, `sub` is what they predict on.
x = pd.read_csv('x.csv', index_col=0)
sub = pd.read_csv('sub.csv', index_col=0)

# Training target as a plain Python list (the stored price is log-scaled).
y = pd.read_csv('y.csv', index_col=0)['price_logscaled'].to_list()

# Ids of the rows to predict, as a plain Python list.
test_id = pd.read_csv('test_id.csv', index_col=0)['id'].to_list()

In [5]:
# Sanity check, one value per line: training feature shape, number of
# targets, and shape of the features to predict on.
print(x.shape, len(y), sub.shape, sep="\n")

(15035, 16)
15035
(6468, 16)


## Modeling

In [6]:
# Hyper-parameters for each estimator (presumably the result of an earlier
# search that is not part of this notebook -- confirm before re-tuning).
best_xgb_parameter = dict(
    colsample_bytree=0.6,
    learning_rate=0.05,
    max_depth=7,
    n_estimators=500,
    reg_alpha=0.5,
    reg_lambda=0.5,
    subsample=0.8,
)
best_gbm_parameter = dict(
    colsample_bytree=0.6,
    max_depth=10,
    min_split_gain=0,
    n_estimators=300,
    num_leaves=50,
    reg_alpha=1,
    reg_lambda=1.3,
    subsample=0.9,
    subsample_freq=20,
)
best_gbr_parameter = dict(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=800,
    subsample=0.8,
)

# All three estimators share the same seed so runs are repeatable.
xgb_model = XGBRegressor(random_state=36, n_jobs=-1, **best_xgb_parameter)
gbm_model = LGBMRegressor(random_state=36, n_jobs=-1, **best_gbm_parameter)
gbr_model = GradientBoostingRegressor(random_state=36, **best_gbr_parameter)

# Each entry pairs an estimator with the label used when reporting scores.
models = [
    {'model': estimator, 'name': label}
    for label, estimator in (
        ('XGBoost', xgb_model),
        ('LightGBM', gbm_model),
        ('GradientBoosting', gbr_model),
    )
]

In [7]:
def get_cv_score(models):
    """Print the mean 5-fold cross-validation score of every model.

    Args:
        models: iterable of dicts, each holding the estimator under
            ``'model'`` and the label to print under ``'name'``.

    Returns:
        None. One line per model is written to stdout.

    Notes:
        Reads the module-level ``x`` (features, used as ``x.values``) and
        ``y`` (targets). No ``scoring`` argument is given, so
        ``cross_val_score`` falls back to each estimator's own ``score``
        method (R^2 for scikit-learn style regressors).
    """
    # Pass the splitter object itself: KFold.get_n_splits() only yields the
    # integer fold count, which would throw the configured splitter away.
    # Folds are contiguous and unshuffled.
    kfold = KFold(n_splits=5)
    features = x.values  # loop-invariant, so extract it once
    for m in models:
        cv_score = np.mean(
            cross_val_score(m['model'], X=features, y=y, cv=kfold)
        )
        print(f"Model: {m['name']}, CV score:{cv_score:.4f}")

In [8]:
# Report the mean cross-validation score of each candidate model.
get_cv_score(models)

Model: XGBoost, CV score:0.9080
Model: LightGBM, CV score:0.9064
Model: GradientBoosting, CV score:0.9046


### Average Blending

In [9]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model and return the row-wise mean of their predictions.

    Args:
        models: iterable of dicts whose ``'model'`` entry exposes
            ``fit(X, y)`` and ``predict(X)``.
        x: training features; ``x.values`` is handed to ``fit``.
        y: training targets handed to ``fit``.
        sub_x: features to predict on; ``sub_x.values`` is handed to
            ``predict``.

    Returns:
        1-D numpy array with one averaged prediction per row of ``sub_x``.
    """
    # Stage 1: train each estimator in place on the full training data.
    for entry in models:
        entry['model'].fit(x.values, y)

    # Stage 2: collect one prediction vector per estimator.
    per_model = []
    for entry in models:
        per_model.append(entry['model'].predict(sub_x.values))

    # One column per estimator; averaging across columns blends them equally.
    stacked = np.column_stack(per_model)
    return stacked.mean(axis=1)

In [10]:
# Fit all models on the training data and average their predictions for `sub`.
y_pred = AveragingBlending(models, x, y, sub)

In [11]:
# Map the predictions back from the log scale with exp(p) - 1.
# NOTE(review): presumably the target was built with np.log1p upstream -- confirm.
y_pred = np.expm1(y_pred)

### Make Submission

회귀 모델의 경우 scoring을 따로 지정하지 않으면 cross_val_score 함수는 R<sup>2</sup>를 반환합니다.<br>
R<sup>2</sup> 값이 1에 가까울수록 모델이 데이터를 잘 표현함을 나타냅니다. 3개 트리 모델 모두 훈련 데이터에 대한 교차 검증에서 상당히 괜찮은 성능을 보여주고 있습니다.<br> 훈련 데이터셋으로 3개 모델을 학습시키고, Average Blending을 통해 제출 결과를 만들겠습니다.

In [12]:
# Pair each test id with its predicted price.
sub_final = pd.DataFrame({'id': test_id, 'price': y_pred})

In [13]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sub_final.to_csv('submission.csv', index=False)

# Score

 - Score: 107517.21249
 - Private score: 112048.55987