# Kaggle Competition
- https://www.kaggle.com/competitions/2019-2nd-ml-month-with-kakr/overview
- Note : [Link](https://www.notion.so/parkjaeyoung/Kaggle-4147f4c9dd0b43e284d697c1cb6d7875?pvs=4)

# Library 및 Data Load

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

import xgboost as xgb
from xgboost import XGBRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor


from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline


In [2]:
# Show up to 100 rows when a DataFrame is displayed. The attribute form
# (pd.options.display.max_rows) and pd.set_option() write the same option,
# so a single call is sufficient.
pd.set_option("display.max_rows", 100)

# Data Load

In [27]:
# Load the prepared CSV files; in each one the first column becomes the index.
sub = pd.read_csv("sub.csv", index_col=0)
x = pd.read_csv("x.csv", index_col=0)

# The target file stores log-scaled prices; keep only that column, as a
# plain Python list.
y = pd.read_csv("y.csv", index_col=0)["price_logscaled"].to_list()

In [32]:
# Sanity check, one value per line: shape of x, number of targets in y,
# shape of sub.
print(x.shape, len(y), sub.shape, sep="\n")

(15035, 16)
15035
(6468, 16)


# Grid Search (LightGBM)

#### LightGBM
  - Best Parameters: {'colsample_bytree': 0.7, 'max_depth': 15, 'min_split_gain': 0.3, 'n_estimators': 400, 'num_leaves': 50, 'reg_alpha': 1.1, 'reg_lambda': 1.3, 'subsample': 0.9, 'subsample_freq': 20}
  - Best Score: 0.030046641343574966


In [None]:

# Build the hyper-parameter grid to search; each key maps to the list of
# candidate values for that LightGBM parameter.
param_grid = {
    "n_estimators": [300, 400, 500, 700],
    "colsample_bytree": [0.6, 0.7, 0.8],
    "max_depth": [10, 15, 20],
    "num_leaves": [50, 100, 200],
    "reg_alpha": [1.1, 1.2, 1.3],
    "reg_lambda": [1.1, 1.2, 1.3],
    "min_split_gain": [0.2, 0.3],
    # Single-candidate entries: these two are held fixed during the search.
    "subsample": [0.9],
    "subsample_freq": [20],
}


In [None]:
# Base LightGBM regressor for the grid search; the seed is fixed so runs
# are repeatable.
gbm_model = LGBMRegressor(random_state=36)

In [None]:
# 5-fold cross-validated grid search over param_grid, scored with negative
# mean squared error and run with n_jobs=-1.
gbm_grid_search = GridSearchCV(
    estimator=gbm_model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
gbm_grid_search.fit(x, y)

In [None]:
# Print the best parameters and the best score. The stored score is
# negated before printing so it reads as a positive error value.
print(f"Best Parameters: {gbm_grid_search.best_params_!s}")
print(f"Best Score: {-gbm_grid_search.best_score_!s}")

Best Parameters: {'colsample_bytree': 0.6, 'max_depth': 10, 'min_split_gain': 0, 'n_estimators': 300, 'num_leaves': 50, 'reg_alpha': 1, 'reg_lambda': 1.3, 'subsample': 0.9, 'subsample_freq': 20}
Best Score: 0.025975293963223105


# Modeling

In [None]:
# Hyper-parameters used for the final LightGBM model.
best_gbm_parameter = {
    "colsample_bytree": 0.7,
    "max_depth": 15,
    "min_split_gain": 0.3,
    "n_estimators": 400,
    "num_leaves": 50,
    "reg_alpha": 1.1,
    "reg_lambda": 1.3,
    "subsample": 0.9,
    "subsample_freq": 20,
}

In [None]:
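# Alternative (disabled): reuse the estimator selected by the grid search
# instead of training a new model from best_gbm_parameter.
#gbm_model = gbm_grid_search.best_estimator_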

In [None]:
# Train the final LightGBM regressor on x / y with the chosen
# hyper-parameters, a fixed seed, and n_jobs=-1.
gbm_model = lgb.LGBMRegressor(
    **best_gbm_parameter,
    random_state=36,
    n_jobs=-1,
)
gbm_model.fit(x, y)

In [None]:
# Predict for the rows of `sub` with the fitted model. The model was fit on
# the log-scaled target, so these predictions are on that same scale.
y_pred = gbm_model.predict(sub)

In [None]:
# Map predictions back from the log scale with exp(x) - 1.
# NOTE(review): presumably the target was produced with np.log1p — confirm
# against the preprocessing that wrote y.csv.
y_pred = np.expm1(y_pred)

In [None]:
# Display the back-transformed predictions as the cell output.
y_pred

array([ 505130.07598005,  497950.85272783, 1275140.57589858, ...,
        469077.752153  ,  325265.07499629,  460275.88425493])

### Make Submission

회귀 모델의 경우에는 cross_val_score 함수가 R<sup>2</sup>를 반환합니다.<br>
R<sup>2</sup> 값이 1에 가까울수록 모델이 데이터를 잘 표현함을 나타냅니다. 3개 트리 모델이 상당히 훈련 데이터에 대해 괜찮은 성능을 보여주고 있습니다.<br> 훈련 데이터셋으로 3개 모델을 학습시키고, Average Blending을 통해 제출 결과를 만들겠습니다.<br>
(참고: 아래 코드는 위에서 학습한 LightGBM 단일 모델의 예측값(`y_pred`)만으로 제출 파일을 만듭니다.)

In [None]:
# Assemble the submission table with one predicted price per id.
# `test_id` is not defined anywhere in this notebook, so the original line
# raised NameError. `sub` was loaded with index_col=0, so the ids are taken
# from its index.
# NOTE(review): assumes the first column of sub.csv holds the competition
# ids (not a plain 0..n-1 row counter) — confirm before submitting.
test_id = sub.index.to_numpy()
sub_final = pd.DataFrame(data={'id': test_id, 'price': y_pred})

In [None]:
# Write the submission file; index=False keeps the DataFrame's row index
# out of the CSV so only the frame's own columns are written.
sub_final.to_csv('submission.csv', index=False)

# Score