In [1]:
# 라이브러리 import
import os
import warnings
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.model_selection import (
        KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, RepeatedKFold,
)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import (
    BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,
)

warnings.filterwarnings("ignore")

In [2]:
data_list = os.listdir("./Data")
data_list

['submission_baseline_rf.csv',
 'submission.csv',
 'FIFA_train.csv',
 'FIFA_test.csv',
 'bagging.png',
 'EDA_train.csv',
 'train_log.ftr',
 'EDA_test.csv']

In [3]:
train = pd.read_csv("./Data/" + data_list[5])
test = pd.read_csv("./Data/" + data_list[7])

In [5]:
# 데이터 x,y 분리
x_train = train.drop("value", axis=1)
y_train = train["value"]

In [6]:
kfold = KFold(n_splits=7, shuffle=True, random_state=120)

In [7]:
# 회귀에는 사용 x, 분류에만 사용
#stratifiedkfold = StratifiedKFold()

In [8]:
# train set, validation set의 index를 반환 해줌.
for i, (t,v) in enumerate(kfold.split (train)):
    
    # train, val 분리
    trn = train.iloc[t]
    val = train.iloc[v]
    
    # x,y 분리
    x_tr = trn.drop("value", axis=1)
    y_tr = trn["value"]
    
    x_val = val.drop("value", axis=1)
    y_val = val["value"]
    
    # 모델 학습
    rf = RandomForestRegressor(n_estimators=300, random_state=130)
    rf.fit(x_tr, y_tr)
    
    # 예측
    pred = rf.predict(x_val)
    pred = np.expm1(pred)
    
    y_val = np.expm1(y_val)
    
    # rmse
    mse = mean_squared_error(y_val, pred)
    rmse = np.sqrt(mse)
    
    print(f"{i+1}번 모델 rmse : {rmse}")

1번 모델 rmse : 775235.2091820899
2번 모델 rmse : 718527.448654653
3번 모델 rmse : 707462.5027556329
4번 모델 rmse : 518850.39267032
5번 모델 rmse : 669402.0106647008
6번 모델 rmse : 823328.4820529153
7번 모델 rmse : 1465898.600772178


In [9]:
# GridSearch
rf = RandomForestRegressor(random_state=120)

In [10]:
#parameters
params = {
    "n_estimators" : [300, 400, 500],
    "min_samples_split" : [2,3,4],
    "min_samples_leaf" : [1,2,3]
}

In [11]:
cv = KFold(n_splits=3, shuffle=True, random_state=120)

In [12]:
grid = GridSearchCV(
    rf, 
    param_grid= params, 
    cv=cv, 
    scoring='neg_mean_squared_error',
    verbose=2, 
    n_jobs=-1
)

In [13]:
grid.fit(x_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.4min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=120, shuffle=True),
             estimator=RandomForestRegressor(random_state=120), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [300, 400, 500]},
             scoring='neg_mean_squared_error', verbose=2)

In [14]:
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_index_)
print(grid.best_score_)

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
RandomForestRegressor(n_estimators=300, random_state=120)
0
-0.010073944373250961


In [15]:
RandomForestRegressor(**grid.best_params_)

RandomForestRegressor(n_estimators=300)

In [16]:
# 서치 결과 가장 좋은 estimator로 학습
grid.best_estimator_.fit(x_train, y_train)

RandomForestRegressor(n_estimators=300, random_state=120)

In [17]:
# 학습 후 예측 
grid.best_estimator_.predict(test)

ValueError: Number of features of the model must match the input. Model n_features is 26 and input n_features is 27 

In [None]:
# RandomSearch
rf = RandomForestRegressor(random_state=130)

In [None]:
from scipy.stats import randint

In [None]:
params = {
    "n_estimators" : randint(100,600), # randomint를 import 해서 100~600사이의 숫자를 추출함
    "min_samples_split" : randint(1,8),
    "min_samples_leaf" : randint(1,5)
}

In [None]:
cv = RepeatedKFold(n_splits=3, shuffle=True, random_state=120, n_repeats=3)

In [None]:
random_search = RandomizedSearchCV(
    rf, 
    param_distributions=params,
    cv = cv
    n_iter = 20,
    scoring="neg_mean_squared_error",
    verbose = 1
    n_jobs = -1
)

In [None]:
random_search.fit(x_train, y_train)

In [None]:
print(random_search.best_estimator_)
print(random_search.best_params_)
print(random_search.best_index_)
print(random_search.best_score_)

In [None]:
# 모델 앙상블 
rf_1 = grid.best_estimator_
rf_2 = random_search.best_estimator_

In [None]:
rf_1.fit(x_train, y_train)
rf_2.fit(x_train, y_train)

In [None]:
pred_1 = rf_1.predict(test)
pred_2 = rf_2.predict(test)

In [None]:
# pred1과 pred2에 각각 가중치를 0.5씩 줌 -> 왠만하면 성능이 향상됨.
(pred_1 * 0.5)  + (pred_2 * 0.5)
pred

In [None]:
# numpy의 ranom seed 고정
np.random.see(123)

In [None]:
# 앙상블 모델에 bagging 
prediction_list = [] # 예측값을 저장할 list

for _ in tqdm(range(10)):
    # data_index = x_train.index
    data_index = [ idx for idx in range(x_train.shape[0])] # x_train 길이만큼 index를 저장
    random_index = np.random.choice(data_index, x_train.shape[0], replace=True) # 인덱스를 복원 추출하기 위해 replace = True로 줌
    
    rf = RandomForestRegressor(**random_search.best_params_) # search하여 찾은 가장 좋은 파라미터를 넣음
    rf.fit(x_train.iloc[random_index,], y_train.iloc[random_index,]) # 모델 학습
    
    pred = rf.predict(test)
    pred = np.expm1(pred) # 지수함수 적용
    
    prediction_list.append(pred) # 예측된 값을 list에 넣음 -> 10개 모델 돌면서 다 저장

In [None]:
prediction = []

for idx2 in range(test.shape[0]):
    temp = []
    
    # 각 array line by line으로 평균을 내어 prediction에 저장.
    for idx in range(len(prediction_list)):
        temp.append(prediction_list[idx][idx2])
    prediction.append(np.mean(temp))

In [None]:
prediction_list

In [None]:
# row 방향으로 돌고 있음
prediction_list[0][0]
preediction_list[1][0]

In [None]:
df = pd.DataFrame({
    "p0" : prediction_list[0],
    "p1" : prediction_list[1],
    "p2" : prediction_list[2],
    "p3" : prediction_list[3],
    "p4" : prediction_list[4],
    "p5" : prediction_list[5],
    "p6" : prediction_list[6],
    "p7" : prediction_list[7],
    "p8" : prediction_list[8],
    "p9" : prediction_list[9]
})

df

In [None]:
np.mean[df.iloc[0]]

In [None]:
# 데이터 프레임에 넣지 않고 행별로 평균을 냄.
prediction[0]

In [None]:
len(prediction)

In [None]:
len(test)

배깅할 때 XGBoost, Lightbgm 등의 알고리즘을 여러개 사용하고 그 예측값을 평균내는 것이 더 괜찮나요 ?
아니면 하나의 모델만 가지고 하는게 더 나은가요?  

-> 경험상 Lightbgm만 넣은 것이 가장 좋았음.