In [1]:
import os
import warnings
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

warnings.filterwarnings('ignore')

In [2]:
data_list = os.listdir('./data')

In [3]:
data_list

['FIFA_train.csv',
 'FIFA_test.csv',
 'submission_baseline_rf.csv',
 'submission.csv']

In [55]:
data = pd.read_pickle('./data/data.pkl')

In [56]:
train = pd.read_csv('./data/FIFA_train.csv')
test = pd.read_csv('./data/FIFA_test.csv')

In [57]:
train_data = data[:len(train)]

In [80]:
test_data = data[len(train):]

In [59]:
train_data['value'] = train['value'].map(lambda x: np.log1p(x))

In [72]:
# 데이터 분리
X_train = train_data.drop('value', axis=1)
y_train = train_data['value']

In [60]:
kfold = KFold(n_splits=7, shuffle=True, random_state=42)

In [61]:
# label의 불균형을 억제해준다. 분류모델에서 사용
# stratifiedkfold = StratifiedKFold()

In [63]:
for i, (t, v) in enumerate(kfold.split(train_data)):
    
    # train, val 분리
    trn = train_data.iloc[t]
    val = train_data.iloc[v]
    
    # x, y 분리
    x_tr = trn.drop('value', axis=1)
    y_tr = trn['value']
    
    x_val = val.drop('value', axis=1)
    y_val = val['value']
    
    # 모델 학습
    rf = RandomForestRegressor(n_estimators=300, random_state=42)
    rf.fit(x_tr, y_tr)
    
    # 예측
    pred = rf.predict(x_val)
    pred = np.expm1(pred)
    
    y_val = np.expm1(y_val)
    
    mse = mean_squared_error(y_val, pred)
    rmse = np.sqrt(mse)
    
    print(f'{i+1}번 모델 rmse : {rmse}')

1번 모델 rmse : 1540130.3735089768
2번 모델 rmse : 660490.432009333
3번 모델 rmse : 581139.1537062454
4번 모델 rmse : 627107.3483604759
5번 모델 rmse : 717451.5998456837
6번 모델 rmse : 1113042.2976140792
7번 모델 rmse : 513131.6277726523


In [64]:
# GridSearch
rf = RandomForestRegressor(random_state=42)

In [65]:
params = {
    "n_estimators": [300, 400, 500],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 3],
}

In [67]:
cv = KFold(n_splits=3, shuffle=True, random_state=42)

In [68]:
grid = GridSearchCV(rf, 
                    param_grid=params, 
                    cv=cv, 
                    scoring="neg_mean_squared_error", 
                    verbose=2, 
                    n_jobs=-1
                   )

In [75]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   47.1s finished


GridSearchCV(cv=KFold(n_splits=3, random_state=42, shuffle=True),
             estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [300, 400, 500]},
             scoring='neg_mean_squared_error', verbose=2)

In [76]:
# GridSearch 결과
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)
print(grid.best_index_)

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
RandomForestRegressor(n_estimators=500, random_state=42)
-0.008847779615567816
2


In [77]:
RandomForestRegressor(**grid.best_params_)

RandomForestRegressor(n_estimators=500)

In [78]:
grid.best_estimator_.fit(X_train, y_train)

RandomForestRegressor(n_estimators=500, random_state=42)

In [81]:
# 학습 후 예측
grid.best_estimator_.predict(test_data)

array([17.76030434, 18.16135591, 18.01757089, ..., 10.99114453,
       10.71000716, 10.82011917])

In [82]:
# RandomSearch
rf = RandomForestRegressor(random_state=42)

In [84]:
from scipy.stats import randint

In [85]:
params = {
    "n_estimators": randint(100, 600),
    "min_samples_split": randint(1, 8),
    "min_samples_leaf": randint(1, 5),
}

In [86]:
cv = RepeatedKFold(n_splits=3, random_state=42, n_repeats=3)

In [87]:
random_search = RandomizedSearchCV(rf, 
                                  param_distributions=params, 
                                  cv=cv, 
                                  n_iter=20,
                                  scoring='neg_mean_squared_error',
                                  verbose=1,
                                  n_jobs=-1
                                  )

In [88]:
random_search.fit(X_train, y_train)

Fitting 9 folds for each of 20 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.3min finished


RandomizedSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=3, random_state=42),
                   estimator=RandomForestRegressor(random_state=42), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fde03e4aed0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fde03e4a1d0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fde03e60c10>},
                   scoring='neg_mean_squared_error', verbose=1)

In [93]:
print(random_search.best_estimator_)
print(random_search.best_params_)
print(random_search.best_index_)
print(random_search.best_score_)

RandomForestRegressor(min_samples_split=3, n_estimators=382, random_state=42)
{'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 382}
4
-0.009400818254679692


In [94]:
# 모델 앙상블
rf_1 = grid.best_estimator_
rf_2 = random_search.best_estimator_

In [95]:
rf_1.fit(X_train, y_train)
rf_2.fit(X_train, y_train)

RandomForestRegressor(min_samples_split=3, n_estimators=382, random_state=42)

In [96]:
pred_1 = rf_1.predict(test_data)
pred_2 = rf_2.predict(test_data)

In [101]:
pred = (pred_1 * 0.5) + (pred_2 * 0.5)
pred

array([17.77528296, 18.15555852, 18.01957851, ..., 10.99291125,
       10.71435439, 10.82191926])

In [106]:
np.random.seed(42)

In [110]:
# 앙상블 모델에 + bagging
prediction_list = [] # 예측값을 저장한 list
for _ in tqdm(range(10)):
    data_index = [idx for idx in range(X_train.shape[0])] # X_train 내에 있는 index
    random_index = np.random.choice(data_index, X_train.shape[0], replace=True) # 인덱스 복원 추출
    
    # Search하여 찾은 하이퍼 파라미터를 넣어줌
    rf = RandomForestRegressor(**random_search.best_params_)
    rf.fit(X_train.iloc[random_index, ], y_train.iloc[random_index, ])
    
    pred = rf.predict(test_data)
    pred = np.expm1(pred)
    
    prediction_list.append(pred)

100%|██████████| 10/10 [00:53<00:00,  5.30s/it]


In [112]:
prediction_list[0]

array([6.02669847e+07, 7.38100834e+07, 6.97181995e+07, ...,
       5.68506164e+04, 4.22758061e+04, 4.94609573e+04])

In [114]:
prediction = []
for i in range(test_data.shape[0]):
    
    temp = []
    for j in range(len(prediction_list)):
        temp.append(prediction_list[j][i])
    prediction.append(np.mean(temp))

In [116]:
len(prediction)

3828

In [128]:
np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 10, replace=True)

array([5, 2, 5, 8, 1, 2, 2, 2, 5, 1])