In [11]:
import torch
from torch.nn import DataParallel
from torch.utils.data import DataLoader
from torch.optim import AdamW

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
import os
import gc
from tqdm import tqdm
import sys
sys.path.append("../src")
from config import read_config, update_config
from metric import cal_mae_metric
from dataset import read_data
from util import smart_avg

In [2]:
# Load the "Base" experiment configuration through the project helper.
config = read_config("Base")
# Select device index 0 only; this is set before update_config so the helper
# can take it into account (presumably when resolving the device - TODO confirm).
config.gpu = [0]
# Let the project helper finalise the config. The device / output-folder lines
# shown under this cell are presumably printed by these helpers.
config = update_config(config)

Using device: cuda
Number of device: 1
Model Output Folder: /home/vincent/Kaggle/GBVPP/output/Base/


In [12]:
# Per-model test prediction files to ensemble, keyed by experiment name.
# The key doubles as the folder name under ../output/ from which the per-fold
# checkpoints are loaded further down, and as the column prefix in the frames.
subs = {
    "LSTM5_REG_PL": "../output/LSTM5_REG_PL/test_pred_all_18832.csv",
    "LSTM5_CLS_DO02_OP01_PL": "../output/LSTM5_CLS_DO02_OP01_PL/test_pred_all_18722.csv"
}

# Always resolve the ensemble folder under config.output_folder.
# Previously the prefix was only applied when the directory did not exist yet,
# so re-running this cell (or running it with the folder already present) left
# `ensemble_folder` as the bare relative name and later writes went elsewhere.
ensemble_folder = os.path.join(config.output_folder, "ensemble_1030")
# exist_ok=True makes the cell safe to re-run.
os.makedirs(ensemble_folder, exist_ok=True)

In [4]:
# Load the train and test frames through the project helper (imported from
# ../src/dataset.py). Later cells rely on `train` exposing `fold`, `pressure`
# and `u_out`.
train, test = read_data(config)

In [5]:
# Array of reference pressure values; later cells snap every prediction to the
# nearest entry of this array.
# NOTE(review): absolute, machine-specific path - consider deriving it from the
# config so the notebook runs on other machines.
pressure_unique = np.load("/home/vincent/Kaggle/GBVPP/input/pressure_unique.npy")

In [14]:
def snap_to_grid(values, grid):
    """Replace every entry of ``values`` by the nearest entry of ``grid``.

    Vectorised equivalent of
    ``grid[np.abs(grid - x).argmin()]`` applied element-wise, which was
    previously done with a Python-level ``Series.map`` over millions of rows.

    Parameters
    ----------
    values : array-like of numbers (a pandas Series works).
    grid : 1-D array-like of candidate values; it is sorted internally, so the
        caller does not need to pass it sorted.

    Returns
    -------
    numpy.ndarray with the same shape as ``values``. On an exact tie the lower
    grid value is chosen (what ``argmin`` does on an ascending grid). NaN
    inputs stay NaN instead of being silently mapped onto a grid value.

    Raises
    ------
    ValueError if ``grid`` is empty.
    """
    grid = np.sort(np.asarray(grid).ravel())
    if grid.size == 0:
        raise ValueError("grid must contain at least one value")
    values = np.asarray(values)
    if grid.size == 1:
        return np.where(np.isnan(values), np.nan, grid[0])
    # Index of the first grid point >= value, clipped so that both the left
    # and the right neighbour always exist.
    right = np.clip(np.searchsorted(grid, values, side="left"), 1, grid.size - 1)
    left = right - 1
    # `<=` sends ties to the lower neighbour.
    take_left = (values - grid[left]) <= (grid[right] - values)
    snapped = np.where(take_left, grid[left], grid[right])
    return np.where(np.isnan(values), np.nan, snapped)


# Number of cross-validation folds stored per model.
n_folds = 5

# --- Collect out-of-fold (train) and per-fold test predictions of every model ---
sub_all = None
for key, sub_file in tqdm(subs.items()):
    # Float placeholder: the column receives float predictions below, so do not
    # create it as an integer column.
    train[key] = 0.0
    tmp = pd.read_csv(sub_file)
    if sub_all is None:
        sub_all = tmp[["id","pressure"]].copy()
        sub_all["pressure"] = 0
    elif not np.array_equal(tmp["id"].to_numpy(), sub_all["id"].to_numpy()):
        # Fold columns are copied by position, so every file must list the same
        # ids in the same order as the first one.
        raise ValueError(f"{sub_file}: ids do not line up with the first submission file")

    for fold in range(n_folds):
        # Only `valid_preds` is needed from the checkpoint, so load it onto the
        # CPU instead of wherever it was saved from.
        checkpoint = torch.load(f'../output/{key}/Fold_{fold}_best_model.pth', map_location="cpu")
        train.loc[train.query(f"fold=={fold}").index,key] = checkpoint['valid_preds'].flatten()
        sub_all[key+f"_{fold}"] = tmp[f"preds_fold{fold}"]

# --- Cross-validation score of the ensemble on the out-of-fold predictions ---
# smart_avg is a project helper; per its log output it appears to combine mean
# and median depending on inlier/outlier rows.
train["pressure_smart"] = smart_avg(train[list(subs.keys())], axis=1)
train["pressure_smart"] = snap_to_grid(train["pressure_smart"], pressure_unique)
# Third argument is 1 - u_out (presumably the scoring mask/weight - TODO confirm).
CVs = train.groupby("fold").apply(lambda df: cal_mae_metric(df["pressure"],df["pressure_smart"],1-df["u_out"]))
CV = np.mean(CVs)

# --- Per-fold ensembles of the test predictions, one file per fold ---
for fold in tqdm(range(n_folds)):
    sub_keys = [key+f"_{fold}" for key in subs.keys()]
    sub_fold = sub_all[["id"]].copy()
    sub_fold[f'preds_fold{fold}'] = smart_avg(sub_all[sub_keys], axis=1)
    sub_fold[f'preds_fold{fold}'] = snap_to_grid(sub_fold[f'preds_fold{fold}'], pressure_unique)
    sub_fold.to_csv(ensemble_folder + f"/test_fold{fold}.csv",index=False)

print("Ensemble CV Median: Mean {:.4f}, std {:.4f}".format(np.mean(CVs), np.std(CVs)))

# --- Final ensemble over every model x fold column ---
preds_cols = [key+f"_{i}"  for key in subs.keys() for i in range(n_folds)]
sub_all["pressure"] = smart_avg(sub_all[preds_cols], axis=1)
sub_all["pressure"] = snap_to_grid(sub_all["pressure"], pressure_unique)
# Full table with all per-model columns; unlike the other files this one is
# written with the index column (kept as-is for anything that reads it back).
sub_all.to_csv(ensemble_folder + f"/sub_all_{CV*1e5:.0f}.csv")
# Submission file: id + pressure only, tagged with the CV score.
sub_all[["id","pressure"]].to_csv(ensemble_folder + f"/submission_{CV*1e5:.0f}.csv",index=False)
print(sub_all.shape, train.shape)
sub_all

100%|█████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.82s/it]


Inliers:  2193854 -> compute mean
Outliers: 3842146 -> compute median
Total:    6036000


  0%|                                                                                     | 0/5 [00:00<?, ?it/s]

Inliers:  1504614 -> compute mean
Outliers: 2519386 -> compute median
Total:    4024000


 20%|███████████████▍                                                             | 1/5 [00:17<01:08, 17.02s/it]

Inliers:  1511843 -> compute mean
Outliers: 2512157 -> compute median
Total:    4024000


 40%|██████████████████████████████▊                                              | 2/5 [00:34<00:51, 17.15s/it]

Inliers:  1498497 -> compute mean
Outliers: 2525503 -> compute median
Total:    4024000


 60%|██████████████████████████████████████████████▏                              | 3/5 [00:51<00:34, 17.11s/it]

Inliers:  1508946 -> compute mean
Outliers: 2515054 -> compute median
Total:    4024000


 80%|█████████████████████████████████████████████████████████████▌               | 4/5 [01:08<00:17, 17.18s/it]

Inliers:  1502129 -> compute mean
Outliers: 2521871 -> compute median
Total:    4024000


100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [01:25<00:00, 17.17s/it]


Ensemble CV Median: Mean 0.1773, std 0.0016
Inliers:  1189049 -> compute mean
Outliers: 2834951 -> compute median
Total:    4024000
(4024000, 12) (6036000, 12)


Unnamed: 0,id,pressure,LSTM5_REG_PL_0,LSTM5_REG_PL_1,LSTM5_REG_PL_2,LSTM5_REG_PL_3,LSTM5_REG_PL_4,LSTM5_CLS_DO02_OP01_PL_0,LSTM5_CLS_DO02_OP01_PL_1,LSTM5_CLS_DO02_OP01_PL_2,LSTM5_CLS_DO02_OP01_PL_3,LSTM5_CLS_DO02_OP01_PL_4
0,1,6.259305,6.189002,6.189002,6.189002,6.329606,6.259304,6.259305,6.259305,6.259305,6.259305,6.259305
1,2,5.978096,5.907794,5.978096,6.048398,5.978096,6.048398,5.978096,5.978096,5.907794,5.978096,5.978096
2,3,7.102930,7.102930,7.173232,7.173232,7.102930,7.102930,7.102930,7.102930,7.173232,7.173232,7.032628
3,4,7.665347,7.595045,7.665347,7.595045,7.665347,7.665347,7.665347,7.665347,7.665347,7.665347,7.595045
4,5,9.141693,9.141692,9.141692,9.141692,9.141692,9.071390,9.141693,9.141693,9.071390,9.141693,9.141693
...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,4023996,5.275075,0.002414,0.002414,0.002414,0.002414,0.002414,21.233662,10.547735,14.695562,18.140367,15.257979
4023996,4023997,5.275075,0.002414,0.002414,0.002414,0.002414,0.002414,21.163359,10.547735,14.695562,18.070065,15.398583
4023997,4023998,5.345377,0.002414,0.002414,0.002414,0.002414,0.002414,21.163359,10.758642,14.695562,18.140367,15.398583
4023998,4023999,5.275075,0.002414,0.002414,0.002414,0.002414,0.002414,21.163359,10.547735,14.484656,18.702784,15.328281


In [15]:
# Load a previously produced submission file; its `pressure` column is averaged
# with the ensemble prediction in the next cell.
sub_pub = pd.read_csv("../output/submission_1362.csv")

In [16]:
# Blend the ensemble with the public submission (plain 50/50 average) and snap
# the result back onto the reference pressure values.

# The two frames are added row by row, so make sure they describe the same
# rows; a mismatch would otherwise turn into NaN and be silently snapped.
if len(sub_pub) != len(sub_all):
    raise ValueError(f"sub_pub has {len(sub_pub)} rows but sub_all has {len(sub_all)}")
if "id" in sub_pub.columns and not np.array_equal(sub_pub["id"].to_numpy(), sub_all["id"].to_numpy()):
    raise ValueError("sub_pub ids do not line up with sub_all ids")

# Keep the un-blended ensemble in its own column so that re-running this cell
# always blends from the original prediction instead of blending twice.
if "pressure_ensemble" not in sub_all.columns:
    sub_all["pressure_ensemble"] = sub_all["pressure"]

sub_all["pressure"] = ((sub_all["pressure_ensemble"] + sub_pub["pressure"])/2).map(lambda x: pressure_unique[np.abs(pressure_unique-x).argmin()])

In [17]:
# Write the current `id` / `pressure` columns as a submission file whose name
# carries the CV score (scaled by 1e5) and a "_with_pub" suffix.
with_pub_path = ensemble_folder + f"/submission_{CV*1e5:.0f}_with_pub.csv"
submission_with_pub = sub_all[["id","pressure"]]
submission_with_pub.to_csv(with_pub_path, index=False)