## Thank [@ehekatlact](https://www.kaggle.com/ehekatlact) for his excellent work.
## Most of this code is derived from his efforts.
## Take closely look at his code, [**"TPS2206 The na count of each record is critical!"**](https://www.kaggle.com/code/ehekatlact/tps2206-the-na-count-of-each-record-is-critical) and upvote.
---

motivation

I found that there is a significant difference in the prediction results between columns with only one na and columns with two or more na, using the previous method.

It seems that multiple columns of na filled with mean, etc. are interacting with each other and adversely affecting the results.

Predicting by how many na columns there are improves the score.

In [1]:
class CFG:
    num_workers = 2  # colabは4, kaggleは2?
    weight_decay = 0
    print_epoch_freq = 10
    max_epochs = 300
    max_batch_size = 2048
    hidden_size = 128
    lr = 2e-3
    min_lr = 1e-6
    patience = 10
    lr_patience = 3
    factor = 0.5
    seed = 42
    train = False
    debug = False

if CFG.debug:
    CFG.max_epochs=1
    na_col_list = na_col_list[:1]

In [2]:
# For Google Colab
"""
from google.colab import drive
drive.mount('/content/drive')

# Install kaggle packages
!pip install -q kaggle
!pip install -q kaggle-cli

# Lib
from google.colab import files

# Please Upload `kaggle.json` file
uploaded = files.upload()

# Then copy kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

!kaggle competitions download -c tabular-playground-series-jun-2022
!unzip -o tabular-playground-series-jun-2022.zip -d tabular-playground-series-jun-2022
!kaggle kernels output oxzplvifi/tps2206-gbm-resnet-imputation -p ./DataSet
"""

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n\n# Install kaggle packages\n!pip install -q kaggle\n!pip install -q kaggle-cli\n\n# Lib\nfrom google.colab import files\n\n# Please Upload `kaggle.json` file\nuploaded = files.upload()\n\n# Then copy kaggle.json into the folder where the API expects to find it.\n!mkdir -p ~/.kaggle\n!cp kaggle.json ~/.kaggle/\n!chmod 600 ~/.kaggle/kaggle.json\n!ls ~/.kaggle\n\n!kaggle competitions download -c tabular-playground-series-jun-2022\n!unzip -o tabular-playground-series-jun-2022.zip -d tabular-playground-series-jun-2022\n!kaggle kernels output oxzplvifi/tps2206-gbm-resnet-imputation -p ./DataSet\n"

In [3]:
%%capture

if CFG.train:
    !pip install wandb
!pip install pytorch_lightning

In [4]:
if CFG.train:
    import wandb
    try:
        # add-ons -> secrets -> set your wandb api key
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

In [5]:
if CFG.train:
    import os
    wandb.init(project="tps2206")

### Lib

In [6]:
# common
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import time, gc, string, math
from tqdm.notebook import tqdm
import warnings
import shutil
from collections import defaultdict
import heapq
import datetime
import random
from collections import OrderedDict
import glob
import copy

# sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim import lr_scheduler

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger


In [7]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [8]:
# for google colab
# os.chdir("/content/drive/MyDrive/colab_data/TPS2206")

In [9]:
"""
os.makedirs('model', exist_ok=True)
shutil.rmtree('./model/')
os.makedirs('model', exist_ok=True)
"""

"\nos.makedirs('model', exist_ok=True)\nshutil.rmtree('./model/')\nos.makedirs('model', exist_ok=True)\n"

In [10]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    pl.utilities.seed.seed_everything(seed, workers=True)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(CFG.seed)

### Read DF

In [11]:
data = pd.read_csv('../input/tabular-playground-series-jun-2022/data.csv', index_col='row_id')
sub = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv', index_col='row-col')

In [12]:
for col in data.columns:
    if "F_4" not in col:
        data[col] = data[col].fillna(data[col].mean())

In [13]:
na_col_list = []
for col in data.columns:
    if data[col].isna().sum() != 0:
        na_col_list.append(col)

In [14]:
f4data = data[na_col_list]

In [15]:
na_index_of = {}
no_na_index_of = {}
for col in na_col_list:
    na_index = f4data[f4data[col].isna() == True].index
    na_index_of[col] = na_index
    no_na_index = f4data[f4data[col].isna() == False].index
    no_na_index_of[col] = no_na_index

In [16]:
na_cnt = pd.DataFrame(f4data.isna().sum(axis=1))

In [17]:
na_cnt.groupby([0]).size()

0
0    759268
1    211342
2     27127
3      2124
4       135
5         4
dtype: int64

In [18]:
na_cnt_max = 5
na_cnt_index_of = {}
for cnt in range(0, na_cnt_max+1):
    na_cnt_index = na_cnt[na_cnt[0] == cnt].index
    na_cnt_index_of[cnt] = na_cnt_index

In [19]:
def na_no_na_index_of(col, cnt):
    na_index = na_index_of[col]
    no_na_index = no_na_index_of[col]
    na_cnt_index = na_cnt_index_of[cnt]
    na_index = na_index.intersection(na_cnt_index)
    no_na_index = no_na_index.intersection(na_cnt_index)
    return na_index, no_na_index

In [20]:
na_no_na_index_of('F_4_0', 1)

(Int64Index([   105,    119,    182,    287,    517,    525,    550,    561,
                632,    640,
             ...
             999358, 999371, 999373, 999435, 999533, 999568, 999579, 999757,
             999883, 999905],
            dtype='int64', name='row_id', length=14175),
 Int64Index([     2,      4,     10,     12,     14,     24,     27,     28,
                 45,     50,
             ...
             999950, 999954, 999971, 999978, 999983, 999987, 999990, 999993,
             999994, 999997],
            dtype='int64', name='row_id', length=197167))

In [21]:
f4data = f4data.fillna(-1)

## Pytorch

### DataSet and DataLoader

In [22]:
class TrainDataset(Dataset):
    def __init__(self, X, y, na_num):
        self.X = X
        self.y = y
        self.na_num = na_num
        self.index_end = X.shape[1]
        self.index_list = [i for i in range(self.index_end)]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        if self.na_num != 0:
            X = self.X[item]
            # must use torch.rand* . np.random cause same seed with gpu.
            na_index_list = torch.randperm(self.index_end)[:self.na_num].tolist()
            for na_index in na_index_list:
                X[na_index] = -1
        else:
            X = self.X[item]
        inputs = torch.tensor(X, dtype=torch.float32)
        outputs = torch.tensor(self.y[item], dtype=torch.float32)

        return inputs, outputs

In [23]:
class DataModule(pl.LightningDataModule):
    # train, val, testの3つのDataLoaderを定義する
    # trainerにこれを渡すと、train, val, testのそれぞれのステップでこれを渡してくれる
    def __init__(self, X_train, y_train, X_valid, y_valid, X_test, na_num, batch_size):
        self.X_train = X_train.values
        self.y_train = y_train.values
        self.X_valid = X_valid.values
        self.y_valid = y_valid.values
        self.X_test = X_test.values
        self.y_test = np.zeros(X_test.shape[0])
        self.na_num = na_num
        self.batch_size = batch_size
        self._log_hyperparams = None  # ナニコレ・・・

    def train_dataloader(self):
        ds = TrainDataset(self.X_train, self.y_train, self.na_num)
        dl = DataLoader(ds, batch_size=self.batch_size, shuffle=True, pin_memory=True, drop_last=True, num_workers=CFG.num_workers, persistent_workers=False)
        return dl

    def val_dataloader(self):
        ds = TrainDataset(self.X_valid, self.y_valid, self.na_num)
        dl = DataLoader(ds, batch_size=self.batch_size, shuffle=False, pin_memory=True, drop_last=False, num_workers=CFG.num_workers, persistent_workers=False)
        return dl

    def predict_dataloader(self):
        ds = TrainDataset(self.X_test, self.y_test, 0)  # when predict, already fill -1
        dl = DataLoader(ds, batch_size=self.batch_size, shuffle=False, pin_memory=True, drop_last=False, num_workers=CFG.num_workers, persistent_workers=False)
        return dl

    def prepare_data_per_node(self):
        # TODO 本来要らないはずなんだけど・・・
        pass

    def teardown(self, stage=None):
        torch.cuda.empty_cache()  # TODO: これであってるのか不明　何も出てこないんだよね
        gc.collect()

### Pytorch Model

In [24]:
class DNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        hidden_size = hidden_size
        output_size = 1
        emb_dim = 8
        self.fc1 = nn.Linear(input_size, hidden_size*4)
        self.bn1 = nn.BatchNorm1d(hidden_size*4)
        self.fc2 = nn.Linear(hidden_size*4, hidden_size*4)
        self.fc3 = nn.Linear(hidden_size*4, hidden_size*2)
        self.fc4 = nn.Linear(hidden_size*2, hidden_size)
        self.fc5 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # dropoutとbnの併用禁止
        # bnは活性化関数の前に
        x = F.mish(self.bn1(self.fc1(x)))
        x = F.mish(self.fc2(x))
        x = F.mish(self.fc3(x))
        x = F.mish(self.fc4(x))
        x = self.fc5(x)
        return x

In [25]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat,y))

In [26]:
class NNModel(pl.LightningModule):
    # https://pytorch-lightning.readthedocs.io/en/stable/notebooks/lightning_examples/basic-gan.html
    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.criterion = RMSELoss()
        self.lr = CFG.lr

    def forward(self, x) -> torch.Tensor:
        return self.model(x)

    # Setup Optimizer and Scheduler
    def configure_optimizers(self):
        model_params = [p for n, p in self.model.named_parameters()]
        optimizer_params = [
            {"params":  model_params,
             "weight_decay": CFG.weight_decay,
             "lr": CFG.lr
            },
        ]

        optimizer = optim.Adam(optimizer_params)

        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                'min',
                                                patience=CFG.lr_patience,
                                                factor=CFG.factor
                                                )
        interval = "epoch"
        monitor = "valid_avg_loss"

        return [optimizer], [{"scheduler": scheduler, "interval": interval, "monitor": monitor}]

    # training valid test steps
    def training_step(self, batch_data, batch_idx):
        # batch_data: DataModuleで定義したtrain_dataloaderの結果
        # 戻値: lossであることが必須(裏でoptimizerに渡すため)
        X, y = batch_data
        op = self(X).squeeze()
        loss = self.criterion(op, y)
        return loss

    def training_epoch_end(self, outputs):
        # 1epoch分の処理(全バッチの処理)のreturn値をlistで受け取る
        loss_list = [x['loss'] for x in outputs]
        avg_loss = torch.stack(loss_list).mean()
        self.log('train_avg_loss', avg_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "train_avg_loss:", avg_loss.item())

    def validation_step(self, batch_data, batch_idx):
        # 戻値: 任意の辞書
        X, y = batch_data
        op = self(X).squeeze()
        loss = self.criterion(op, y)
        return {'valid_loss': loss}

    def validation_epoch_end(self, outputs):
        loss_list = [x['valid_loss'] for x in outputs]
        avg_loss = torch.stack(loss_list).mean()
        self.log('valid_avg_loss', avg_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "valid_avg_loss:", avg_loss.item())
        return avg_loss

    def predict_step(self, batch_data, batch_idx):
        # 実際に予測させるときに使う
        X, _ = batch_data
        outputs = self(X).squeeze()
        # criterionがwithLogit系の場合は、sigmoidを追加する。
        # outputs = torch.sigmoid(outputs)
        return outputs

In [27]:
checkpoint_path_of = defaultdict(str)

In [28]:
if CFG.train:
    model_name_prefix = datetime.datetime.now().strftime('%m%d%H%M%S')
else:
    model_name_prefix = "0616004555"

for cnt in range(0, na_cnt_max):  # uso cnt+1
    print("="*10, "na_cnt {}/{}".format(cnt+1, na_cnt_max), "="*10)
    result_f4data = copy.deepcopy(f4data)
    for col in na_col_list:
        print("="*10, col, "="*10)
        # split data
        # train cnt == 0 then test cnt == 1
        _, no_na_index = na_no_na_index_of(col, 0)  # select non na records.
        train = f4data.loc[no_na_index]
        na_index, _ = na_no_na_index_of(col, cnt+1)
        if len(na_index) == 0:
            break
        test = f4data.loc[na_index]
        X = train.drop(col, axis=1)
        y = train[col]
        X_test = test.drop(col, axis=1)
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=42)

        # data module
        batch_size = min(CFG.max_batch_size, (len(X_train)+100-1)//100)  # len(X) == batch then raise errer.
        print("batch_size :{}".format(batch_size))
        if CFG.train:
            dm = DataModule(X_train, y_train, X_valid, y_valid, X_test, cnt, batch_size)

        # create model
        cur_model_name = "model" + model_name_prefix+"_" + col+ "_" + str(cnt)
        dirpath = "../input/tps-jun-2022-model/"
        if cnt > 0:
            CFG.hidden_size = 256
        else:
            CFG.hidden_size = 128            
        dnn = DNN(X_train.shape[1], CFG.hidden_size)
        model = NNModel(dnn)

        # train
        if CFG.train:
            logger = WandbLogger(project="tps2206")
            logger.log_hyperparams(CFG.__dict__)
            callbacks = [
                        pl.callbacks.EarlyStopping('valid_avg_loss', patience=CFG.patience),  # validation_epoch_endの戻値が10ターン改善がなかったら打ち止め
                        pl.callbacks.ModelCheckpoint(dirpath="./model/", filename=cur_model_name, save_top_k=1, monitor="valid_avg_loss", save_weights_only=False),  # model保存の設定
                        pl.callbacks.LearningRateMonitor(),  # ログに学習率を吐き出す設定
            ]
            trainer = pl.Trainer(accelerator="auto", devices="auto", max_epochs=CFG.max_epochs, logger=logger, callbacks=callbacks, enable_progress_bar=False)
            trainer.fit(model, datamodule=dm)
            wandb.finish()
        else:
            trainer = pl.Trainer(accelerator="auto", devices="auto", max_epochs=CFG.max_epochs, logger=None, callbacks=None, enable_progress_bar=False)

        # load_best_model
        checkpoint_path = glob.glob(dirpath+cur_model_name+"*.ckpt")[0]
        model.load_from_checkpoint(checkpoint_path, model=dnn)
        checkpoint_path_of[cur_model_name] = checkpoint_path

        # predict
        dm = DataModule(X_train, y_train, X_valid, y_valid, X_test, cnt, batch_size)
        results = trainer.predict(model=model, datamodule=dm)
        preds = []
        for batch in results:
            preds.append(batch)
        outputs = torch.cat(preds, dim=0)

        # write result
        result_f4data.loc[na_index, col] = outputs.tolist()
        display(result_f4data.loc[na_index, col].head())

        torch.cuda.empty_cache()
        gc.collect()
    f4data = result_f4data
    f4data.to_pickle("f4data_{}.pkl".format(cnt))

batch_size :2048


row_id
105   -1.634373
119    1.013145
182    0.881204
287   -0.491367
517    1.575665
Name: F_4_0, dtype: float64

batch_size :2048


row_id
24    -5.423626
132   -2.428746
145    1.676639
201   -3.882275
225    1.035022
Name: F_4_1, dtype: float64

batch_size :2048


row_id
2      0.381250
104    1.325681
198    0.267628
253    0.116166
347   -0.784562
Name: F_4_2, dtype: float64

batch_size :2048


row_id
14     0.127462
27    -0.755440
28     0.171232
290    0.806800
316   -0.472985
Name: F_4_3, dtype: float64

batch_size :2048


row_id
4     -0.412597
72    -0.041183
77     0.245139
118   -1.505598
187    1.646061
Name: F_4_4, dtype: float64

batch_size :2048


row_id
50    -0.360714
75     2.290415
166   -0.950644
189   -4.306607
293   -1.686054
Name: F_4_5, dtype: float64

batch_size :2048


row_id
168   -0.245529
244   -1.490716
272   -4.087219
277    0.794757
309   -1.365842
Name: F_4_6, dtype: float64

batch_size :2048


row_id
178   -0.845200
228    1.442454
419   -5.153642
444    1.454614
664   -0.075948
Name: F_4_7, dtype: float64

batch_size :2048


row_id
45     0.371692
58     1.391806
82     0.802778
179   -0.387944
192    0.225518
Name: F_4_8, dtype: float64

batch_size :2048


row_id
54     0.367022
96    -1.877840
157    1.069459
170   -0.726467
203    0.301677
Name: F_4_9, dtype: float64

batch_size :2048


row_id
80     0.179157
84    -0.087247
338    0.900430
396    0.651810
404   -0.343128
Name: F_4_10, dtype: float64

batch_size :2048


row_id
239    11.862947
255    -1.243334
280    -5.892267
363    -2.739441
401    -2.183784
Name: F_4_11, dtype: float64

batch_size :2048


row_id
10     2.470927
57    -0.913595
88     1.357721
181   -0.479691
232    3.157483
Name: F_4_12, dtype: float64

batch_size :2048


row_id
177   -1.315935
208    0.965489
234   -0.666452
562    0.143662
586   -0.544107
Name: F_4_13, dtype: float64

batch_size :2048


row_id
12     0.672088
233    0.719558
336   -0.093349
446    0.936427
508    0.554327
Name: F_4_14, dtype: float64

batch_size :2048


row_id
205     1.699801
224     2.405856
390     2.856384
1003    3.684433
2130    1.310680
Name: F_4_0, dtype: float64

batch_size :2048


row_id
184    -3.961794
918    -4.649046
2344   -0.050346
2400   -0.666612
3066    2.458075
Name: F_4_1, dtype: float64

batch_size :2048


row_id
61      0.918394
994     0.790222
1003   -0.358693
1086   -0.232820
1283   -1.386620
Name: F_4_2, dtype: float64

batch_size :2048


row_id
7      -0.326008
99     -0.560844
945     0.320521
1144   -0.102726
1289   -0.912238
Name: F_4_3, dtype: float64

batch_size :2048


row_id
48      0.213659
205    -2.196002
283    -0.200202
1238    0.066545
1921    4.392175
Name: F_4_4, dtype: float64

batch_size :2048


row_id
740    -0.832226
884     0.719931
982     2.870744
1325    1.971923
1392    3.136975
Name: F_4_5, dtype: float64

batch_size :2048


row_id
886     3.060002
945     0.704724
1035    0.459297
1352   -0.334399
2000   -1.560851
Name: F_4_6, dtype: float64

batch_size :2048


row_id
136     1.345287
503    -6.082983
1312    0.582583
2002    1.548453
2293    1.866508
Name: F_4_7, dtype: float64

batch_size :2048


row_id
21     -0.365073
36     -1.269623
224    -0.100778
285    -0.849421
1356   -0.044344
Name: F_4_8, dtype: float64

batch_size :2048


row_id
136   -0.770769
465    0.649954
503   -1.053682
740   -0.153134
982   -0.356774
Name: F_4_9, dtype: float64

batch_size :2048


row_id
283    0.094229
465    1.518900
644   -0.776624
886    0.841743
988    0.279197
Name: F_4_10, dtype: float64

batch_size :2048


row_id
285     9.544481
390    -2.200431
644    -2.844061
1040   -1.651034
1144   -1.682082
Name: F_4_11, dtype: float64

batch_size :2048


row_id
21     -2.553397
988     2.077992
1028    5.061216
1035   -1.619457
1086   -1.348146
Name: F_4_12, dtype: float64

batch_size :2048


row_id
61    -0.084962
99     3.645552
184    0.772728
884    3.009645
918   -1.550384
Name: F_4_13, dtype: float64

batch_size :2048


row_id
7       1.523982
36      0.821785
48      0.964082
1040    0.210350
1325   -1.430433
Name: F_4_14, dtype: float64

batch_size :2048


row_id
5852   -1.004812
6714    0.338566
7704    2.433562
8462    2.282016
8724   -1.023993
Name: F_4_0, dtype: float64

batch_size :2048


row_id
2137   -0.873558
4097   -0.874434
4494    1.958435
4583    1.225716
5722   -3.515440
Name: F_4_1, dtype: float64

batch_size :2048


row_id
1916    -0.379457
2215    -0.221842
4162     0.002437
5630     0.214172
10750    1.317306
Name: F_4_2, dtype: float64

batch_size :2048


row_id
5722   -0.740589
6033    1.251220
7704   -0.269593
8462    1.632451
8548    0.111187
Name: F_4_3, dtype: float64

batch_size :2048


row_id
1916    -0.081269
5269    -1.049717
6714     4.289262
11296    1.394837
14100    1.413940
Name: F_4_4, dtype: float64

batch_size :2048


row_id
1916    1.984763
4097   -0.648068
4319    1.627416
4583    0.848341
5269   -4.558091
Name: F_4_5, dtype: float64

batch_size :2048


row_id
2137     3.731117
2396    -0.286204
6033     3.854873
8548     3.865636
10861    2.642621
Name: F_4_6, dtype: float64

batch_size :2048


row_id
4494     1.236741
5852    -0.015690
9220     1.531490
13929   -2.057395
14753   -1.335544
Name: F_4_7, dtype: float64

batch_size :2048


row_id
4319     0.187782
5269    -0.301321
8724    -1.668993
13454   -1.556348
15985   -0.374573
Name: F_4_8, dtype: float64

batch_size :2048


row_id
2215   -0.512768
3080   -0.562065
4162    1.394209
5630   -0.737302
6033    0.626068
Name: F_4_9, dtype: float64

batch_size :2048


row_id
2396     0.730935
9220    -0.164271
10096    0.066877
10861    0.535830
14753   -0.096397
Name: F_4_10, dtype: float64

batch_size :2048


row_id
3080    -3.495719
4494    -3.028877
4583     2.056704
5630     0.611644
11623   -2.551460
Name: F_4_11, dtype: float64

batch_size :2048


row_id
2137    2.407748
2396    1.236222
5722   -1.463466
5852    3.795024
8462    0.581439
Name: F_4_12, dtype: float64

batch_size :2048


row_id
3080    2.341469
4097   -2.295514
4162    2.714059
4319   -1.469319
8548    1.661139
Name: F_4_13, dtype: float64

batch_size :2048


row_id
2215     0.267021
8724     0.037421
8816    -0.426575
10096   -0.013701
10750    0.017221
Name: F_4_14, dtype: float64

batch_size :2048


row_id
35966    1.149127
41668   -1.692091
80447    1.569456
94371    1.174505
94997   -0.486701
Name: F_4_0, dtype: float64

batch_size :2048


row_id
10837    -1.669125
63138     0.553326
140345    1.899366
232776   -2.878411
239705    1.018400
Name: F_4_1, dtype: float64

batch_size :2048


row_id
19917   -0.061364
35966   -0.075576
41668   -0.243006
74704    1.084618
80447   -0.795189
Name: F_4_2, dtype: float64

batch_size :2048


row_id
63138    -1.295143
156843   -0.476723
162191    0.410862
171791   -0.128647
192112   -0.330265
Name: F_4_3, dtype: float64

batch_size :2048


row_id
33176     1.717199
94997    -1.316978
215436   -4.453103
221703   -4.003464
232776   -2.574420
Name: F_4_4, dtype: float64

batch_size :2048


row_id
10837     0.615663
41668     0.894099
74704     0.090591
80701     2.026424
113659    2.659568
Name: F_4_5, dtype: float64

batch_size :2048


row_id
80447    -0.206956
80701     2.162738
99587     1.515291
139774   -2.894595
140345   -1.408505
Name: F_4_6, dtype: float64

batch_size :2048


row_id
22717     2.177988
33176     3.830238
80701     0.082067
193358   -0.838098
205726   -0.915739
Name: F_4_7, dtype: float64

batch_size :2048


row_id
10837     0.041000
80701    -1.807268
97865    -0.037943
167485   -0.404568
171791    0.776974
Name: F_4_8, dtype: float64

batch_size :2048


row_id
33176    0.688659
35966    0.793801
63138   -0.652426
74704   -1.028205
94371   -0.686912
Name: F_4_9, dtype: float64

batch_size :2048


row_id
19917    -0.053499
22717     0.634411
33176     0.675496
99587    -0.172389
139774    0.471360
Name: F_4_10, dtype: float64

batch_size :2048


row_id
19917    -1.761051
41668     5.426489
80447    -0.848825
166501    6.210108
239705    1.614492
Name: F_4_11, dtype: float64

batch_size :2048


row_id
10837     3.107306
22717    -1.162996
35966     1.884212
97865     1.923842
113659    0.894861
Name: F_4_12, dtype: float64

batch_size :2048


row_id
74704     1.357410
94371    -2.694349
156843    0.584138
162191    1.611386
171791    0.208974
Name: F_4_13, dtype: float64

batch_size :2048


row_id
19917    0.403600
22717   -0.235522
63138   -0.051665
94371   -0.022698
94997    0.689648
Name: F_4_14, dtype: float64



In [29]:
data.loc[:, f4data.columns] = f4data

In [30]:
data.head()

Unnamed: 0_level_0,F_1_0,F_1_1,F_1_2,F_1_3,F_1_4,F_1_5,F_1_6,F_1_7,F_1_8,F_1_9,F_1_10,F_1_11,F_1_12,F_1_13,F_1_14,F_2_0,F_2_1,F_2_2,F_2_3,F_2_4,F_2_5,F_2_6,F_2_7,F_2_8,F_2_9,F_2_10,F_2_11,F_2_12,F_2_13,F_2_14,F_2_15,F_2_16,F_2_17,F_2_18,F_2_19,F_2_20,F_2_21,F_2_22,F_2_23,F_2_24,F_3_0,F_3_1,F_3_2,F_3_3,F_3_4,F_3_5,F_3_6,F_3_7,F_3_8,F_3_9,F_3_10,F_3_11,F_3_12,F_3_13,F_3_14,F_3_15,F_3_16,F_3_17,F_3_18,F_3_19,F_3_20,F_3_21,F_3_22,F_3_23,F_3_24,F_4_0,F_4_1,F_4_2,F_4_3,F_4_4,F_4_5,F_4_6,F_4_7,F_4_8,F_4_9,F_4_10,F_4_11,F_4_12,F_4_13,F_4_14
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
0,-0.354591,-0.464038,2.304115,0.734486,1.696395,0.136285,-0.518344,0.50264,-1.852504,-0.500665,-1.416075,1.201521,0.551902,-0.759827,-0.000905,2,2,0,3,2,1,1,3,2,2,3,3,2,5,4,1,0,1,0,2,1,2,0,1,2,-0.240522,0.061529,0.56109,-0.171943,-0.431996,0.473508,0.596924,0.819306,1.479061,1.264616,-1.116881,0.759443,-0.086915,-0.620685,0.057216,1.07638,-0.780608,-1.940907,-0.717021,0.599093,0.498347,0.11877,-0.228913,0.000365,0.30161,5.547214,1.066871,-0.134313,-0.10104,-0.660871,3.744152,0.794438,0.265185,-0.561809,0.19648,0.373434,6.206995,3.809505,1.236486,1.182055
1,1.38094,-0.499626,-0.418548,1.911725,-0.82613,-1.715371,-0.577091,-1.041486,0.596067,-0.363425,-0.85363,0.674525,0.843058,-0.041438,0.259496,3,5,2,1,4,2,3,2,0,0,9,3,1,4,1,2,1,0,1,1,1,1,3,2,5,-0.446068,1.433358,0.040099,-1.994062,-0.602324,-0.611391,-1.151884,0.065485,0.352023,-0.843751,1.167272,0.921445,-0.839827,0.759015,-1.547387,0.720435,-0.944045,1.796462,-1.046357,-0.581515,0.704543,0.375222,0.705963,0.032771,-0.000817,-1.707374,-1.188114,-0.562419,-1.462988,1.290672,-2.895826,-0.738275,2.361818,-0.060753,0.727249,-0.271882,5.232157,-4.218259,-2.724883,-0.063775
2,0.256023,-1.059874,0.000551,0.345678,1.513814,1.243864,-0.509648,-0.800481,-0.115945,0.595777,-0.073235,-1.381605,-0.108676,0.703693,-0.464042,3,2,1,3,2,1,0,6,1,1,3,4,1,2,2,1,2,2,4,8,1,5,1,2,4,-0.770592,0.483139,-0.636484,-1.305018,-2.089889,0.276761,-1.20864,-0.855769,0.232363,0.215841,-1.031405,0.582437,-0.314639,-0.497409,0.489356,0.915049,-0.51341,0.904206,-0.056089,0.212927,-0.574126,-1.517749,-0.888472,0.142264,1.000822,1.914908,3.877128,0.38125,0.358635,0.443973,2.252834,0.472496,2.491386,0.353381,-0.260682,-0.000833,-0.116457,-2.131747,3.661499,-0.131576
3,-0.72842,-2.432399,-2.453602,-0.020509,0.333397,0.086049,-1.787601,0.667011,0.761564,-2.217847,-0.618973,0.742112,0.494157,0.744673,-0.769181,2,3,2,1,1,1,2,0,6,2,6,2,1,1,2,3,1,2,1,2,6,2,1,0,2,-0.69102,-1.003026,0.868989,0.46492,0.00129,0.499838,0.203723,-0.451576,-1.233499,0.903493,1.144558,-1.479893,-0.414316,0.02482,-0.976814,0.176633,-0.940022,-1.918049,1.506448,0.604388,0.57825,-0.122676,-0.711811,0.228523,0.999599,-2.638262,0.546676,0.8654,-0.857077,2.667105,2.0046,-4.664806,-0.847211,-0.264249,0.664334,-0.557868,8.499483,-4.738799,-3.054611,0.494152
4,0.590212,-0.066127,0.468009,-1.096038,0.119399,-1.80971,0.466358,-0.053196,-0.58032,-1.1435,1.338692,1.19333,1.03801,-0.763692,0.513466,5,0,1,3,3,0,2,5,1,3,5,5,4,4,2,0,2,3,2,5,2,2,1,0,4,1.196402,1.076537,0.486482,0.424305,0.475876,0.426978,-1.668823,-0.843056,1.00898,-0.704152,-0.894834,1.491608,-0.149938,1.25633,2.427308,0.878677,0.594314,0.380175,0.666499,-0.664403,1.276316,-0.059327,-1.276574,-0.768874,1.852815,-0.230342,-0.459019,1.128705,-0.748683,-0.412597,0.976937,2.558883,3.377724,0.846891,0.696032,0.554121,-5.979714,-2.869631,3.733057,-0.722943


In [31]:
ind_list = []
val_list = []
for i in tqdm(sub.index):
    row = int(i.split('-')[0])
    col = i.split('-')[1]
    val = data[col][row]
    ind_list.append(i)
    val_list.append(val)

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [32]:
sub['value'].loc[ind_list] = val_list

In [33]:
sub.to_csv("submission.csv", index=True)
sub

Unnamed: 0_level_0,value
row-col,Unnamed: 1_level_1
0-F_1_14,-0.000905
0-F_3_23,0.000365
1-F_3_24,-0.000817
2-F_1_2,0.000551
2-F_4_2,0.381250
...,...
999993-F_4_2,-0.114472
999994-F_3_10,0.001706
999994-F_4_9,-0.115102
999997-F_3_14,0.000727


TODO: fine tune版も試してみる。案外悪くないかも・・・？