In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that stretches the notebook's `.container` element to the
# full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedGroupKFold,StratifiedKFold

In [2]:
import os

# Folder holding the competition CSV files. The KAGGLE_DATA_FOLDER environment
# variable overrides it, so the notebook can run on another machine without
# editing this cell; the original location stays as the default.
kaggle_data_folder = (
    os.environ.get("KAGGLE_DATA_FOLDER")
    or "/home/vincent/Kaggle/data/ventilator-pressure-prediction"
)
# Folder (relative to the notebook) where generated artefacts are written.
output_folder = "../output/"

In [3]:
# Read the training table from the configured data folder.
train_csv_path = f"{kaggle_data_folder}/train.csv"
train = pd.read_csv(train_csv_path)

In [4]:
# Random seed; passed as `random_state` to the StratifiedKFold splitter below.
seed = 48

In [6]:
# Collapse `train` to one row per breath_id, keeping the last R and C value
# seen in each breath, with breath_id restored as an ordinary column.
breath_groups = train.groupby("breath_id")
RC_train = breath_groups[["R", "C"]].last().reset_index()

# Combined "<R>_<C>" string label (e.g. "20_50"), used as the stratification key.
r_label = RC_train["R"].astype(str)
c_label = RC_train["C"].astype(str)
RC_train["R_C"] = r_label + "_" + c_label

In [7]:
# Show the per-breath table (breath_id, R, C and the combined R_C label).
RC_train

Unnamed: 0,breath_id,R,C,R_C
0,1,20,50,20_50
1,2,20,20,20_20
2,3,50,20,50_20
3,4,50,50,50_50
4,5,5,50,5_50
...,...,...,...,...
75445,125740,50,50,50_50
75446,125742,20,10,20_10
75447,125743,20,10,20_10
75448,125745,50,50,50_50


In [8]:
# 5-fold stratified split of the per-breath table, stratified on the combined
# R_C label and shuffled with the notebook-wide seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
# `cv.split` yields its (train_idx, valid_idx) pairs lazily and can only be
# consumed once. Materialising them in a list keeps the cell that iterates
# over `splits` re-runnable; otherwise a second run would see an exhausted
# generator and silently assign no folds at all.
splits = list(cv.split(RC_train, RC_train['R_C']))

In [9]:
# Give every breath the number of the fold in which it is a validation sample.
# The indices produced by the splitter are row *positions*, so the fold ids are
# written into a positional NumPy buffer rather than through label-based
# `.loc`, which is only equivalent while RC_train carries a default RangeIndex.
fold_by_position = np.full(len(RC_train), -1, dtype="int64")
for i, (train_idx, valid_idx) in enumerate(splits):
    print(i)
    fold_by_position[valid_idx] = i

# -1 is the "unassigned" sentinel; fail loudly instead of storing it (this
# happens, for example, when `splits` is an already-consumed generator).
assert (fold_by_position >= 0).all(), "some breaths were not assigned to a fold"
RC_train['fold'] = fold_by_position

0
1
2
3
4


In [11]:
# Sanity check on ids: the number of distinct breath_ids per fold, summed over
# all folds, must equal the number of distinct breath_ids in `train`.
ids_per_fold = RC_train.groupby("fold")["breath_id"].unique().map(len)
ids_per_fold.sum() == len(train["breath_id"].unique())

True

In [12]:
# Sanity check on stratification: number of breaths for every (fold, R_C) pair.
fold_rc_groups = RC_train.groupby(["fold", "R_C"])
fold_rc_groups.size()

fold  R_C  
0     20_10    1214
      20_20    1241
      20_50    1638
      50_10    2735
      50_20    1652
      50_50    1638
      5_10     1662
      5_20     1656
      5_50     1654
1     20_10    1214
      20_20    1242
      20_50    1637
      50_10    2736
      50_20    1652
      50_50    1638
      5_10     1662
      5_20     1655
      5_50     1654
2     20_10    1214
      20_20    1242
      20_50    1637
      50_10    2736
      50_20    1652
      50_50    1638
      5_10     1662
      5_20     1655
      5_50     1654
3     20_10    1214
      20_20    1242
      20_50    1637
      50_10    2735
      50_20    1652
      50_50    1637
      5_10     1663
      5_20     1655
      5_50     1655
4     20_10    1214
      20_20    1241
      20_50    1637
      50_10    2735
      50_20    1652
      50_50    1638
      5_10     1663
      5_20     1656
      5_50     1654
dtype: int64

In [13]:
# Lookup table: breath_id -> fold number, built row by row from RC_train.
id_fold_dict = {
    breath_id: fold
    for breath_id, fold in zip(RC_train["breath_id"], RC_train["fold"])
}

In [14]:
# Attach the fold number to every row of `train` through its breath_id.
# `Series.map` performs the dictionary lookup in vectorised form, avoiding one
# Python-level lambda call per row of the frame.
fold_per_row = train['breath_id'].map(id_fold_dict)
# `map` yields NaN for ids missing from the dictionary, where a plain dict
# lookup would raise; keep that strictness explicit.
if fold_per_row.isna().any():
    raise KeyError("breath_id without an entry in id_fold_dict")
# Keep the column as integers.
train['fold'] = fold_per_row.astype("int64")

In [15]:
# (rows, columns) of `train` after the `fold` column has been added.
train.shape

(6036000, 9)

In [16]:
# Row counts of `train` for every (fold, R, C) combination.
fold_r_c_groups = train.groupby(["fold", "R", "C"])
fold_r_c_groups.size()

fold  R   C 
0     5   10    132960
          20    132480
          50    132320
      20  10     97120
          20     99280
          50    131040
      50  10    218800
          20    132160
          50    131040
1     5   10    132960
          20    132400
          50    132320
      20  10     97120
          20     99360
          50    130960
      50  10    218880
          20    132160
          50    131040
2     5   10    132960
          20    132400
          50    132320
      20  10     97120
          20     99360
          50    130960
      50  10    218880
          20    132160
          50    131040
3     5   10    133040
          20    132400
          50    132400
      20  10     97120
          20     99360
          50    130960
      50  10    218800
          20    132160
          50    130960
4     5   10    133040
          20    132480
          50    132320
      20  10     97120
          20     99280
          50    130960
      50  10    21880

In [17]:
import os
import pickle

# Persist the breath_id -> fold mapping. The target path is derived from
# `output_folder` (instead of repeating the literal), and the folder is created
# when missing so the write does not fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)
id_fold_path = os.path.join(output_folder, 'id_fold_dict.pickle')
with open(id_fold_path, 'wb') as handle:
    pickle.dump(id_fold_dict, handle)
print("save id_fold_dict")

save id_fold_dict


In [20]:
# Median of every column of `train`, computed down the rows.
train.median(axis="index")

id           3.018000e+06
breath_id    6.276550e+04
R            2.000000e+01
C            2.000000e+01
time_step    1.308123e+00
u_in         4.386146e+00
u_out        1.000000e+00
pressure     7.032628e+00
fold         2.000000e+00
dtype: float64