## Imports

### Libraries

In [1]:
import random
import sys
from datetime import datetime, date, timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Data

In [2]:
# Read the train and test CSVs, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train.csv', 'test.csv')
)
print(train.shape)
train.head()

(6036000, 7)


Unnamed: 0_level_0,breath_id,R,C,time_step,u_in,u_out,pressure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,20,50,0.0,0.083334,0,5.837492
2,1,20,50,0.033652,18.383041,0,5.907794
3,1,20,50,0.067514,22.509278,0,7.876254
4,1,20,50,0.101542,22.808822,0,11.742872
5,1,20,50,0.135756,25.35585,0,12.234987


## Dataset creation

### Split train/test set - validation set

In [3]:
# Hold out one third of the breaths for validation. The split is made on
# breath_id (not on individual rows), so all rows sharing a breath_id end up
# on the same side of the split.
l_breath_id = train.breath_id.unique().tolist()
nb_breath_id = len(l_breath_id)

# Fixed seed so the same breaths are drawn on every run.
random.seed(113)
validation_set_id = random.sample(l_breath_id, nb_breath_id // 3)

# Build the membership mask once and negate it with `~` for the complement.
# `.copy()` gives each subset its own data, so adding columns to them later
# does not trigger pandas' SettingWithCopyWarning.
is_validation = train.breath_id.isin(validation_set_id)
validation_set = train[is_validation].copy()
train_test_set = train[~is_validation].copy()

### Split different R & C lungs

In [4]:
# Tag every row with its lung configuration: the string form of R immediately
# followed by the string form of C (no separator).
# NOTE(review): without a separator the key is only unambiguous as long as no
# two (R, C) pairs concatenate to the same string - confirm against the data.
# `assign` returns a new frame, so no column is written onto a filtered slice
# of `train` (which would raise SettingWithCopyWarning).
train_test_set = train_test_set.assign(
    R_C=train_test_set.R.astype(str) + train_test_set.C.astype(str)
)

# One DataFrame per lung configuration, in order of first appearance.
# Each subset is an explicit copy so later cells can add columns to it safely.
l_r_c = train_test_set.R_C.unique().tolist()
l_train_test_set = [
    train_test_set[train_test_set.R_C == r_c].copy()
    for r_c in l_r_c
]

### Features

In [7]:
def add_breath_features(df):
    """Return a copy of ``df`` extended with per-breath engineered features.

    Every grouped feature is computed within a ``breath_id`` group, so lags,
    leads and differences never cross from one breath into another.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in`` and
        ``u_out``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the feature columns, in
        the order they are built below. The input frame is not modified.

    Notes
    -----
    Lag/lead/diff columns are NaN at the breath boundaries where the shifted
    row does not exist. ``u_in_rate`` divides by ``dt_1`` and is therefore
    NaN on the first row of each breath.
    """
    # Build the groupby objects once instead of once per feature.
    by_breath = df.groupby('breath_id')
    g_u_in = by_breath['u_in']
    g_u_out = by_breath['u_out']
    g_time = by_breath['time_step']

    u_in = df['u_in']
    u_out = df['u_out']
    time_step = df['time_step']

    offsets = range(1, 5)
    feats = {}  # insertion order == final column order

    # --- u_in: per-breath statistics broadcast back onto every row ---
    for stat in ('sum', 'cumsum', 'std', 'min', 'max',
                 'mean', 'median', 'first', 'last'):
        feats[f'u_in_{stat}'] = g_u_in.transform(stat)

    # --- u_in: previous values (lag), next values (backlag), differences ---
    for k in offsets:
        feats[f'u_in_lag{k}'] = g_u_in.shift(k)
    for k in offsets:
        feats[f'u_in_backlag{k}'] = g_u_in.shift(-k)
    for k in offsets:
        feats[f'u_in_diff{k}'] = u_in - feats[f'u_in_lag{k}']

    # 'area' is time_step * u_in; 'area2' is its running total within a breath.
    feats['area'] = time_step * u_in
    feats['area2'] = feats['area'].groupby(df['breath_id']).cumsum()

    # Distance of each u_in value from its breath's max / mean. The per-breath
    # max and mean were already computed above, so they are reused here.
    feats['breath_id__u_in__diffmax'] = feats['u_in_max'] - u_in
    feats['breath_id__u_in__diffmean'] = feats['u_in_mean'] - u_in

    # Interactions with u_out.
    feats['cross'] = u_in * u_out
    feats['cross2'] = time_step * u_out

    # --- u_out: same lag / lead / difference scheme as u_in ---
    feats['u_out_sum'] = g_u_out.transform('sum')
    for k in offsets:
        feats[f'u_out_lag{k}'] = g_u_out.shift(k)
    for k in offsets:
        feats[f'u_out_backlag{k}'] = g_u_out.shift(-k)
    for k in offsets:
        feats[f'u_out_diff{k}'] = u_out - feats[f'u_out_lag{k}']

    # --- time_step: difference to the row k steps earlier, k = 1..8 ---
    for k in range(1, 9):
        feats[f'dt_{k}'] = g_time.diff(k)

    # Change in u_in per unit of time_step since the previous row.
    feats['u_in_rate'] = feats['u_in_diff1'] / feats['dt_1']

    # `assign` returns a new frame, so nothing is written onto a slice.
    return df.assign(**feats)


# Replace the frames one at a time so each old frame can be released as soon
# as its enriched version has been built.
for i, lung_df in enumerate(l_train_test_set):
    l_train_test_set[i] = add_breath_features(lung_df)



### Export

In [8]:
# Write one CSV per lung-configuration subset, named by its position in the list.
# NOTE(review): index=False drops the row index from the files - confirm that
# the index is not needed downstream.
for i, lung_df in enumerate(l_train_test_set):
    lung_df.to_csv(f'train_{i}.csv', index=False)

# The held-out validation breaths go to their own file.
validation_set.to_csv('validation.csv', index=False)

In [None]:
# Preview only the first rows of the first subset rather than the whole frame.
l_train_test_set[0].head()