# **Title Name :  인공호흡기 압력예측**

<p style="font-weight:bolder; font-size : 21px">
    Step : EDA 및 가설수립 [step:2]
<p>
<p style="font-weight:bolder; font-size : 21px">
   RegDate : 2023.11.07
<p>

------------------------------------------------------------

# 0. 환경설정
--------------------------------------------------------------------------------

In [1]:
#==================================================
# 모듈 불러오기
#==================================================

# 시스템
import os
import sys
import random
from time import time

# 데이터분석 4종세트
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# 파이토치
from torch import torch
import torch.nn as nn
from torch.utils.data.dataloader import DataLoader
import torch.optim as optim
import torchvision
from torchvision import transforms
# from torchinfo import summary    # require : pip install -qqq torchinfo


# 사이킷런
import sklearn
# from sklearn.preprocessing import RobustScaler
# from sklearn.model_selection import GroupKFold, train_test_split


# 유틸기능
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')


# 그래프 테마
sns.set_theme(style='whitegrid')
plt.style.use(['dark_background'])

In [2]:
#==================================================
# 시드설정
#==================================================
# 시드설정
SEED = 2023
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# deterministic 설정
deterministic = True
if deterministic :
    torch.backends.cudnn.deterministic  = True
    torch.backends.cudnn.benchmark      = False

In [3]:
#==================================================
# 데이터 패스설정
#==================================================

# 코랩패스설정
if 'google.colab' in sys.modules :
    from google.colab import drive
    drive.mount('/content/drive/')
    base_path = '/content/drive/MyDrive/프로젝트/[DL]인공호흡기압력예측/data/'

# 로컬패스설정
else : base_path = './data/'


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
#==================================================
# 데이터 불러오기
#==================================================
df_train = pd.read_csv(base_path+'train.csv')
df_test  = pd.read_csv(base_path+'test.csv')

In [5]:
#==================================================
# 버전체크
#==================================================
print(f'numpy_Ver           :   {np.__version__}버전')
print('-'*50)
print(f'pandas_Ver          :   {pd.__version__}버전')
print('-'*50)
print(f'seaborn_Ver         :   {sns.__version__}버전')
print('-'*50)
print(f'torch_Ver           :   {torch.__version__}버전')
print('-'*50)
print(f'torchvision_Ver     :   {torchvision.__version__}버전')
print('-'*50)
print(f'cpu_count           :   {os.cpu_count()}코어')

numpy_Ver           :   1.23.5버전
--------------------------------------------------
pandas_Ver          :   1.5.3버전
--------------------------------------------------
seaborn_Ver         :   0.12.2버전
--------------------------------------------------
torch_Ver           :   2.1.0+cu118버전
--------------------------------------------------
torchvision_Ver     :   0.16.0+cu118버전
--------------------------------------------------
cpu_count           :   2코어


# 1. 전처리

**피쳐 상세**

| 피쳐          | 설명                                                |
|---------------|-----------------------------------------------------|
| id            | time step 기반 식별자                               |
| breath_id     | 호흡에 대한 식별자                                  |
| R(resistance) | 폐의 입구크기(저항성:공기를 많이받냐,적게받냐)      |
| C(Compliance) | 폐의 최대용량(순응성:호흡이들어갈수있는크기의한계)  |
| time_step     | 실제 시간 기록                                      |
| u_in          | 공기 입력값 0~100                                   |
| u_out         | 흡기/배기 여부 0~1 (0은흡기,1은배기)                |
| `pressure`    | `폐의압력, cmH2O 단위로 측정`                       |

전처리 : SINO
1. Data___Shape_____Check
2. Data___InfoType__Check
3. Data___Null______Check
4. Data___Outlier___Check

In [6]:
#==================================================
# 데이터확인
#==================================================
display('df_train   :',df_train)
display('df_test    :',df_test)

'df_train   :'

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


'df_test    :'

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


In [7]:
#==================================================
# 형상확인
#==================================================
print('df_train   :',df_train.shape)

df_train   : (6036000, 8)


In [8]:
#==================================================
# 타입확인
#==================================================
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6036000 entries, 0 to 6035999
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   id         int64  
 1   breath_id  int64  
 2   R          int64  
 3   C          int64  
 4   time_step  float64
 5   u_in       float64
 6   u_out      int64  
 7   pressure   float64
dtypes: float64(3), int64(5)
memory usage: 368.4 MB
None


In [9]:
#==================================================
# 결측치확인
#==================================================
display(df_train.isnull().sum().to_frame().reset_index())

Unnamed: 0,index,0
0,id,0
1,breath_id,0
2,R,0
3,C,0
4,time_step,0
5,u_in,0
6,u_out,0
7,pressure,0


In [10]:
#==================================================
# 이상치확인
#==================================================
display(df_train.describe())

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
count,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0
mean,3018000.0,62838.86,27.03618,26.08072,1.307225,7.321615,0.6204493,11.22041
std,1742443.0,36335.26,19.59549,17.15231,0.7659778,13.4347,0.4852752,8.109703
min,1.0,1.0,5.0,10.0,0.0,0.0,0.0,-1.895744
25%,1509001.0,31377.0,5.0,10.0,0.6428995,0.3936623,0.0,6.329607
50%,3018000.0,62765.5,20.0,20.0,1.308123,4.386146,1.0,7.032628
75%,4527000.0,94301.0,50.0,50.0,1.965502,4.983895,1.0,13.64103
max,6036000.0,125749.0,50.0,50.0,2.937238,100.0,1.0,64.82099


# 2. EDA
---------------------------------------------------------

In [11]:
!pip install -q dataprep

In [12]:
from sklearn.metrics import mean_squared_error
from dataprep.eda import *
from dataprep.eda import plot
from dataprep.eda import plot_correlation
from dataprep.eda import plot_missing
import plotly.express as px
import plotly.figure_factory as ff
sns.set(rc={'figure.figsize': [10, 10]}, font_scale=1.3)

In [13]:
#  데이터 살펴보기
#-------------------------------------------

#  ▶ 불필요한 피쳐가 있는지 확인해보자


#  데이터 시각화
#-------------------------------------------
#  ▶ 그래프를 통해서 각 피쳐를 살펴보자


#  가설수립
#-------------------------------------------

#  ▶ 가설1. R이 작을수록 압력이 높을것이다

#  ▶ 가설2. C가 적을수록 압력이 높을것이다

#  ▶ 가설3. u_in의 수치가 높을수록 압력이 높을것이다

#  ▶ 가설4. u_out은 압력에 연관성이 없을것이다.


###  데이터 살펴보기
--------------------------------------------------

In [14]:
#  ▶ 1. 불필요한 피쳐가 있는지 확인해보자

# id는 인덱스와 같다 (제거해도됨)
df_id = pd.DataFrame(df_train.id.value_counts()).reset_index(drop=True)
print('df_id 고유피쳐수: ',df_id.nunique().sum(),'개')
df_id = pd.DataFrame(df_id.id.sort_values())
display(df_id)

# df_breath_id은 고유 식별자
df_breath_id = pd.DataFrame(df_train.breath_id.value_counts()).reset_index()
df_breath_id = pd.DataFrame(df_breath_id.breath_id.sort_values())
print('breath_id 고유피쳐수: ',df_breath_id.nunique().sum(),'개')
display(df_breath_id)

df_id 고유피쳐수:  1 개


Unnamed: 0,id
0,1
4024004,1
4024003,1
4024002,1
4024001,1
...,...
2011996,1
2011995,1
2011994,1
2012003,1


breath_id 고유피쳐수:  1 개


Unnamed: 0,breath_id
0,80
50302,80
50301,80
50300,80
50299,80
...,...
25148,80
25147,80
25146,80
25162,80


###  데이터 시각화
------------------------------------------

In [15]:
# 불필요한 컬럼 제거
df_train.drop('id', axis=1, inplace=True)
df_train.drop('breath_id', axis=1, inplace=True)
df_test.drop('id', axis=1, inplace=True)
df_test.drop('breath_id', axis=1, inplace=True)
print(df_train.head())
print("*************************************")
print(df_test.head())

    R   C  time_step       u_in  u_out   pressure
0  20  50   0.000000   0.083334      0   5.837492
1  20  50   0.033652  18.383041      0   5.907794
2  20  50   0.067514  22.509278      0   7.876254
3  20  50   0.101542  22.808822      0  11.742872
4  20  50   0.135756  25.355850      0  12.234987
*************************************
   R   C  time_step       u_in  u_out
0  5  20   0.000000   0.000000      0
1  5  20   0.031904   7.515046      0
2  5  20   0.063827  14.651675      0
3  5  20   0.095751  21.230610      0
4  5  20   0.127644  26.320956      0


In [18]:
# 전체 훈련 데이터 체크
plot(df_train)

Output hidden; open in https://colab.research.google.com to view.

In [19]:
# R 피쳐 체크
plot(df_train, 'R')

Output hidden; open in https://colab.research.google.com to view.

In [20]:
# C 피쳐 체크
plot(df_train, 'C')

Output hidden; open in https://colab.research.google.com to view.

In [21]:
plot(df_train, 'time_step')

Output hidden; open in https://colab.research.google.com to view.

In [22]:
plot(df_train, 'u_in')

Output hidden; open in https://colab.research.google.com to view.

In [23]:
plot(df_train, 'u_out')

Output hidden; open in https://colab.research.google.com to view.

In [24]:
plot(df_train, 'pressure')

Output hidden; open in https://colab.research.google.com to view.

In [25]:
create_report(df_train)

Output hidden; open in https://colab.research.google.com to view.

In [32]:
create_report(df_test)

Output hidden; open in https://colab.research.google.com to view.

In [27]:
plot_correlation(df_train)

Output hidden; open in https://colab.research.google.com to view.

In [28]:
df_train.skew()

R            0.161473
C            0.565975
time_step    0.011604
u_in         3.912228
u_out       -0.496417
pressure     1.818959
dtype: float64

In [29]:
df_test.skew()

R            0.158385
C            0.565272
time_step    0.011585
u_in         3.907144
u_out       -0.496142
dtype: float64

In [30]:
df_train

Unnamed: 0,R,C,time_step,u_in,u_out,pressure
0,20,50,0.000000,0.083334,0,5.837492
1,20,50,0.033652,18.383041,0,5.907794
2,20,50,0.067514,22.509278,0,7.876254
3,20,50,0.101542,22.808822,0,11.742872
4,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...
6035995,50,10,2.504603,1.489714,1,3.869032
6035996,50,10,2.537961,1.488497,1,3.869032
6035997,50,10,2.571408,1.558978,1,3.798729
6035998,50,10,2.604744,1.272663,1,4.079938


###  가설수립
-------------------------------------------

In [None]:
#  ▶ 가설1. R이 작을수록 압력이 높을것이다
# 맞는듯 하지만 u_in에비해 영향력은 낮을것으로 판단됨
#  ▶ 가설2. C가 적을수록 압력이 높을것이다
# 맞는듯  하지만  u_in에비해 영향력은 낮을것으로 판단됨
#  ▶ 가설3. u_in의 수치가 높을수록 압력이 높을것이다
# 맞는듯
#  ▶ 가설4. u_out은 압력에 연관성이 없을것이다.
# 맞는듯

In [31]:
##### DL START ######