In [14]:
import pandas as pd
from tqdm import tqdm

# 데이터 파일 로드
ingredientevents_train = pd.read_csv('data/train/ingredientevents_train.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# ingredientevents_train.csv에서 고유한 itemid 추출
unique_itemids = ingredientevents_train['itemid'].unique()

# d_items.csv에서 해당 itemid에 해당하는 레코드만 필터링
filtered_d_items = d_items[d_items['itemid'].isin(unique_itemids)]

# tqdm을 사용하여 병합 과정 표시
print("병합 중...")
merged_data_filtered = ingredientevents_train.merge(filtered_d_items[['itemid', 'label']], on='itemid', how='left')

# hadm_id와 itemid별로 amount의 평균을 계산
print("평균 계산 중...")
average_amounts_hadm_filtered = merged_data_filtered.groupby(['hadm_id', 'itemid'])['amount'].mean().reset_index()
average_amounts_hadm_filtered.rename(columns={'amount': 'average_amount'}, inplace=True)

# 평균 값을 itemid별로 열로 구성하여 hadm_id별 데이터 피벗
average_amounts_hadm_pivot_filtered = average_amounts_hadm_filtered.pivot(index='hadm_id', columns='itemid', values='average_amount')

# 결측치를 -1으로 채움
average_amounts_hadm_pivot_filtered = average_amounts_hadm_pivot_filtered.fillna(-1)

# itemid와 label을 매핑하는 딕셔너리 생성 (필터링된 itemid만 사용)
filtered_itemid_to_label = dict(zip(filtered_d_items['itemid'], filtered_d_items['label']))

# tqdm을 사용하여 컬럼 이름을 label로 변경
print("컬럼 이름 변경 중...")
new_columns = []
for col in tqdm(average_amounts_hadm_pivot_filtered.columns):
    new_columns.append(f"{filtered_itemid_to_label[int(col)]}_average_amount")
average_amounts_hadm_pivot_filtered.columns = new_columns

# 데이터프레임 인덱스 초기화
average_amounts_hadm_pivot_filtered.reset_index(inplace=True)

# 결과를 CSV 파일로 저장
print("CSV 파일 저장 중...")
average_amounts_hadm_pivot_filtered.to_csv('data/train_preprocessed/average_amounts_per_item_per_admission_ingredient.csv', index=False, encoding='utf-8')

# 최종 데이터 확인
average_amounts_hadm_pivot_filtered


병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 16/16 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Calcium (ingr)_average_amount,Carbohydrates_average_amount,Glucose (ingr)_average_amount,Kilogram calory_average_amount,Kilojoule_average_amount,Protein_average_amount,Water_average_amount,Calories_average_amount,Enteral Nutrition Ingredient_average_amount,OR Intake_ingr_average_amount,PACU Intake_ingr_average_amount,Blood Ingredient_average_amount,IVF ingredient_average_amount,Oral/Gastric Ingredient_average_amount,Parenteral Nutrition Ingredient_average_amount,Supplement Ingredient_average_amount
0,20010074,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,309.551681,44.336188,-1.000000,-1.0,995.0,-1.000000,313.840182,266.666667,-1.000000,-1.0
1,20011505,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,145.877753,22.658544,-1.000000,-1.0,-1.0,261.715149,114.371826,166.842105,-1.000000,-1.0
2,20014219,0.035932,-1.000000,-1.000000,-1.0,-1.000000,21.797760,78.557907,111.982621,270.551817,-1.0,-1.0,316.666662,58.524380,68.349515,-1.000000,-1.0
3,20020562,0.018000,25000.001907,25000.001907,100.0,419.000002,-1.000000,328.714584,21.016236,-1.000000,1055.0,-1.0,609.306368,258.195365,180.000000,-1.000000,-1.0
4,20023734,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,298.916667,22.666667,-1.000000,-1.0,-1.0,-1.000000,298.916667,-1.000000,-1.000000,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,29964871,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,222.978723,22.913043,-1.000000,-1.0,-1.0,-1.000000,226.739130,50.000000,-1.000000,-1.0
2353,29966638,0.048134,-1.000000,-1.000000,-1.0,-1.000000,21.857319,159.940511,112.693566,159.651207,-1.0,-1.0,299.170131,132.406837,84.375000,861.983817,-1.0
2354,29966688,-1.000000,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,131.333333,-1.000000,-1.000000,-1.0,-1.0,-1.000000,49.999999,137.142857,-1.000000,-1.0
2355,29978132,0.022500,-1.000000,-1.000000,-1.0,-1.000000,-1.000000,230.743345,24.508923,-1.000000,-1.0,-1.0,333.299994,142.710122,-1.000000,1756.865094,-1.0


In [15]:
import pandas as pd
from tqdm import tqdm

# 데이터 파일 로드
inputevents_train = pd.read_csv('data/train/inputevents_train.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# inputevents_train.csv에서 고유한 itemid 추출
unique_input_itemids = inputevents_train['itemid'].unique()

# d_items.csv에서 해당 itemid에 해당하는 레코드만 필터링
filtered_input_d_items = d_items[d_items['itemid'].isin(unique_input_itemids)]

# tqdm을 사용하여 병합 과정 표시
print("병합 중...")
merged_input_data = inputevents_train.merge(filtered_input_d_items[['itemid', 'label']], on='itemid', how='left')

# hadm_id와 itemid별로 amount의 평균을 계산
print("평균 계산 중...")
average_amounts_hadm_input = merged_input_data.groupby(['hadm_id', 'itemid'])['amount'].mean().reset_index()
average_amounts_hadm_input.rename(columns={'amount': 'average_amount'}, inplace=True)

# 평균 값을 itemid별로 열로 구성하여 hadm_id별 데이터 피벗
average_amounts_hadm_pivot_input = average_amounts_hadm_input.pivot(index='hadm_id', columns='itemid', values='average_amount')

# 결측치를 -1으로 채움
average_amounts_hadm_pivot_input = average_amounts_hadm_pivot_input.fillna(-1)

# itemid와 label을 매핑하는 딕셔너리 생성 (필터링된 itemid만 사용)
input_itemid_to_label = dict(zip(filtered_input_d_items['itemid'], filtered_input_d_items['label']))

# tqdm을 사용하여 컬럼 이름을 label로 변경
print("컬럼 이름 변경 중...")
new_columns = []
for col in tqdm(average_amounts_hadm_pivot_input.columns):
    new_columns.append(f"{input_itemid_to_label[int(col)]}_average_amount")
average_amounts_hadm_pivot_input.columns = new_columns

# 데이터프레임 인덱스 초기화
average_amounts_hadm_pivot_input.reset_index(inplace=True)

# 결과를 CSV 파일로 저장
print("CSV 파일 저장 중...")
average_amounts_hadm_pivot_input.to_csv('data/train_preprocessed/average_amounts_per_item_per_admission_input_filtered.csv', index=False, encoding='utf-8')

# 최종 데이터 확인
average_amounts_hadm_pivot_input


병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 263/263 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Albumin 25%_average_amount,Albumin 5%_average_amount,Dextrose 5%_average_amount,Dextrose 10%_average_amount,Dextrose 50%_average_amount,Fresh Frozen Plasma_average_amount,Sodium Bicarbonate 8.4%_average_amount,Nutren Renal (Full)_average_amount,Impact (Full)_average_amount,...,Epinephrine._average_amount,Calcium Chloride_average_amount,Nicardipine 40mg/200_average_amount,Phenylephrine (50/250)_average_amount,Phenylephrine (200/250)_average_amount,Bumetanide (Bumex)_average_amount,Calcium Gluconate (Bolus)_average_amount,Amiodarone 450/250_average_amount,Angiotensin II (Giapreza)_average_amount,Angiotensin II (Giapreza)_average_amount.1
0,20010074,-1.0,-1.000000,161.946916,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,20011505,-1.0,-1.000000,161.364454,-1.0,-1.0,-1.0,73.875,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,20014219,-1.0,-1.000000,85.109727,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,20020562,-1.0,499.999981,96.693463,100.5,50.0,532.5,150.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,20023734,-1.0,-1.000000,133.333333,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2352,29964871,-1.0,-1.000000,134.782609,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2353,29966638,-1.0,-1.000000,100.000000,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2354,29966688,-1.0,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2355,29978132,-1.0,-1.000000,123.057186,-1.0,-1.0,-1.0,-1.000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [16]:
import pandas as pd
from tqdm import tqdm

outputevents_train = pd.read_csv('data/train/outputevents_train.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

unique_output_itemids = outputevents_train['itemid'].unique()

# d_items.csv에서 해당 itemid에 해당하는 레코드만 필터링
filtered_output_d_items = d_items[d_items['itemid'].isin(unique_output_itemids)]

print("병합 중...")
merged_output_data = outputevents_train.merge(filtered_output_d_items[['itemid', 'label']], on='itemid', how='left')

# hadm_id와 itemid별로 value의 평균을 계산
print("평균 계산 중...")
average_values_hadm_output = merged_output_data.groupby(['hadm_id', 'itemid'])['value'].mean().reset_index()
average_values_hadm_output.rename(columns={'value': 'average_value'}, inplace=True)

# 평균 값을 itemid별로 열로 구성하여 hadm_id별 데이터 피벗
average_values_hadm_pivot_output = average_values_hadm_output.pivot(index='hadm_id', columns='itemid', values='average_value')

# 결측치를 -1으로 채움
average_values_hadm_pivot_output = average_values_hadm_pivot_output.fillna(-1)

# itemid와 label을 매핑하는 딕셔너리 생성 (필터링된 itemid만 사용)
output_itemid_to_label = dict(zip(filtered_output_d_items['itemid'], filtered_output_d_items['label']))

print("컬럼 이름 변경 중...")
new_columns = []
for col in tqdm(average_values_hadm_pivot_output.columns):
    new_columns.append(f"{output_itemid_to_label[int(col)]}_average_value")
average_values_hadm_pivot_output.columns = new_columns

average_values_hadm_pivot_output.reset_index(inplace=True)

print("CSV 파일 저장 중...")
average_values_hadm_pivot_output.to_csv('data/train_preprocessed/average_values_per_item_per_admission_output_filtered.csv', index=False, encoding='utf-8')

average_values_hadm_pivot_output

병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 65/65 [00:00<00:00, 64148.18it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,L Ureteral Stent_average_value,Foley_average_value,Void_average_value,Condom Cath_average_value,Suprapubic_average_value,R Nephrostomy_average_value,L Nephrostomy_average_value,Straight Cath_average_value,Blakemore_average_value,...,PACU Urine_average_value,Cath Lab_average_value,Pre-Admission_average_value,GU Irrigant Volume In_average_value,GU Irrigant/Urine Volume Out_average_value,TF Residual_average_value,TF Residual Output_average_value,Drainage Bag_average_value,Chest Tube #3_average_value,Chest Tube #4_average_value
0,20010074,-1.0,252.954545,228.571429,-1.000000,-1.0,-1.0,-1.0,825.0,-1.0,...,880.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
1,20011505,-1.0,93.571429,337.500000,-1.000000,-1.0,-1.0,-1.0,700.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
2,20014219,-1.0,88.798295,295.625000,-1.000000,-1.0,-1.0,-1.0,450.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,26.571429,1000.0,-1.0,-1.0,-1.0
3,20020562,-1.0,73.693878,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
4,20023734,-1.0,-1.000000,155.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2242,29964871,-1.0,138.945946,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
2243,29966638,-1.0,18.986111,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,68.684211,-1.0,-1.0,-1.0,-1.0
2244,29966688,-1.0,-1.000000,236.666667,262.857143,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,2100.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
2245,29978132,-1.0,137.203390,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0


In [17]:
# import pandas as pd

# # Load the datasets
# procedureevents_df = pd.read_csv('final project dataset/procedureevents_train.csv')
# d_items_df = pd.read_csv('final project dataset/dictionary/d_items.csv')

# # itemid에 맞는 label을 추가하기 위해 두 데이터프레임 병합
# merged_df = procedureevents_df.merge(d_items_df[['itemid', 'label']], on='itemid', how='left')

# # 각 hadm_id별 label별 발생 횟수를 세고, 결측값을 0으로 채우기
# count_df = merged_df.groupby(['hadm_id', 'label']).size().unstack(fill_value=0)

# # tqdm으로 평균 계산 과정을 표시
# # 각 hadm_id별로 label의 평균 횟수 계산
# average_count_df = pd.DataFrame()
# for hadm_id, data in tqdm(count_df.groupby('hadm_id'), desc="Calculating averages"):
#     average = data.mean()
#     average['hadm_id'] = hadm_id
#     average_count_df = pd.concat([average_count_df, average.to_frame().T], ignore_index=True)

# # 컬럼 이름에 '_average_count' 접미사 추가
# average_count_df.columns = ['hadm_id'] + [f"{label}_average_count" for label in average_count_df.columns[1:]]

# # CSV 파일로 저장
# average_count_df.to_csv('Feature importance data/average_procedure_count.csv', index=False, encoding='utf-8')

# # 결과 출력
# average_count_df.head()
import pandas as pd

# Load the datasets
procedureevents_df = pd.read_csv('data/train/procedureevents_train.csv')
d_items_df = pd.read_csv('data/dictionary/d_items.csv')

# itemid에 맞는 label을 추가하기 위해 두 데이터프레임 병합
merged_df = procedureevents_df.merge(d_items_df[['itemid', 'label']], on='itemid', how='left')

# 각 hadm_id별 label별 발생 횟수를 세고, 결측값을 0으로 채우기
count_df = merged_df.groupby(['hadm_id', 'label']).size().unstack(fill_value=-1)

# 컬럼 이름에 '_count' 접미사 추가
count_df.columns = [f"{label}_count" for label in count_df.columns]

# 인덱스를 초기화하여 데이터프레임 변환
count_df.reset_index(inplace=True)

# CSV 파일로 저장
count_df.to_csv('data/train_preprocessed/procedure_count.csv', index=False, encoding='utf-8')

# 결과 출력
count_df


Unnamed: 0,hadm_id,14 Gauge_count,16 Gauge_count,18 Gauge_count,20 Gauge_count,22 Gauge_count,24 Gauge_count,AVA_count,Abdominal X-Ray_count,Angiography_count,...,Ultrasound_count,Unplanned Extubation (non-patient initiated)_count,Unplanned Extubation (patient-initiated)_count,Unplanned Line/Catheter Removal (Non-Patient initated)_count,Unplanned Line/Catheter Removal (Patient Initiated)_count,Urine Culture_count,VAC Change_count,Venogram_count,Wound Culture_count,X-ray_count
0,20010074,-1,-1,-1,2,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,20011505,-1,-1,1,3,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,20014219,-1,-1,1,4,4,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,20020562,-1,-1,-1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,20023734,-1,-1,1,2,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360,29964871,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2361,29966638,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,-1,-1,-1,-1,1,-1,-1,-1,-1
2362,29966688,-1,-1,-1,2,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2363,29978132,-1,-1,-1,3,1,-1,-1,1,1,...,-1,-1,-1,-1,-1,1,-1,-1,-1,1


In [18]:
import pandas as pd
from tqdm import tqdm

# 데이터 파일 로드
chartevents_train = pd.read_csv('data/train/chartevents_train.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# 'value' 열에서 문자열이 포함된 행 제거
chartevents_train = chartevents_train[chartevents_train['value'].apply(lambda x: str(x).replace('.', '', 1).isdigit())]

# 'value' 열을 수치형으로 변환
chartevents_train['value'] = chartevents_train['value'].astype(float)

# chartevents_train의 고유한 itemid 추출
unique_chartevent_itemids = chartevents_train['itemid'].unique()

# d_items에서 해당 itemid에 해당하는 레코드만 필터링
filtered_chartevent_d_items = d_items[d_items['itemid'].isin(unique_chartevent_itemids)]

# tqdm을 사용하여 병합 과정 표시
print("병합 중...")
merged_chartevent_data = chartevents_train.merge(filtered_chartevent_d_items[['itemid', 'label']], on='itemid', how='left')

# hadm_id와 itemid별로 value의 평균 계산
print("평균 계산 중...")
average_values_hadm_chartevent = merged_chartevent_data.groupby(['hadm_id', 'itemid'])['value'].mean().reset_index()
average_values_hadm_chartevent.rename(columns={'value': 'average_value'}, inplace=True)

# 피벗 테이블 생성
average_values_hadm_pivot_chartevent = average_values_hadm_chartevent.pivot(index='hadm_id', columns='itemid', values='average_value')
average_values_hadm_pivot_chartevent = average_values_hadm_pivot_chartevent.fillna(-1)

# itemid와 label을 매핑하는 딕셔너리 생성
chartevent_itemid_to_label = dict(zip(filtered_chartevent_d_items['itemid'], filtered_chartevent_d_items['label']))

# tqdm을 사용하여 컬럼 이름을 label로 변경
print("컬럼 이름 변경 중...")
new_columns = []
for col in tqdm(average_values_hadm_pivot_chartevent.columns):
    new_columns.append(f"{chartevent_itemid_to_label[int(col)]}_average_value")
average_values_hadm_pivot_chartevent.columns = new_columns

# 데이터프레임 인덱스 초기화
average_values_hadm_pivot_chartevent.reset_index(inplace=True)

# 결과를 CSV 파일로 저장
print("CSV 파일 저장 중...")
average_values_hadm_pivot_chartevent.to_csv('data/train_preprocessed/average_values_per_item_per_admission_chartevent_filtered.csv', index=False)

# 최종 데이터 확인
average_values_hadm_pivot_chartevent


병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 681/681 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Heart Rate_average_value,Heart rate Alarm - High_average_value,Heart Rate Alarm - Low_average_value,Arterial Blood Pressure systolic_average_value,Arterial Blood Pressure diastolic_average_value,Arterial Blood Pressure mean_average_value,Arterial Blood Pressure Alarm - Low_average_value,Arterial Blood Pressure Alarm - High_average_value,Pulmonary Artery Pressure systolic_average_value,...,Mean BP (VAD)_average_value,Pulsatility Index (VAD)_average_value,Speed (VAD)_average_value,Power (Watts) (VAD)_average_value,Forehead SpO2 Sensor in Place_average_value,Forehead Sensor Position Changed_average_value,No Pressure Injury Present_average_value,PICC Biopatch_average_value,Dialysis/Pheresis Biopatch_average_value,CVL Biopatch_average_value
0,20010074,88.115789,130.000000,50.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,20011505,92.343066,142.000000,41.00000,110.037736,62.641509,77.641509,82.500000,160.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,20014219,80.594646,132.564103,50.25641,104.839590,58.931741,74.546075,77.222222,101.666667,56.84,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,20020562,94.017241,131.428571,50.00000,97.028169,46.957143,64.521739,67.250000,155.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,20023734,91.526316,130.000000,50.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,29964871,90.500000,120.000000,50.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2374,29966638,96.444882,121.666667,61.25000,123.712821,57.733333,81.471795,90.000000,161.428571,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2375,29966688,60.170732,120.000000,50.00000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2376,29978132,101.497512,129.259259,50.00000,102.932773,59.436975,72.000000,89.230769,160.000000,-1.00,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [19]:
# icustays_train.csv 중복 hadm_id 하나의 행으로 합치기
icustays = pd.read_csv("data/train/icustays_train.csv")

# 같은 hadm_id를 기준으로 los를 합산
icustays_aggregated = icustays.groupby("hadm_id", as_index=False).agg({
    "los": "sum",  # los 합산
    # 다른 열도 처리 방식 정의 (예: 첫 번째 값 사용)
    # "column_name": "first"
})

# 새로운 파일로 저장
icustays_aggregated.to_csv("data/icustays_aggregated.csv", index=False)
print("Aggregated icustays 데이터가 저장되었습니다: data/train_preprocessed/icustays_aggregated.csv")

# 결과 확인
icustays_aggregated

Aggregated icustays 데이터가 저장되었습니다: data/train_preprocessed/icustays_aggregated.csv


Unnamed: 0,hadm_id,los
0,20010074,2.589410
1,20011505,5.108530
2,20014219,19.133669
3,20020562,3.671887
4,20023734,0.793576
...,...,...
2373,29964871,2.555000
2374,29966638,10.394236
2375,29966688,1.570995
2376,29978132,7.429630
