In [4]:
import pandas as pd
from tqdm import tqdm

# Build one row per admission (hadm_id) holding the mean `amount` of every
# ingredient item, with columns named after the item's dictionary label.

# Load the ingredient events of the test split and the item dictionary.
ingredientevents_test = pd.read_csv('data/test/ingredientevents_test.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# Item ids that actually occur in the ingredient events.
unique_itemids = ingredientevents_test['itemid'].unique()

# Restrict the dictionary to those item ids.
filtered_d_items = d_items[d_items['itemid'].isin(unique_itemids)]

# Attach the label to every event row. This is a plain pandas merge; no
# progress bar is involved, only the status message below.
print("병합 중...")
merged_data_filtered = ingredientevents_test.merge(
    filtered_d_items[['itemid', 'label']], on='itemid', how='left'
)

# Mean amount per (hadm_id, itemid).
print("평균 계산 중...")
average_amounts_hadm_filtered = (
    merged_data_filtered
    .groupby(['hadm_id', 'itemid'])['amount']
    .mean()
    .reset_index()
    .rename(columns={'amount': 'average_amount'})
)

# Pivot to wide form: index = hadm_id, one column per itemid.
average_amounts_hadm_pivot_filtered = average_amounts_hadm_filtered.pivot(
    index='hadm_id', columns='itemid', values='average_amount'
)

# -1 marks (hadm_id, itemid) combinations that have no mean value.
average_amounts_hadm_pivot_filtered = average_amounts_hadm_pivot_filtered.fillna(-1)

# itemid -> label lookup. Rows without a label are left out so that the
# fallback below applies to them as well.
labelled_d_items = filtered_d_items.dropna(subset=['label'])
filtered_itemid_to_label = dict(zip(labelled_d_items['itemid'], labelled_d_items['label']))

# Rename the itemid columns to "<label>_average_amount". An itemid with no
# dictionary label keeps its numeric id in the column name instead of raising
# KeyError and aborting the whole cell.
print("컬럼 이름 변경 중...")
new_columns = [
    f"{filtered_itemid_to_label.get(int(col), int(col))}_average_amount"
    for col in tqdm(average_amounts_hadm_pivot_filtered.columns)
]
average_amounts_hadm_pivot_filtered.columns = new_columns

# Turn hadm_id back into a regular column.
average_amounts_hadm_pivot_filtered = average_amounts_hadm_pivot_filtered.reset_index()

# Persist the feature table.
print("CSV 파일 저장 중...")
average_amounts_hadm_pivot_filtered.to_csv('data/test_preprocessed/average_amounts_per_item_per_admission_ingredient.csv', index=False, encoding='utf-8')

# Display the final table.
average_amounts_hadm_pivot_filtered


병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 16/16 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Calcium (ingr)_average_amount,Carbohydrates_average_amount,Glucose (ingr)_average_amount,Kilogram calory_average_amount,Kilojoule_average_amount,Protein_average_amount,Water_average_amount,Calories_average_amount,Enteral Nutrition Ingredient_average_amount,OR Intake_ingr_average_amount,PACU Intake_ingr_average_amount,Blood Ingredient_average_amount,IVF ingredient_average_amount,Oral/Gastric Ingredient_average_amount,Parenteral Nutrition Ingredient_average_amount,Supplement Ingredient_average_amount
0,20013945,-1.000,-1.0,-1.0,-1.0,-1.0,-1.000000,250.829503,14.757199,-1.000000,2500.00,-1.0,-1.000000,263.716478,241.304348,-1.0,-1.0
1,20033924,-1.000,-1.0,-1.0,-1.0,-1.0,-1.000000,143.141146,11.899075,-1.000000,942.75,-1.0,307.833331,101.565283,208.888889,-1.0,-1.0
2,20055820,0.018,-1.0,-1.0,-1.0,-1.0,-1.000000,205.570848,12.928574,-1.000000,-1.00,-1.0,-1.000000,58.003432,186.470588,-1.0,-1.0
3,20070455,0.027,-1.0,-1.0,-1.0,-1.0,45.404598,118.718329,113.277333,348.982290,-1.00,-1.0,-1.000000,110.203866,99.814815,-1.0,-1.0
4,20087467,-1.000,-1.0,-1.0,-1.0,-1.0,-1.000000,325.751368,67.373398,-1.000000,-1.00,-1.0,49.999999,325.312692,-1.000000,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,29914435,-1.000,-1.0,-1.0,-1.0,-1.0,-1.000000,178.824786,20.090909,-1.000000,-1.00,-1.0,-1.000000,159.068627,105.714286,-1.0,-1.0
514,29923776,0.018,-1.0,-1.0,-1.0,-1.0,8.196842,79.654646,35.460329,121.255076,-1.00,-1.0,172.222221,71.289007,79.687500,-1.0,-1.0
515,29942526,0.018,-1.0,-1.0,-1.0,-1.0,41.831356,162.670372,288.895409,655.663903,-1.00,-1.0,362.499998,159.793104,68.000000,-1.0,-1.0
516,29961069,-1.000,-1.0,-1.0,-1.0,-1.0,-1.000000,208.303224,21.250000,-1.000000,-1.00,-1.0,-1.000000,119.757872,235.000000,-1.0,-1.0


In [5]:
import pandas as pd
from tqdm import tqdm

# Build one row per admission (hadm_id) holding the mean `amount` of every
# input-event item, with columns named after the item's dictionary label.

# Load the input events of the test split and the item dictionary.
inputevents_test = pd.read_csv('data/test/inputevents_test.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# Item ids that actually occur in the input events.
unique_input_itemids = inputevents_test['itemid'].unique()

# Restrict the dictionary to those item ids.
filtered_input_d_items = d_items[d_items['itemid'].isin(unique_input_itemids)]

# Attach the label to every event row. This is a plain pandas merge; no
# progress bar is involved, only the status message below.
print("병합 중...")
merged_input_data = inputevents_test.merge(
    filtered_input_d_items[['itemid', 'label']], on='itemid', how='left'
)

# Mean amount per (hadm_id, itemid).
print("평균 계산 중...")
average_amounts_hadm_input = (
    merged_input_data
    .groupby(['hadm_id', 'itemid'])['amount']
    .mean()
    .reset_index()
    .rename(columns={'amount': 'average_amount'})
)

# Pivot to wide form: index = hadm_id, one column per itemid.
average_amounts_hadm_pivot_input = average_amounts_hadm_input.pivot(
    index='hadm_id', columns='itemid', values='average_amount'
)

# -1 marks (hadm_id, itemid) combinations that have no mean value.
average_amounts_hadm_pivot_input = average_amounts_hadm_pivot_input.fillna(-1)

# itemid -> label lookup. Rows without a label are left out so that the
# fallback below applies to them as well.
labelled_input_d_items = filtered_input_d_items.dropna(subset=['label'])
input_itemid_to_label = dict(zip(labelled_input_d_items['itemid'], labelled_input_d_items['label']))

# Rename the itemid columns to "<label>_average_amount". An itemid with no
# dictionary label keeps its numeric id in the column name instead of raising
# KeyError and aborting the whole cell.
print("컬럼 이름 변경 중...")
new_columns = [
    f"{input_itemid_to_label.get(int(col), int(col))}_average_amount"
    for col in tqdm(average_amounts_hadm_pivot_input.columns)
]
average_amounts_hadm_pivot_input.columns = new_columns

# Turn hadm_id back into a regular column.
average_amounts_hadm_pivot_input = average_amounts_hadm_pivot_input.reset_index()

# Persist the feature table.
print("CSV 파일 저장 중...")
average_amounts_hadm_pivot_input.to_csv('data/test_preprocessed/average_amounts_per_item_per_admission_input_filtered.csv', index=False, encoding='utf-8')

# Display the final table.
average_amounts_hadm_pivot_input


병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 224/224 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Albumin 25%_average_amount,Albumin 5%_average_amount,Dextrose 5%_average_amount,Dextrose 10%_average_amount,Dextrose 50%_average_amount,Fresh Frozen Plasma_average_amount,Sodium Bicarbonate 8.4%_average_amount,Nutren Renal (Full)_average_amount,Adenosine_average_amount,...,Esomeprazole (Nexium)_average_amount,Ondansetron (Zofran)_average_amount,Epinephrine._average_amount,Calcium Chloride_average_amount,Nicardipine 40mg/200_average_amount,Phenylephrine (50/250)_average_amount,Phenylephrine (200/250)_average_amount,Bumetanide (Bumex)_average_amount,Calcium Gluconate (Bolus)_average_amount,Amiodarone 450/250_average_amount
0,20013945,-1.000000,-1.000000,86.807051,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
1,20033924,49.999999,249.999990,41.508066,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,1.326531,-1.0,-1.0,-1.0,-1.0
2,20055820,-1.000000,-1.000000,76.050437,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
3,20070455,-1.000000,-1.000000,235.837797,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
4,20087467,49.999999,-1.000000,396.314100,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,29914435,-1.000000,-1.000000,118.181818,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
514,29923776,49.999999,499.999981,91.749820,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
515,29942526,-1.000000,-1.000000,142.857142,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0
516,29961069,-1.000000,-1.000000,125.000000,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0


In [6]:
import pandas as pd
from tqdm import tqdm

# Build one row per admission (hadm_id) holding the mean `value` of every
# output-event item, with columns named after the item's dictionary label.

# Load the output events of the test split and the item dictionary.
outputevents_test = pd.read_csv('data/test/outputevents_test.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# Item ids that actually occur in the output events.
unique_output_itemids = outputevents_test['itemid'].unique()

# Restrict the dictionary to those item ids.
filtered_output_d_items = d_items[d_items['itemid'].isin(unique_output_itemids)]

# Attach the label to every event row (plain pandas merge).
print("병합 중...")
merged_output_data = outputevents_test.merge(
    filtered_output_d_items[['itemid', 'label']], on='itemid', how='left'
)

# Mean value per (hadm_id, itemid).
print("평균 계산 중...")
average_values_hadm_output = (
    merged_output_data
    .groupby(['hadm_id', 'itemid'])['value']
    .mean()
    .reset_index()
    .rename(columns={'value': 'average_value'})
)

# Pivot to wide form: index = hadm_id, one column per itemid.
average_values_hadm_pivot_output = average_values_hadm_output.pivot(
    index='hadm_id', columns='itemid', values='average_value'
)

# -1 marks (hadm_id, itemid) combinations that have no mean value.
average_values_hadm_pivot_output = average_values_hadm_pivot_output.fillna(-1)

# itemid -> label lookup. Rows without a label are left out so that the
# fallback below applies to them as well.
labelled_output_d_items = filtered_output_d_items.dropna(subset=['label'])
output_itemid_to_label = dict(zip(labelled_output_d_items['itemid'], labelled_output_d_items['label']))

# Rename the itemid columns to "<label>_average_value". An itemid with no
# dictionary label keeps its numeric id in the column name instead of raising
# KeyError and aborting the whole cell.
print("컬럼 이름 변경 중...")
new_columns = [
    f"{output_itemid_to_label.get(int(col), int(col))}_average_value"
    for col in tqdm(average_values_hadm_pivot_output.columns)
]
average_values_hadm_pivot_output.columns = new_columns

# Turn hadm_id back into a regular column.
average_values_hadm_pivot_output = average_values_hadm_pivot_output.reset_index()

# Persist the feature table.
print("CSV 파일 저장 중...")
average_values_hadm_pivot_output.to_csv('data/test_preprocessed/average_values_per_item_per_admission_output_filtered.csv', index=False, encoding='utf-8')

# Display the final table.
average_values_hadm_pivot_output

병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 56/56 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Foley_average_value,Void_average_value,Condom Cath_average_value,Suprapubic_average_value,R Nephrostomy_average_value,L Nephrostomy_average_value,Straight Cath_average_value,Anderson (gastric)_average_value,Emesis_average_value,...,PACU Gastric_average_value,PACU Urine_average_value,Cath Lab_average_value,Pre-Admission_average_value,GU Irrigant Volume In_average_value,GU Irrigant/Urine Volume Out_average_value,TF Residual_average_value,TF Residual Output_average_value,Drainage Bag_average_value,Chest Tube #3_average_value
0,20013945,218.750000,310.869565,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,2700.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0
1,20033924,9.315789,100.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0
2,20055820,133.181818,209.750000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,200.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0
3,20070455,144.289100,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,30.0,25.0,0.833333,3.333333,-1.0,-1.0
4,20087467,213.953488,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,29914435,125.585859,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,575.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0
495,29923776,133.761111,-1.000000,191.428571,-1.0,-1.0,-1.0,-1.0,-1.0,30.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,5.000000,-1.000000,-1.0,-1.0
496,29942526,138.196970,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0
497,29961069,152.651163,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,-1.000000,-1.0,-1.0


In [7]:
# import pandas as pd

# # Load the datasets
# procedureevents_df = pd.read_csv('final project dataset/procedureevents_train.csv')
# d_items_df = pd.read_csv('final project dataset/dictionary/d_items.csv')

# # itemid에 맞는 label을 추가하기 위해 두 데이터프레임 병합
# merged_df = procedureevents_df.merge(d_items_df[['itemid', 'label']], on='itemid', how='left')

# # 각 hadm_id별 label별 발생 횟수를 세고, 결측값을 0으로 채우기
# count_df = merged_df.groupby(['hadm_id', 'label']).size().unstack(fill_value=0)

# # tqdm으로 평균 계산 과정을 표시
# # 각 hadm_id별로 label의 평균 횟수 계산
# average_count_df = pd.DataFrame()
# for hadm_id, data in tqdm(count_df.groupby('hadm_id'), desc="Calculating averages"):
#     average = data.mean()
#     average['hadm_id'] = hadm_id
#     average_count_df = pd.concat([average_count_df, average.to_frame().T], ignore_index=True)

# # 컬럼 이름에 '_average_count' 접미사 추가
# average_count_df.columns = ['hadm_id'] + [f"{label}_average_count" for label in average_count_df.columns[1:]]

# # CSV 파일로 저장
# average_count_df.to_csv('Feature importance data/average_procedure_count.csv', index=False, encoding='utf-8')

# # 결과 출력
# average_count_df.head()
import pandas as pd

# Build one row per admission (hadm_id) with the number of procedure events
# recorded for each procedure label.

# Load the procedure events of the test split and the item dictionary.
procedureevents_df = pd.read_csv('data/test/procedureevents_test.csv')
d_items_df = pd.read_csv('data/dictionary/d_items.csv')

# Attach each event's label via its itemid.
merged_df = procedureevents_df.merge(d_items_df[['itemid', 'label']], on='itemid', how='left')

# groupby drops rows whose key is NaN, so events whose itemid has no label in
# the dictionary would silently disappear from the counts. Use the itemid
# (as text, so column labels stay sortable) as the label for those rows.
merged_df['label'] = merged_df['label'].fillna(merged_df['itemid'].astype(str))

# Count events per (hadm_id, label) and spread labels into columns.
# Combinations that never occur are filled with -1 (not 0), the same sentinel
# the fillna(-1) feature tables in this notebook use.
count_df = merged_df.groupby(['hadm_id', 'label']).size().unstack(fill_value=-1)

# Suffix every label column with '_count'.
count_df.columns = [f"{label}_count" for label in count_df.columns]

# Turn hadm_id back into a regular column.
count_df = count_df.reset_index()

# Persist the feature table.
count_df.to_csv('data/test_preprocessed/procedure_count.csv', index=False, encoding='utf-8')

# Display the final table.
count_df


Unnamed: 0,hadm_id,14 Gauge_count,16 Gauge_count,18 Gauge_count,20 Gauge_count,22 Gauge_count,24 Gauge_count,AVA_count,Abdominal X-Ray_count,Angiography_count,...,Tunneled (Hickman) Line_count,Tunneled Access Line_count,Ultrasound_count,Unplanned Extubation (non-patient initiated)_count,Unplanned Extubation (patient-initiated)_count,Unplanned Line/Catheter Removal (Non-Patient initated)_count,Unplanned Line/Catheter Removal (Patient Initiated)_count,Urine Culture_count,Wound Culture_count,X-ray_count
0,20013945,-1,-1,1,1,2,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,20033924,-1,-1,-1,2,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,20055820,-1,-1,1,4,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,-1
3,20070455,-1,-1,1,4,-1,-1,-1,-1,-1,...,-1,-1,1,-1,-1,-1,-1,2,-1,1
4,20087467,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,29914435,-1,-1,1,4,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
514,29923776,-1,2,1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,2,-1,-1
515,29942526,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
516,29961069,-1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,-1,-1,-1,-1,-1,-1


In [8]:
import pandas as pd
from tqdm import tqdm

# Build one row per admission (hadm_id) holding the mean numeric `value` of
# every chart-event item, with columns named after the item's dictionary label.

# Load the chart events of the test split and the item dictionary.
chartevents_test = pd.read_csv('data/test/chartevents_test.csv')
d_items = pd.read_csv('data/dictionary/d_items.csv')

# Keep only rows whose 'value' parses as a finite number.
# errors='coerce' turns unparseable text into NaN. Unlike a digits-only string
# test, this also keeps negative readings and scientific notation, which are
# valid numbers and were previously discarded together with the text rows.
# NOTE(review): use the same parsing wherever the matching training features
# are built so both splits are filtered identically - TODO confirm.
parsed_value = pd.to_numeric(chartevents_test['value'], errors='coerce')
is_finite_number = parsed_value.notna() & parsed_value.abs().ne(float('inf'))

# Build a new frame rather than assigning into a filtered slice, which would
# trigger pandas' SettingWithCopyWarning.
chartevents_test = (
    chartevents_test.loc[is_finite_number]
    .assign(value=parsed_value.loc[is_finite_number].astype(float))
)

# Item ids that remain after the numeric filter.
unique_chartevent_itemids = chartevents_test['itemid'].unique()

# Restrict the dictionary to those item ids.
filtered_chartevent_d_items = d_items[d_items['itemid'].isin(unique_chartevent_itemids)]

# Attach the label to every event row. This is a plain pandas merge; no
# progress bar is involved, only the status message below.
print("병합 중...")
merged_chartevent_data = chartevents_test.merge(
    filtered_chartevent_d_items[['itemid', 'label']], on='itemid', how='left'
)

# Mean value per (hadm_id, itemid).
print("평균 계산 중...")
average_values_hadm_chartevent = (
    merged_chartevent_data
    .groupby(['hadm_id', 'itemid'])['value']
    .mean()
    .reset_index()
    .rename(columns={'value': 'average_value'})
)

# Pivot to wide form: index = hadm_id, one column per itemid.
average_values_hadm_pivot_chartevent = average_values_hadm_chartevent.pivot(
    index='hadm_id', columns='itemid', values='average_value'
)

# -1 marks (hadm_id, itemid) combinations that have no mean value.
# NOTE(review): with negative readings kept, a genuine mean of exactly -1 is
# indistinguishable from this sentinel - confirm that is acceptable downstream.
average_values_hadm_pivot_chartevent = average_values_hadm_pivot_chartevent.fillna(-1)

# itemid -> label lookup. Rows without a label are left out so that the
# fallback below applies to them as well.
labelled_chartevent_d_items = filtered_chartevent_d_items.dropna(subset=['label'])
chartevent_itemid_to_label = dict(zip(labelled_chartevent_d_items['itemid'], labelled_chartevent_d_items['label']))

# Rename the itemid columns to "<label>_average_value". An itemid with no
# dictionary label keeps its numeric id in the column name instead of raising
# KeyError and aborting the whole cell.
print("컬럼 이름 변경 중...")
new_columns = [
    f"{chartevent_itemid_to_label.get(int(col), int(col))}_average_value"
    for col in tqdm(average_values_hadm_pivot_chartevent.columns)
]
average_values_hadm_pivot_chartevent.columns = new_columns

# Turn hadm_id back into a regular column.
average_values_hadm_pivot_chartevent = average_values_hadm_pivot_chartevent.reset_index()

# Persist the feature table (utf-8 stated explicitly, like the sibling cells).
print("CSV 파일 저장 중...")
average_values_hadm_pivot_chartevent.to_csv('data/test_preprocessed/average_values_per_item_per_admission_chartevent_filtered.csv', index=False, encoding='utf-8')

# Display the final table.
average_values_hadm_pivot_chartevent


병합 중...
평균 계산 중...
컬럼 이름 변경 중...


100%|██████████| 614/614 [00:00<?, ?it/s]

CSV 파일 저장 중...





Unnamed: 0,hadm_id,Heart Rate_average_value,Heart rate Alarm - High_average_value,Heart Rate Alarm - Low_average_value,Arterial Blood Pressure systolic_average_value,Arterial Blood Pressure diastolic_average_value,Arterial Blood Pressure mean_average_value,Arterial Blood Pressure Alarm - Low_average_value,Arterial Blood Pressure Alarm - High_average_value,Pulmonary Artery Pressure systolic_average_value,...,Resistance Exp_average_value,Resistance Insp_average_value,Indwelling Urinary Catheter Care_average_value,Vaulables Checklist_average_value,Creatinine (whole blood)_average_value,Forehead SpO2 Sensor in Place_average_value,No Pressure Injury Present_average_value,PICC Biopatch_average_value,Dialysis/Pheresis Biopatch_average_value,CVL Biopatch_average_value
0,20013945,68.549618,120.000000,49.615385,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,20033924,77.858696,127.000000,54.000000,116.931818,60.090909,78.931818,90.000000,140.000000,36.244444,...,14.6,14.2,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,20055820,62.989691,120.000000,48.000000,120.661765,60.220588,81.941176,85.625000,155.000000,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,20070455,87.549521,120.000000,51.607143,110.594406,66.856643,84.859155,84.782609,157.826087,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,20087467,88.000000,120.000000,52.500000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,29914435,93.347107,124.615385,53.076923,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
516,29923776,85.570922,117.962963,57.407407,111.740741,60.185185,74.666667,89.473684,147.894737,45.352941,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
517,29942526,90.023438,120.000000,60.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
518,29961069,83.974359,120.000000,53.750000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [11]:
# Collapse icustays_test.csv so that each hadm_id appears on exactly one row.
icustays = pd.read_csv("data/test/icustays_test.csv")

# Total length of stay (los) over all ICU stays that share a hadm_id.
# Only hadm_id and the summed los are kept; any other column would need its
# own aggregation rule added here (e.g. column_name=("column_name", "first")).
stays_by_admission = icustays.groupby("hadm_id", as_index=False)
icustays_aggregated = stays_by_admission.agg(los=("los", "sum"))

# Write the aggregated table to its own file and report where it went.
icustays_aggregated.to_csv("data/test_preprocessed/icustays_aggregated.csv", index=False)
print("Aggregated icustays 데이터가 저장되었습니다: data/test_preprocessed/icustays_aggregated.csv")

# Show the result.
icustays_aggregated

Aggregated icustays 데이터가 저장되었습니다: data/test_preprocessed/icustays_aggregated.csv


Unnamed: 0,hadm_id,los
0,20013945,4.992164
1,20033924,3.120914
2,20055820,3.960266
3,20070455,11.632593
4,20087467,2.183646
...,...,...
515,29914435,4.936562
516,29923776,10.292060
517,29942526,5.430509
518,29961069,3.219549
