In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook are rooted under that mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [10]:
import pandas as pd

base = "/content/drive/MyDrive/DILAB/MARS/mimiciv_3.1/files/hosp"

# 1) Load the diagnosis, service, and ICD dictionary tables.
# ICD codes are identifiers, not numbers. Reading them explicitly as strings
# keeps any leading zeros and guarantees that the `icd_code` merge key has the
# same dtype in both tables (type inference could otherwise parse digit-only
# codes as integers and silently break the dictionary lookup in step 3).
diagnoses = pd.read_csv(f'{base}/diagnoses_icd.csv.gz', dtype={'icd_code': str})
services = pd.read_csv(f'{base}/services.csv.gz')
dicd = pd.read_csv(f'{base}/d_icd_diagnoses.csv.gz', dtype={'icd_code': str})

# 2) Attach the service to every diagnosis row, matched per admission.
# NOTE(review): if `services` holds more than one row for the same
# (subject_id, hadm_id), this left join repeats each diagnosis row once per
# service row, which inflates downstream counts - confirm that is intended.
diag_serv = diagnoses.merge(
    services[['subject_id', 'hadm_id', 'curr_service']],
    on=['subject_id', 'hadm_id'],
    how='left'
)

# 3) Attach the diagnosis title text to each ICD code.
# The key is (icd_code, icd_version); codes missing from the dictionary keep
# their row (left join) with a missing long_title.
diag_serv_name = diag_serv.merge(
    dicd,
    on=['icd_code', 'icd_version'],
    how='left'
)

diag_serv_name.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,curr_service,long_title
0,10000032,22595853,1,5723,9,MED,Portal hypertension
1,10000032,22595853,2,78959,9,MED,Other ascites
2,10000032,22595853,3,5715,9,MED,Cirrhosis of liver without mention of alcohol
3,10000032,22595853,4,7070,9,MED,Unspecified viral hepatitis C without hepatic ...
4,10000032,22595853,5,496,9,MED,"Chronic airway obstruction, not elsewhere clas..."


In [12]:
# Map each diagnosis name to a service.
# Rows with seq_num == 1 are the primary diagnosis; keep those and drop the rest.
primary = diag_serv_name.loc[diag_serv_name['seq_num'].eq(1)].copy()

# Count how often each service appears for every primary-diagnosis name.
service_count = (
    primary.groupby(by=['long_title', 'curr_service'])
    .size()
    .rename('count')
    .reset_index()
)

# For every diagnosis name keep the service with the highest count:
# order by name, then by descending count, and retain the first row per name.
top_service = (
    service_count
    .sort_values(by=['long_title', 'count'], ascending=[True, False])
    .drop_duplicates(subset='long_title', keep='first')
    .reset_index(drop=True)
)

top_service.head()


Unnamed: 0,long_title,curr_service,count
0,(Idiopathic) normal pressure hydrocephalus,NSURG,24
1,(Induced) termination of pregnancy with other ...,GYN,1
2,(Induced) termination of pregnancy with unspec...,OBS,1
3,2-part displaced fracture of surgical neck of ...,MED,2
4,2-part displaced fracture of surgical neck of ...,MED,1


In [16]:
# Primary diagnoses only (seq_num == 1).
primary = diag_serv_name.loc[diag_serv_name['seq_num'].eq(1)]

# Number of primary-diagnosis rows for every (service, diagnosis name) pair.
dept_top = (
    primary.groupby(by=['curr_service', 'long_title'])
    .size()
    .rename('count')
    .reset_index()
)

# Show only the 10 pairs with the highest count across all services.
top10_global = dept_top.sort_values(by='count', ascending=False).iloc[:10]

print(top10_global.to_string(index=False))


curr_service                                         long_title  count
         MED                            Chest pain, unspecified   9443
        OMED          Encounter for antineoplastic chemotherapy   5470
         MED                                   Other chest pain   5385
         MED                  Acute kidney failure, unspecified   5250
         MED                         Alcohol abuse, unspecified   4576
         MED        Urinary tract infection, site not specified   4491
         MED                       Sepsis, unspecified organism   4361
        CMED Coronary atherosclerosis of native coronary artery   4164
         MED                               Syncope and collapse   3490
         MED       Alcohol abuse with intoxication, unspecified   3438


In [7]:
# Top 10 diagnosis names for each service.
# GroupBy.head(n) returns the first n rows of each group in the frame's current
# row order, so the rows are ranked by descending count within each service
# first. Without the sort, the result depends on however dept_top happens to be
# ordered (by its group keys when it comes straight from groupby(...).size()),
# not on frequency.
top10_by_dept = (
    dept_top.sort_values(['curr_service', 'count'], ascending=[True, False])
            .groupby('curr_service')
            .head(10)  # the 10 most frequent diagnosis names per service
)

top10_by_dept


Unnamed: 0,curr_service,long_title,count
541,CMED,Coronary atherosclerosis of native coronary ar...,4164
1163,CMED,Non-ST elevation (NSTEMI) myocardial infarction,2692
1865,CMED,"Subendocardial infarction, initial episode of ...",2577
265,CMED,Atrial fibrillation,2356
844,CMED,Hypertensive heart and chronic kidney disease ...,2345
...,...,...,...
32274,VSURG,"Abdominal aortic aneurysm, without rupture",410
32271,VSURG,Abdominal aneurysm without mention of rupture,394
33110,VSURG,Type 2 diabetes mellitus with diabetic periphe...,333
33111,VSURG,Type 2 diabetes mellitus with diabetic periphe...,330
