In [1]:
import pandas as pd

# 从 CSV 文件加载 ICD-9 数据
diagnoses = pd.read_csv('hosp/diagnoses_icd.csv.gz')

stroke_icd9_codes = ['43491', '431', '4359']  # 中风相关ICD-9代码

# 筛选卒中患者
stroke_data = diagnoses[diagnoses['icd_code'].astype(str).isin(stroke_icd9_codes)]

# 加载生命体征或住院数据（假设存在 hospital_stay 和 blood_pressure 等列）
patient_data = pd.read_csv('hosp/patients.csv.gz')

# 将诊断数据与其他患者数据关联
merged_data = pd.merge(stroke_data, patient_data, on='subject_id', how='inner')
print(merged_data.head())


   subject_id   hadm_id  seq_num icd_code  icd_version gender  anchor_age  \
0    10025463  24470193        1      431            9      M          66   
1    10032725  20611640        2      431            9      F          38   
2    10017492  27672872        1    43491            9      M          84   
3    10017492  27417763        3      431            9      M          84   
4    10004733  27411876        2    43491            9      M          51   

   anchor_year anchor_year_group         dod  
0         2136       2011 - 2013  2137-10-09  
1         2143       2011 - 2013  2143-03-30  
2         2114       2011 - 2013  2116-07-05  
3         2114       2011 - 2013  2116-07-05  
4         2174       2014 - 2016         NaN  


In [2]:
# 加载d_items表，查看所有监测数据项的item_id和描述
d_items = pd.read_csv('icu/d_items.csv.gz')

# 查找与血压相关的监测项
blood_pressure_items = d_items[d_items['label'].str.contains('blood pressure', case=False)]
print(blood_pressure_items)

      itemid                                     label         abbreviation  \
96    227539           ART Blood Pressure Alarm Source  ART BP Alarm Source   
2055  220058      Arterial Blood Pressure Alarm - High     ABP Alarm - High   
2057  220056       Arterial Blood Pressure Alarm - Low      ABP Alarm - Low   
2061  223752   Non-Invasive Blood Pressure Alarm - Low      NBP Alarm - Low   
2073  227538            ART Blood Pressure Alarm - Low   ART BP Alarm - Low   
2074  227537           ART Blood Pressure Alarm - High  ART BP Alarm - High   
2078  223751  Non-Invasive Blood Pressure Alarm - High     NBP Alarm - High   
2566  220052              Arterial Blood Pressure mean                 ABPm   
2568  227242     Manual Blood Pressure Diastolic Right         Manual BPd R   
2569  220180     Non Invasive Blood Pressure diastolic                 NBPd   
2571  227243      Manual Blood Pressure Systolic Right         Manual BPs R   
2572  220051         Arterial Blood Pressure diastol

In [3]:
import pandas as pd

# 加载 chartevents 表
chartevents = pd.read_csv('icu/chartevents.csv.gz')

# 筛选与血压相关的数据（使用 item_id）
blood_pressure_ids = [220050, 220051, 220052, 220179, 220180, 220181]
bp_data = chartevents[chartevents['itemid'].isin(blood_pressure_ids)]

# 查看前几行数据
print(bp_data.head())


     subject_id   hadm_id   stay_id  caregiver_id            charttime  \
40     10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
65     10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
96     10005817  20626031  32604416        6770.0  2132-12-16 00:00:00   
107    10005817  20626031  32604416        6770.0  2132-12-16 01:00:00   
113    10005817  20626031  32604416        6770.0  2132-12-16 01:00:00   

40   2132-12-16 00:02:00  220051    37      37.0     mmHg      0.0  
65   2132-12-16 00:02:00  220052    58      58.0     mmHg      0.0  
96   2132-12-16 00:02:00  220050   117     117.0     mmHg      0.0  
107  2132-12-16 01:04:00  220052    63      63.0     mmHg      0.0  
113  2132-12-16 01:04:00  220051    40      40.0     mmHg      0.0  


In [4]:
# 假设 stroke_data 已经从 diagnoses_icd.csv 中筛选出来
merged_data = pd.merge(stroke_data, bp_data, on=['subject_id', 'hadm_id'], how='inner')

# 查看合并后的数据
print(merged_data.head())


   subject_id   hadm_id  seq_num icd_code  icd_version   stay_id  \
0    10025463  24470193        1      431            9  38275267   
1    10025463  24470193        1      431            9  38275267   
2    10025463  24470193        1      431            9  38275267   
3    10025463  24470193        1      431            9  38275267   
4    10025463  24470193        1      431            9  38275267   

   caregiver_id            charttime            storetime  itemid value  \
0       34766.0  2137-10-09 03:00:00  2137-10-09 03:27:00  220052   102   
1       34766.0  2137-10-09 03:00:00  2137-10-09 03:27:00  220051    80   
2       34766.0  2137-10-09 03:00:00  2137-10-09 03:27:00  220050   134   
3       34766.0  2137-10-09 04:00:00  2137-10-09 04:28:00  220050   144   
4       34766.0  2137-10-09 04:00:00  2137-10-09 04:28:00  220051    82   

0     102.0     mmHg      0.0  
1      80.0     mmHg      0.0  
2     134.0     mmHg      0.0  
3     144.0     mmHg      0.0  
4      82.0 

In [5]:
# 将 'value' 列转换为浮点数类型，无法转换的值将被设置为 NaN
merged_data['value'] = pd.to_numeric(merged_data['value'], errors='coerce')

# 查看转换后的数据类型和前几行
print(merged_data['value'].dtype)
print(merged_data['value'].head())


int64
0    102
1     80
2    134
3    144
4     82
Name: value, dtype: int64


In [6]:
# 填充缺失的血压数据（以均值填充）
merged_data['value'].fillna(merged_data['value'].mean(), inplace=True)

# 筛选血压低于90的患者群体（假设‘value’表示收缩压）
high_risk_group = merged_data[merged_data['value'] < 90]

# 查看高危患者数据
print(high_risk_group.head())


    subject_id   hadm_id  seq_num icd_code  icd_version   stay_id  \
1     10025463  24470193        1      431            9  38275267   
4     10025463  24470193        1      431            9  38275267   
8     10025463  24470193        1      431            9  38275267   
9     10025463  24470193        1      431            9  38275267   
12    10025463  24470193        1      431            9  38275267   

    caregiver_id            charttime            storetime  itemid  value  \
1        34766.0  2137-10-09 03:00:00  2137-10-09 03:27:00  220051     80   
4        34766.0  2137-10-09 04:00:00  2137-10-09 04:28:00  220051     82   
8        34766.0  2137-10-09 06:00:00  2137-10-09 06:03:00  220051     70   
9        51676.0  2137-10-09 07:00:00  2137-10-09 07:13:00  220051     71   
12       51676.0  2137-10-09 08:00:00  2137-10-09 08:01:00  220051     70   

1       80.0     mmHg      0.0  
4       82.0     mmHg      0.0  
8       70.0     mmHg      0.0  
9       71.0     mmHg  

In [7]:
print(high_risk_group.columns)


Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version',
       'stay_id', 'caregiver_id', 'charttime', 'storetime', 'itemid', 'value',
      dtype='object')


In [8]:
# 加载 admissions 数据表
admissions = pd.read_csv('hosp/admissions.csv.gz')

# 合并 admissions 表中的存活/死亡状态（hospital_expire_flag）
merged_data = pd.merge(high_risk_group, admissions[['subject_id', 'hadm_id', 'hospital_expire_flag']], 
                       on=['subject_id', 'hadm_id'], how='inner')

# 将 hospital_expire_flag 作为目标列
merged_data = merged_data.rename(columns={'hospital_expire_flag': 'target'})

# 确认合并后的数据是否有 'target' 列
print(merged_data.columns)


Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version',
       'stay_id', 'caregiver_id', 'charttime', 'storetime', 'itemid', 'value',
      dtype='object')


In [9]:
from sklearn.preprocessing import StandardScaler

# 对血压值进行标准化
scaler = StandardScaler()
high_risk_group['value'] = scaler.fit_transform(high_risk_group[['value']])

# 划分特征和目标列
X = merged_data.drop('target', axis=1)
y = merged_data['target']


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 初始化并训练随机森林模型
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 进行预测
y_pred = model.predict(X_test)

# 打印混淆矩阵和分类报告
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


ValueError: could not convert string to float: '2143-03-22 13:00:00'