In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the merged comorbidity table.
# icd_code is read explicitly as text: left to type inference, purely numeric
# ICD-9 codes can be parsed as numbers (dropping leading zeros and yielding a
# mixed-type column), which would break the string-keyed lookups and the
# three-character prefix tests performed further down.
data = pd.read_csv('C:/MyProject/MIMIC-IPE/data/所有合并疾病.csv', dtype={'icd_code': str})

# Defensive: remove surrounding whitespace so padded codes still match the
# lookup keys. NOTE(review): harmless if the export is already unpadded.
data['icd_code'] = data['icd_code'].str.strip()

# Drop every column whose name ends in "_1"
# (presumably duplicates created by an earlier merge — confirm).
data = data.loc[:, ~data.columns.str.endswith('_1')]
# ICD-9 / ICD-10 codes of interest for the blood-related grouping.
# Codes are written without the decimal point and the ICD version is not
# distinguished.
codes_icd9 = ['28981', '2864']
codes_icd10 = ['D685', 'D6851', 'D6859', 'D6861', 'D683']

# Label -> codes correspondence used to build the lookup below.
# NOTE(review): these labels do not appear to match the usual ICD descriptions
# (e.g. 286.4 is normally von Willebrand disease and D68.61 antiphospholipid
# syndrome) — verify against an ICD code table before relying on the names.
# NOTE(review): lookup is by exact code, so a parent key such as 'D683' only
# matches rows that hold exactly that code, not its sub-codes.
blood_code_groups = (
    ('Primary Thrombocytopenia', ('28981',)),
    ('Disseminated Intravascular Coagulation (DIC)', ('2864',)),
    ('Thrombotic Thrombocytopenic Purpura', ('D685', 'D6851', 'D6859')),
    ('Other Coagulation Defects', ('D6861',)),
    ('Antiphospholipid Syndrome', ('D683',)),
)

# Flatten the groups into a code -> disease-name dictionary.
icd_mapping = {
    code: disease
    for disease, codes in blood_code_groups
    for code in codes
}

# Translate matching codes to their label; every other value passes through
# unchanged at this point.
blood_labels = data['icd_code'].replace(icd_mapping)

# True where the translated value is one of the known labels.
seriesObj = blood_labels.isin(icd_mapping.values())

# Anything that is not a known label becomes the placeholder.
data['Blood-related diseases'] = blood_labels.where(seriesObj, other='No Blood-related disease')


# Cardiovascular / metabolic comorbidities: ICD-9 and ICD-10 keys for the same
# disease are merged into a single lookup.
#
# Keys are treated as code PREFIXES, not exact codes. Several keys are
# three-character categories ('435', 'I63', 'G45', 'I25', 'I50', 'I48', 'E11');
# with exact matching, a full-length code such as 'I639' or 'E119' would never
# be recognised and the row would fall through to the placeholder.
# NOTE(review): assumes icd_code holds full, dotless codes, as the five-character
# keys used in this notebook suggest — confirm against the CSV.
icd_mapping = {
    '43411': 'Previous Stroke',
    'I63': 'Previous Stroke',
    '435': 'TIA',
    'G45': 'TIA',
    '4019': 'Hypertension',
    'I10': 'Hypertension',
    '41401': 'CAD',
    'I25': 'CAD',
    '4439': 'PAD',
    'I739': 'PAD',
    '4280': 'Heart failure',
    'I50': 'Heart failure',
    '42731': 'Atrial fibrillation',
    'I48': 'Atrial fibrillation',
    '25000': 'Diabetes',
    'E11': 'Diabetes'
}

no_cardio_label = 'No Cardiovascular and metabolic diseases'

# Compare on text; missing codes become the string 'nan', which matches no key.
icd_codes = data['icd_code'].astype(str).str.strip()

# Start from the placeholder and overwrite wherever a key prefix matches.
cardio_labels = pd.Series(no_cardio_label, index=data.index, dtype=object)

# Shorter prefixes are applied first so that, should two keys ever overlap,
# the longer (more specific) key has the final say.
for prefix in sorted(icd_mapping, key=len):
    cardio_labels = cardio_labels.mask(
        icd_codes.str.startswith(prefix, na=False), icd_mapping[prefix]
    )

# Boolean mask of rows that received a real label (name kept from the original cell).
seriesObj = cardio_labels.ne(no_cardio_label)

data['Cardiovascular and metabolic diseases'] = cardio_labels

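'''
Tumor codes are grouped by the first three characters of the ICD code:
Lung_Tumor: ICD-9 162 (lung tumors) and ICD-10 C34 (all lung-tumor subcategories).
Other_Solid_Tumor: ICD-9 140-199, i.e. the broad 140-209 solid-tumor range without the
    blood/lymphatic tumors (200-209) and without lung (162); ICD-10 C00-D48 without the
    lymphoid/haematopoietic tumors (C81-C96) and without lung (C34).
Non_Solid_Tumor: ICD-9 200-209 (blood and lymphatic tumors such as lymphoma and leukemia)
    and ICD-10 C81-C96 (the same group: the various lymphomas and leukemias).
The mapping function below additionally separates breast, colorectal, prostate and
stomach cancer from the other solid tumors.
'''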
# Range test on the three-character category prefix of an ICD code.
def icd_range(icd_code, start, end):
    """Return True when the first three characters of ``icd_code`` lie in ``[start, end]``.

    The comparison is a plain lexicographic string comparison, so ``start`` and
    ``end`` are expected to be three-character category codes (e.g. '140', 'C34').

    Parameters
    ----------
    icd_code : str or scalar
        ICD code without decimal point. Missing values (None / NaN / pd.NA)
        never match; other non-string scalars are compared via ``str()``.
    start, end : str
        Inclusive bounds of the category range.

    Returns
    -------
    bool
    """
    if not isinstance(icd_code, str):
        # The original indexed the value directly and raised TypeError on
        # missing codes; treat them as "not in range" instead.
        if pd.isna(icd_code):
            return False
        icd_code = str(icd_code)
    prefix = icd_code.strip()[:3]
    return start <= prefix <= end


# Ordered (label, ((start, end), ...)) rules; the first matching rule wins.
# Order matters: the specific cancers and the non-solid group must be tested
# before the broad Other_Solid_Tumor ranges, which lexically contain them
# (in particular C81-C96 lies inside C35-D48).
_TUMOR_ICD_RULES = (
    ('Lung_Tumor', (('162', '162'), ('C34', 'C34'))),
    ('Breast_Cancer', (('174', '174'), ('C50', 'C50'))),
    ('Colorectal_Cancer', (('153', '154'), ('C18', 'C21'))),
    ('Prostate_Cancer', (('185', '185'), ('C61', 'C61'))),
    ('Stomach_Cancer', (('151', '151'), ('C16', 'C16'))),
    ('Non_Solid_Tumor', (('200', '209'), ('C81', 'C96'))),
    # The original also tested the range '210'-'209', which is empty and could
    # never match; it is dropped. NOTE(review): if ICD-9 210-239 was intended,
    # add it here deliberately.
    ('Other_Solid_Tumor', (('140', '199'), ('C00', 'C33'), ('C35', 'D48'))),
)


# Map an ICD code to its tumor category label.
def map_icd_to_disease(icd_code):
    """Classify an ICD-9 / ICD-10 code into a tumor category.

    Parameters
    ----------
    icd_code : str or scalar
        ICD code without decimal point; the ICD version is not distinguished.

    Returns
    -------
    str
        One of 'Lung_Tumor', 'Breast_Cancer', 'Colorectal_Cancer',
        'Prostate_Cancer', 'Stomach_Cancer', 'Non_Solid_Tumor',
        'Other_Solid_Tumor', or 'No tumor disease' when no rule matches
        (including missing codes).
    """
    for disease, ranges in _TUMOR_ICD_RULES:
        if any(icd_range(icd_code, low, high) for low, high in ranges):
            return disease
    return 'No tumor disease'

    
    
    
# Overwrite icd_code with the tumor category computed for each code.
data['icd_code'] = data['icd_code'].map(map_icd_to_disease)

In [3]:
# The icd_code column now holds tumor categories, so rename it to tumor_disease.
data = data.rename(columns={'icd_code': 'tumor_disease'})

In [4]:
# Snapshot of the current column names as a one-column DataFrame (for inspection).
columns = pd.DataFrame(data.columns)

# Keep only the two identifier columns and the three derived disease columns.
kept_columns = [
    'subject_id',
    'hadm_id',
    'tumor_disease',
    'Blood-related diseases',
    'Cardiovascular and metabolic diseases',
]
data = data.loc[:, kept_columns]


In [5]:
# 将data中的行按照subject_id和hadm_id列中相同的值合并，如果tumor_disease列中一个id下对应的值全部为No tumor disease，则合并后的值为No tumor disease，否则为tumor_disease列中的任意非No tumor disease值。如果Blood-related diseases列中一个id下对应的值全部为No Blood-related disease，则合并后的值为No Blood-related disease，否则为Blood-related diseases列中的任意非No Blood-related disease值。如果Cardiovascular and metabolic diseases列中一个id下对应的值全部为No Cardiovascular and metabolic diseases，则合并后的值为No Cardiovascular and metabolic diseases，否则为Cardiovascular and metabolic diseases列中的任意非No Cardiovascular and metabolic diseases值。


# Custom aggregation function
def aggregate_disease(values, no_disease_label):
    """Collapse a group of disease labels into a single label.

    Parameters
    ----------
    values : iterable
        Labels belonging to one group (e.g. a pandas Series handed over by
        ``groupby(...).agg``). Consumed in a single pass, so one-shot
        iterators are handled correctly as well.
    no_disease_label : str
        Placeholder meaning "no disease in this category".

    Returns
    -------
    The first element of ``values`` that differs from ``no_disease_label``, or
    ``no_disease_label`` when every element equals it (or ``values`` is empty).
    """
    # A single scan is enough: the default of next() already covers the
    # "all placeholders" case. The previous all(...) pre-check iterated the
    # input a second time, which loses elements when `values` is an iterator.
    return next(
        (value for value in values if value != no_disease_label),
        no_disease_label,
    )

# Collapse the table to one row per (subject_id, hadm_id) pair. Each disease
# column is reduced with aggregate_disease using that column's placeholder.
no_disease_labels = {
    'tumor_disease': 'No tumor disease',
    'Blood-related diseases': 'No Blood-related disease',
    'Cardiovascular and metabolic diseases': 'No Cardiovascular and metabolic diseases',
}

id_groups = data.groupby(['subject_id', 'hadm_id'])

# `label=label` binds each placeholder at definition time; without it every
# lambda would see the last placeholder of the loop.
data = id_groups.agg({
    column: (lambda x, label=label: aggregate_disease(x, label))
    for column, label in no_disease_labels.items()
}).reset_index()




In [6]:
# Constant helper column: every row counts once when summed in the pivots.
data['count'] = 1

# Settings shared by the three pivots: rows are keyed by (subject_id, hadm_id),
# cells hold the summed count, and absent combinations are filled with 0.
pivot_options = dict(
    index=['subject_id', 'hadm_id'],
    values='count',
    aggfunc='sum',
    fill_value=0,
)

# One pivot per disease column; the labels of that column become the new
# columns. The index is reset so subject_id / hadm_id are ordinary columns.
blood_related_pivot = data.pivot_table(columns='Blood-related diseases', **pivot_options).reset_index()
cardiovascular_metabolic_pivot = data.pivot_table(columns='Cardiovascular and metabolic diseases', **pivot_options).reset_index()
tumor_pivot = data.pivot_table(columns='tumor_disease', **pivot_options).reset_index()

In [7]:

# Attach the three pivot tables to data on (subject_id, hadm_id).
# Left joins keep exactly the rows of data as the baseline.
for pivot_table in (blood_related_pivot, cardiovascular_metabolic_pivot, tumor_pivot):
    data = data.merge(pivot_table, on=['subject_id', 'hadm_id'], how='left')



In [8]:
# Write the combined table to anti_combine.csv, without the row index.
data.to_csv(
    path_or_buf='C:/MyProject/MIMIC-IPE/data/anti_combine.csv',
    index=False,
)