In [3]:
import pandas as pd
import numpy as np
import os

In [65]:
# Все параметры для расчета средних
PARAMS = [
    'Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol',
    'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT',
    'HR', 'K', 'Lactate', 'Mg', 'MAP', 'MechVent', 'Na', 'NIDiasABP',
    'NIMAP', 'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate',
    'SaO2', 'SysABP', 'Temp', 'TropI', 'TropT', 'Urine', 'WBC', 'Weight'
]

In [66]:
df = pd.read_csv('./set/132539.txt')

record = {}

static = df[df['Time'] == '00:00']

dynamic = df[df['Time'] != '00:00']
record['RecordID'] = int(static[static['Parameter'] == 'RecordID']['Value'].iloc[0])

for param in PARAMS:
    values = dynamic[dynamic['Parameter'] == param]['Value']

    # print("param", param)
    # print(values)
    if not values.empty:
        record[f'Mean{param}'] = pd.to_numeric(values, errors='coerce').mean()
    else:
        record[f'Mean{param}'] = np.nan
print(record)
# print(dynamic)

{'RecordID': 132539, 'MeanAlbumin': nan, 'MeanALP': nan, 'MeanALT': nan, 'MeanAST': nan, 'MeanBilirubin': nan, 'MeanBUN': np.float64(10.5), 'MeanCholesterol': nan, 'MeanCreatinine': np.float64(0.75), 'MeanDiasABP': nan, 'MeanFiO2': nan, 'MeanGCS': np.float64(14.923076923076923), 'MeanGlucose': np.float64(160.0), 'MeanHCO3': np.float64(27.0), 'MeanHCT': np.float64(32.5), 'MeanHR': np.float64(70.8108108108108), 'MeanK': np.float64(4.2), 'MeanLactate': nan, 'MeanMg': np.float64(1.7), 'MeanMAP': nan, 'MeanMechVent': nan, 'MeanNa': np.float64(136.5), 'MeanNIDiasABP': np.float64(50.14705882352941), 'MeanNIMAP': np.float64(71.55911764705883), 'MeanNISysABP': np.float64(114.38235294117646), 'MeanPaCO2': nan, 'MeanPaO2': nan, 'MeanpH': nan, 'MeanPlatelets': np.float64(203.0), 'MeanRespRate': np.float64(17.428571428571427), 'MeanSaO2': nan, 'MeanSysABP': nan, 'MeanTemp': np.float64(37.357142857142854), 'MeanTropI': nan, 'MeanTropT': nan, 'MeanUrine': np.float64(171.05263157894737), 'MeanWBC': np

In [78]:
def process_files(data_folder, outcomes_file):
    """Основная функция обработки"""
    # Загрузка
    outcomes = pd.read_csv(outcomes_file) if os.path.exists(outcomes_file) else None
    
    results = []
    
    # Обработка каждого файла пациента
    for file in os.listdir(data_folder):
        if not file.endswith('.txt'):
            continue
            
        file_path = os.path.join(data_folder, file)
        df = pd.read_csv(file_path)
        
        # Статические данные (время 00:00)
        static = df[df['Time'] == '00:00']
        
        # Создание записи
        record = {}
        
        # Базовые поля
        record['RecordID'] = int(static[static['Parameter'] == 'RecordID']['Value'].iloc[0])
        record['Age'] = int(static[static['Parameter'] == 'Age']['Value'].iloc[0])
        record['Gender'] = int(static[static['Parameter'] == 'Gender']['Value'].iloc[0])
        record['Height'] = float(static[static['Parameter'] == 'Height']['Value'].iloc[0])
        record['ICUType'] = float(static[static['Parameter'] == 'ICUType']['Value'].iloc[0])
        
        
        # Средние значения параметров
        dynamic = df[df['Time'] != '00:00']
        
        for param in PARAMS:
            values = dynamic[dynamic['Parameter'] == param]['Value']

            if not values.empty:
                record[f'Mean{param}'] = pd.to_numeric(values, errors='coerce').mean()
            else:
                record[f'Mean{param}'] = np.nan
        

        
        # Добавление исходов
        if outcomes is not None:
            patient_outcome = outcomes[outcomes['RecordID'] == record['RecordID']]
            if not patient_outcome.empty:
                record['Survival'] = float(patient_outcome['Survival'].iloc[0])
                record['In_hospital_death'] = float(patient_outcome['In-hospital_death'].iloc[0])
        

        results.append(record)
    
    # Создание DataFrame
    result_df = pd.DataFrame(results)
    
    # Порядок колонок
    columns = ['RecordID', 'Age', 'Gender', 'Height', 'ICUType']
    columns.extend([f'Mean{p}' for p in PARAMS])
    
    # Оставляем только существующие колонки
    existing_cols = [c for c in columns if c in result_df.columns]
    result_df = result_df[existing_cols]
    
    return result_df

In [77]:
# Использование
data_folder = "./set"
outcomes_file = "./Outcomes.txt"
# data_folder = "./../set"
# outcomes_file = "./../Outcomes_для_С1.txt"

# Обработка
df = process_files(data_folder, outcomes_file)

# Сохранение
df.to_csv('patients_summary.csv', index=False, float_format='%.10g')

print(f"Создан файл: patients_summary.csv")
print(f"Записей: {len(df)}")
print(f"Колонок: {len(df.columns)}")
print("\nПервые строки:")
print(df.head().to_string())

Создан файл: patients_summary.csv
Записей: 100
Колонок: 42

Первые строки:
   RecordID  Age  Gender  Height  ICUType  MeanAlbumin  MeanALP  MeanALT  MeanAST  MeanBilirubin    MeanBUN  MeanCholesterol  MeanCreatinine  MeanDiasABP  MeanFiO2    MeanGCS  MeanGlucose   MeanHCO3    MeanHCT     MeanHR  MeanK  MeanLactate    MeanMg    MeanMAP  MeanMechVent      MeanNa  MeanNIDiasABP  MeanNIMAP  MeanNISysABP  MeanPaCO2    MeanPaO2  MeanpH  MeanPlatelets  MeanRespRate   MeanSaO2  MeanSysABP   MeanTemp  MeanTropI  MeanTropT   MeanUrine    MeanWBC  MeanWeight
0    132539   54     0.0    -1.0      4.0          NaN      NaN      NaN      NaN            NaN  10.500000              NaN        0.750000          NaN       NaN  14.923077   160.000000  27.000000  32.500000  70.810811   4.20          NaN  1.700000        NaN           NaN  136.500000      50.147059  71.559118    114.382353        NaN         NaN     NaN     203.000000     17.428571        NaN         NaN  37.357143        NaN        NaN  1