Import Data:

In [39]:
import pandas as pd
import numpy as np
import datetime
pd.set_option('display.max_columns', 999)
import pandas.io.sql as psql
# plot a figure directly on Notebook
import matplotlib.pyplot as plt

In [41]:
# Read the admissions CSV and keep its subject_id column as a plain list.
admission_table = pd.read_csv("data/ADMISSIONS.csv")
subject_ids = admission_table['subject_id'].tolist()

# Quick sanity check: (number of rows, number of columns).
print(admission_table.shape)

(129, 19)


In [79]:
# Keep one row per patient: the first row seen for each subject_id.
#
# Rows are taken in file order, so this is each patient's *first admission*
# only if ADMISSIONS.csv lists a patient's admissions chronologically
# -- TODO confirm, or sort by 'admittime' before de-duplicating.
#
# Any later row whose subject_id has already been seen is a repeat admission.
# Whether it counts as a readmission still depends on the time frame: an
# admission more than 30 days after the previous one starts a new set of
# admissions to consider.
#
# drop_duplicates does this in a single pass and keeps the original column
# dtypes; the previous row-by-row pd.concat inside iterrows() was quadratic
# and turned every column into object dtype.
first_admission_dataframe = (
    admission_table
    .drop_duplicates(subset='subject_id', keep='first')
    .reset_index(drop=True)  # 0..n-1 index, same as concat(ignore_index=True)
)

print(first_admission_dataframe['admittime'])

0     2164-10-23 21:09:00
1     2126-08-14 22:32:00
2     2125-10-04 23:36:00
3     2149-05-26 17:19:00
4     2163-05-14 20:43:00
             ...         
95    2112-05-04 08:00:00
96    2178-05-14 20:29:00
97    2123-11-24 14:14:00
98    2180-07-19 06:55:00
99    2170-12-15 03:14:00
Name: admittime, Length: 100, dtype: object


In [73]:
# Show the installed pandas version.
print(pd.__version__)

2.0.3


In [80]:
# If there are three admissions back to back, each less than 30 days apart

Important features:

    Admissions table
        diagnosis
        admission_type

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

def _read_mimic_table(mimic_data_path, table_name):
    """Load ``<mimic_data_path>/<table_name>.csv`` with upper-cased column names."""
    table = pd.read_csv(f"{mimic_data_path}/{table_name}.csv")
    # Upper-case the headers so exports with lower-case column names
    # (e.g. 'subject_id') work with the upper-case names used below.
    table.columns = table.columns.str.upper()
    return table


def _label_readmissions(admissions, time_window):
    """Return a copy of ``admissions`` with a 0/1 ``READMITTED`` column added."""
    stays = admissions.copy()
    stays['ADMITTIME'] = pd.to_datetime(stays['ADMITTIME'])
    stays['DISCHTIME'] = pd.to_datetime(stays['DISCHTIME'])

    # Order each patient's admissions in time so shift(-1) yields the next one.
    stays = stays.sort_values(['SUBJECT_ID', 'ADMITTIME']).reset_index(drop=True)
    stays['NEXT_ADMITTIME'] = stays.groupby('SUBJECT_ID')['ADMITTIME'].shift(-1)

    gap = stays['NEXT_ADMITTIME'] - stays['DISCHTIME']
    stays['DAYS_TO_NEXT_READMISSION'] = gap.dt.total_seconds() / (24 * 60 * 60)

    # A patient's last admission has no next admission (NaN gap); the
    # comparison is then False, so it is labelled 0 (not readmitted).
    stays['READMITTED'] = (stays['DAYS_TO_NEXT_READMISSION'] <= time_window).astype(int)
    return stays


def determine_readmissions(mimic_data_path, time_window=30):
    """Train a logistic-regression readmission classifier on MIMIC-III CSV files.

    An admission is labelled ``READMITTED = 1`` when the same ``SUBJECT_ID``
    has a following admission whose ``ADMITTIME`` is at most ``time_window``
    days after this admission's ``DISCHTIME``; otherwise 0.

    Labels are computed on ADMISSIONS alone, before any join. The previous
    version inner-joined DIAGNOSES_ICD first; assuming that table holds one
    row per diagnosis code, the join repeats every admission, so "next row"
    was frequently the same admission and produced false positives. None of
    the model features come from DIAGNOSES_ICD, so it is no longer read.

    Parameters
    ----------
    mimic_data_path : str or path-like
        Directory containing ``ADMISSIONS.csv`` and ``PATIENTS.csv``.
    time_window : int or float, default 30
        Maximum number of days between discharge and the next admission for
        the admission to count as followed by a readmission.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy, feature_importance)`` where
        ``feature_importance`` maps each model input column to the absolute
        value of its logistic-regression coefficient.

    Raises
    ------
    KeyError
        If a required column is missing (including ``DOB`` when no ``AGE``
        column is present to use directly).
    pandas.errors.MergeError
        If PATIENTS contains more than one row for a ``SUBJECT_ID``.
    """
    # Load necessary tables from MIMIC-III dataset
    admissions = _read_mimic_table(mimic_data_path, "ADMISSIONS")
    patients = _read_mimic_table(mimic_data_path, "PATIENTS")

    # Label first, while there is exactly one row per admission.
    labelled = _label_readmissions(admissions, time_window)

    # Many admissions -> one patient. validate= makes pandas raise instead of
    # silently duplicating admissions if PATIENTS has repeated SUBJECT_IDs.
    merged_data = labelled.merge(
        patients,
        on='SUBJECT_ID',
        how='inner',
        validate='many_to_one',
        suffixes=('', '_PATIENT'),
    )

    # AGE is not computed anywhere else in this function, so derive it when
    # the tables do not already provide it. A whole-year difference is used
    # rather than subtracting timestamps, to stay clear of pandas' Timedelta
    # range limit on very large gaps.
    # NOTE(review): MIMIC-III reportedly shifts DOB for patients older than 89
    # so their age comes out around 300 -- confirm and cap AGE if needed.
    if 'AGE' not in merged_data.columns:
        if 'DOB' not in merged_data.columns:
            raise KeyError("Need an 'AGE' or 'DOB' column to build the AGE feature")
        birth_year = pd.to_datetime(merged_data['DOB']).dt.year
        merged_data['AGE'] = merged_data['ADMITTIME'].dt.year - birth_year

    # Feature selection
    features = ['AGE', 'GENDER', 'DIAGNOSIS']
    missing = [name for name in features if name not in merged_data.columns]
    if missing:
        raise KeyError(f"Missing feature column(s): {missing}")

    # One-hot encoding for categorical variables. The target is kept in its
    # own Series; the previous version looked for 'READMITTED' inside the
    # feature frame, where it was never selected, and raised KeyError.
    feature_matrix = pd.get_dummies(merged_data[features], columns=['GENDER', 'DIAGNOSIS'])
    target = merged_data['READMITTED']

    # Split dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, target, test_size=0.2, random_state=42
    )

    # Train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Evaluate model
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Extract important features: |coefficient| per input column.
    importance = np.abs(model.coef_[0])
    feature_importance = dict(zip(feature_matrix.columns, importance))

    return train_accuracy, test_accuracy, feature_importance

# Example usage
# The path below is a placeholder; point it at the directory holding the CSV files.
mimic_data_path = "/path/to/mimic-iii/data"
train_acc, test_acc, important_features = determine_readmissions(mimic_data_path)

# Report both accuracies and the per-feature importance mapping.
print(f"Train accuracy: {train_acc}")
print(f"Test accuracy: {test_acc}")
print(f"Important features: {important_features}")