Now we formally export **all data** needed for survival analysis and split it into training and test sets.

In [1]:
import sqlite3
import pandas as pd
import sys

sys.path.append("../../..")

from utils.constants import DatabaseConfig, TableNames

In [2]:
# Key column used when querying each table.
primary_key = 'eid'

# Open the SQLite database at the project-configured path and keep one cursor
# that the following cells reuse for every query.
conn = sqlite3.connect(DatabaseConfig.DB_PATH)
cursor = conn.cursor()

# Export Data

Here we export the data into data frames rather than CSV files; the merged data frame will be split later.

In [3]:
# Instead of retrieving all data using a large query, we will retrieve data for each table separately and then merge them.

# * These exclusion criteria may be modified for sensitivity analysis.
# This is a complete SQL WHERE clause that is appended verbatim to each query;
# `s` is the alias given to the status table in those queries.
# (SQLite accepts both `=` and `==` as the equality operator.)
exclusion_criteria = "WHERE s.statins = 0 AND s.ecg_hrv_ok = 1 AND s.ecg_before_cvd == 0"


def retrieve_data(cursor, table_name, primary_key=primary_key, exclusion_criteria=exclusion_criteria):
    """Load one table into a DataFrame, keeping only rows that pass the status filter.

    The table (aliased ``t``) is inner-joined to ``TableNames.STATUS``
    (aliased ``s``) on ``primary_key``, and ``exclusion_criteria`` is appended
    verbatim to the query.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        Open cursor on the database that holds ``table_name`` and the status table.
    table_name : str
        Name of the table to read.
    primary_key : str
        Key column; it is selected first and used as the join column on both sides.
    exclusion_criteria : str
        Complete SQL ``WHERE`` clause; it may reference the aliases ``t`` and ``s``.

    Returns
    -------
    pandas.DataFrame
        The primary-key column followed by the table's remaining columns in
        schema order, one row per record returned by the query.

    Notes
    -----
    The defaults for ``primary_key`` and ``exclusion_criteria`` are bound when
    this ``def`` is executed. After changing the module-level values (e.g. for
    a sensitivity analysis), re-run this cell or pass the new value explicitly.

    Table and column names are interpolated into the SQL text because
    identifiers cannot be bound as parameters; only pass trusted names.
    """
    # Each PRAGMA table_info row is (cid, name, type, ...); index 1 is the column name.
    cursor.execute(f"PRAGMA table_info({table_name});")
    other_columns = [row[1] for row in cursor.fetchall() if row[1] != primary_key]

    # Build the select list as a whole so that a table containing only the key
    # column does not leave a dangling comma in the query.
    select_list = ', '.join(f't.{col}' for col in [primary_key, *other_columns])

    # Join on the same key that is selected, instead of a hard-coded column name.
    query_sql = f"""
    SELECT {select_list}
    FROM {table_name} t INNER JOIN {TableNames.STATUS} s ON t.{primary_key} = s.{primary_key}
    {exclusion_criteria};
    """
    cursor.execute(query_sql)

    # Take the column labels from the executed statement so they match the result set.
    result_columns = [description[0] for description in cursor.description]
    return pd.DataFrame(cursor.fetchall(), columns=result_columns)

In [13]:
# --- Status table -----------------------------------------------------------
# List the status table's columns (index 1 of each table_info row is the name),
# leaving out the primary key because it is selected explicitly below.
cursor.execute(f"PRAGMA table_info({TableNames.STATUS});")
status_columns = [info[1] for info in cursor.fetchall() if info[1] != primary_key]

query_sql = f"""
SELECT {primary_key}, {', '.join(status_columns)} 
FROM {TableNames.STATUS} s
{exclusion_criteria};
"""
cursor.execute(query_sql)

# Re-read the labels from the executed statement so they line up with the rows.
status_columns = [column_info[0] for column_info in cursor.description]
status_data = cursor.fetchall()

# We only need these three columns for survival analysis
df_status = pd.DataFrame(status_data, columns=status_columns)[['eid', 'event', 'time']]

# --- Covariates and HRV feature tables --------------------------------------
# Each table is fetched with the same status-based filter via retrieve_data.
(
    df_covariates,
    df_hrv_time,
    df_hrv_freq,
    df_hrv_poincare,
    df_hrv_entropy,
    df_hrv_fractal,
) = (
    retrieve_data(cursor, source_table)
    for source_table in (
        TableNames.COVARIATES,
        TableNames.HRV_TIME,
        TableNames.HRV_FREQ,
        TableNames.HRV_POINCARE,
        TableNames.HRV_ENTROPY,
        TableNames.HRV_FRACTAL,
    )
)

In [14]:
# Report (rows, columns) of every exported frame; the row counts should agree
# because each frame was filtered with the same exclusion criteria.
shape_report = {
    "Status: ": df_status,
    "Covariates: ": df_covariates,
    "HRV Time: ": df_hrv_time,
    "HRV Freq: ": df_hrv_freq,
    "HRV Poincare: ": df_hrv_poincare,
    "HRV Entropy: ": df_hrv_entropy,
    "HRV Fractal: ": df_hrv_fractal,
}
for label, frame in shape_report.items():
    print(label, frame.shape)

Status:  (35159, 3)
Covariates:  (35159, 15)
HRV Time:  (35159, 20)
HRV Freq:  (35159, 9)
HRV Poincare:  (35159, 28)
HRV Entropy:  (35159, 8)
HRV Fractal:  (35159, 23)


In [20]:
# Frames to attach to the status frame, listed in the order their columns
# should appear in the merged result.
feature_frames = [
    df_covariates,
    df_hrv_time,
    df_hrv_freq,
    df_hrv_poincare,
    df_hrv_entropy,
    df_hrv_fractal,
]

df_total = df_status
for feature_frame in feature_frames:
    # validate='one_to_one' makes pandas raise MergeError when 'eid' is
    # duplicated on either side, instead of silently multiplying rows.
    df_total = df_total.merge(feature_frame, on='eid', how='inner', validate='one_to_one')

print(df_total.shape)
df_total.head()

(35159, 100)


Unnamed: 0,eid,event,time,age,sex,ethnicity,BMI,smoking,diabetes,systolic_bp,...,HRV_MFDFA_alpha1_Fluctuation,HRV_MFDFA_alpha1_Increment,HRV_MFDFA_alpha2_Width,HRV_MFDFA_alpha2_Peak,HRV_MFDFA_alpha2_Mean,HRV_MFDFA_alpha2_Max,HRV_MFDFA_alpha2_Delta,HRV_MFDFA_alpha2_Asymmetry,HRV_MFDFA_alpha2_Fluctuation,HRV_MFDFA_alpha2_Increment
0,1000205,0,4575,40,1,1,21.5595,0.0,0.0,149.0,...,0.003499,0.226993,,,,,,,,
1,1000239,0,4638,65,0,1,22.9214,1.0,0.0,137.0,...,0.001239,0.127227,,,,,,,,
2,1000677,0,4590,42,0,1,37.892,2.0,0.0,124.0,...,0.001113,0.104668,,,,,,,,
3,1000737,0,4602,52,1,1,22.8374,0.0,0.0,148.0,...,0.001417,0.220395,,,,,,,,
4,1000779,0,4514,56,1,1,25.0194,0.0,0.0,144.0,...,0.003003,0.394044,,,,,,,,


In [23]:
# Save the merged frame; with to_csv defaults the DataFrame index is written
# as the first (unnamed) CSV column.
total_csv_path = "total_data_unimputed.csv"
df_total.to_csv(total_csv_path)

# Split Data

Now we split the data into training and test sets (80% / 20%, stratified by `event`).

In [24]:
from sklearn.model_selection import train_test_split

# Split settings: 20% of rows go to the test set and the seed is fixed so the
# same partition is produced on every run.
split_options = dict(
    test_size=0.2,
    random_state=1234,
    stratify=df_total['event'],  # Make sure both data have the same proportion of event
)
df_train, df_test = train_test_split(df_total, **split_options)

# Write each subset to its own CSV file (training set first, then test set).
for subset, csv_name in (
    (df_train, "train_data_unimputed.csv"),
    (df_test, "test_data_unimputed.csv"),
):
    subset.to_csv(csv_name)