In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import time

In [None]:
# Load the CVD table and its companion table of event times.
cvd_csv = '/Users/natsumikyouno/UKBiobank/diseases/CVD/CVD.csv'
cvd_times_csv = '/Users/natsumikyouno/UKBiobank/diseases/CVD/CVD_times.csv'
df_cvd = pd.read_csv(cvd_csv)
df_cvd_times = pd.read_csv(cvd_times_csv)

# Every column after the first two is treated as a CVD condition name.
# NOTE(review): assumes the two leading columns are identifiers - confirm against the CSV.
cvd_names = df_cvd.columns.to_numpy()[2:]
print(cvd_names)

In [None]:
# Load the assessment-centre visit dates and keep only the participant id and
# the '53-2.0' date column, renamed to 'time_assessment'.
assessment_raw = pd.read_csv('/Users/natsumikyouno/UKBiobank/time_assessment_centre.csv')
df_assessment_time = assessment_raw[['Eid', '53-2.0']].rename(columns={'53-2.0': 'time_assessment'})

# Parse the day/month/year strings; missing dates become NaT.
assessment_dates = pd.to_datetime(df_assessment_time['time_assessment'], format='%d/%m/%Y')

# Convert to seconds since 1970-01-01 with Timestamp/Timedelta arithmetic.
# Unlike casting to int64 and dividing by 10**9, this does not depend on the
# datetime column having nanosecond resolution, and NaT maps straight to NaN
# instead of to a huge negative sentinel integer.
epoch_seconds = (assessment_dates - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

# Negative timestamps (dates before 1970) are not possible here: store them as NaN.
# NaN compares False against 0, so missing dates stay NaN as well.
df_assessment_time['time_assessment'] = epoch_seconds.where(epoch_seconds >= 0)

In [None]:
# Attach each participant's assessment timestamp to their CVD event times.
# merge() defaults to an inner join, so only Eids present in both tables remain.
merged_times = df_assessment_time.merge(df_cvd_times, on='Eid')

# Keep only participants that have an assessment timestamp.
has_assessment = merged_times['time_assessment'].notna()

# A stored value of 0 is treated as missing, in every column of the table.
df_cvd_times = merged_times[has_assessment].replace(0, np.nan)

In [None]:
# Offset of every recorded event time from the participant's assessment time.
# Positive values lie after the assessment, non-positive values at or before it.
time_assessment = df_cvd_times['time_assessment'].to_numpy()

# Columns from position 3 onward are taken as the per-condition event times.
# NOTE(review): assumes exactly three leading non-event columns after the merge - confirm.
time_cvds = df_cvd_times.iloc[:, 3:].to_numpy()

# Broadcast the assessment time as a column vector across all event columns.
time_diff = time_cvds - time_assessment[:, np.newaxis]

In [None]:
# For every row of time_diff, record:
#   * the smallest strictly positive offset (first event after assessment)
#     and the name of the condition it belongs to;
#   * the largest non-positive offset (last event at or before assessment)
#     and the name of the condition it belongs to.
# Rows without a matching event get NaN for both the time and the name.
# NaN offsets compare False in both masks, so rows made up entirely of NaN
# fall through to the NaN branches without a separate check.
# NOTE(review): cvd_names is assumed to line up column-for-column with time_diff - confirm.
time_first_cvd, cvd_index = [], []
time_last_cvd, last_cvd_index = [], []

for offsets in time_diff:
    # first cvd event after assessment
    after_assessment = offsets > 0
    if after_assessment.any():
        soonest = offsets[after_assessment].min()
        time_first_cvd.append(soonest)
        # On ties the left-most matching column provides the name.
        cvd_index.append(cvd_names[np.flatnonzero(offsets == soonest)[0]])
    else:
        time_first_cvd.append(np.nan)
        cvd_index.append(np.nan)

    # last cvd event before (or exactly at) assessment
    before_assessment = offsets <= 0
    if before_assessment.any():
        latest = offsets[before_assessment].max()
        time_last_cvd.append(latest)
        last_cvd_index.append(cvd_names[np.flatnonzero(offsets == latest)[0]])
    else:
        time_last_cvd.append(np.nan)
        last_cvd_index.append(np.nan)

In [None]:
# censored time as 2023-05-31
# The date is pinned to UTC explicitly. time.mktime() would interpret it in the
# machine's local timezone, shifting the censoring time by the local UTC offset
# relative to the assessment timestamps and making the result machine-dependent.
censor_date = dt.datetime.strptime('2023-05-31', '%Y-%m-%d').replace(tzinfo=dt.timezone.utc)
# to timestamp in seconds (float, seconds since 1970-01-01 UTC)
time_censored = censor_date.timestamp()
# Follow-up time from assessment to censoring, kept as a column vector (n, 1).
time_to_censored = time_censored - time_assessment.reshape(-1, 1)

In [None]:
# Assemble one row per participant with the incident (post-assessment) event
# and the most recent prior (at-or-before-assessment) event.
first_cvd_times = np.asarray(time_first_cvd, dtype=float)
last_cvd_times = np.asarray(time_last_cvd, dtype=float)

df_cvd_events = pd.DataFrame({
    'Eid': df_cvd_times['Eid'].values,
    'time': first_cvd_times,
    'cvd': cvd_index,
    # 1 when an event was found after assessment, 0 otherwise (NaN compares False).
    'event': np.where(first_cvd_times > 0, 1, 0),
    'time_censored': time_to_censored[:, 0],
    'last_cvd': last_cvd_index,
    'last_time': last_cvd_times,
    # Prior events are selected with `<= 0`, so the flag must use the same
    # boundary; with `< 0` an event on the assessment day would carry a
    # last_cvd name and last_time of 0 while being flagged as "no prior event".
    'last_event': np.where(last_cvd_times <= 0, 1, 0)
})

# Participants without an incident event are followed up to the censoring time.
df_cvd_events['time'] = np.where(df_cvd_events['time'] > 0, df_cvd_events['time'], df_cvd_events['time_censored'])

In [None]:
# Rescale the three duration columns from seconds to years (365.25-day years).
# Re-running this cell divides the columns again, so run it only once per build.
seconds_per_year = 60 * 60 * 24 * 365.25
for duration_column in ('time', 'time_censored', 'last_time'):
    df_cvd_events[duration_column] = df_cvd_events[duration_column] / seconds_per_year

In [None]:
# Write the event table to disk without the DataFrame index.
# The path is relative to the working directory the notebook is run from.
events_csv = 'data/CVD_events.csv'
df_cvd_events.to_csv(events_csv, index=False)

In [None]:
# Number of rows per value of the incident-event indicator (0 / 1).
event_flags = df_cvd_events['event']
event_flags.value_counts()

In [None]:
# Number of rows per first post-assessment condition; NaN rows are not counted.
first_conditions = df_cvd_events['cvd']
first_conditions.value_counts()