In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, precision_score, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

In [2]:
# train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_CL.csv')
# val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_CL.csv')


# train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data.csv')
# val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data.csv')

# train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_pre_CL.csv')
# val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_pre_CL.csv')

# test_df = pd.read_csv('embeds_test_ECG_latents_w_pred_vars_emb_size_576_CL.csv')

train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data_ALL_pheno2.csv')
val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data_ALL_pheno2.csv')

X_train = train_df.drop(columns=['f.eid', 'ID', 'age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num',
                                'LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick'])

y_train = train_df[['age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num']]

X_val = val_df.drop(columns=['f.eid', 'ID', 'age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num',
                             'LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick',])

y_val = val_df[['age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num']]

X_test = val_df.drop(columns=['f.eid', 'ID', 'age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num',
                             'LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick',])

y_test = val_df[['age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num']]


In [3]:
y_train.head(10)

Unnamed: 0,age_imaging_visit,sex,QRS_duration,QRS_num
0,75.0,0,128.0,9.0
1,61.0,0,94.0,11.0
2,56.0,1,98.0,8.0
3,63.0,1,78.0,11.0
4,78.0,1,80.0,9.0
5,61.0,1,100.0,
6,75.0,1,154.0,
7,71.0,1,76.0,
8,75.0,0,78.0,9.0
9,54.0,1,82.0,12.0


In [4]:
print(X_train.shape)

train_df.head()

(11996, 576)


Unnamed: 0,ID,0,1,2,3,4,5,6,7,8,...,575,f.eid,sex,QRS_duration,age_imaging_visit,QRS_num,LV_diast_vol,LV_myoc_mass,RV_diast_vol,LV_myoc_thick
0,2871723,-0.089875,0.451444,-0.107884,-0.247353,-0.147103,0.379006,-0.001708,-0.295991,-0.764888,...,0.118373,2871723,0,128.0,75.0,9.0,,,,
1,4515271,0.356616,0.638387,0.201613,-0.730499,-0.571195,0.916316,-0.202354,-0.149177,-0.276252,...,-0.144028,4515271,0,94.0,61.0,11.0,126.864,69.4247,154.333,4.97915
2,1441342,0.152269,0.599168,0.152148,-0.769117,-0.422459,0.242664,0.022787,-0.25686,-0.050047,...,0.012571,1441342,1,98.0,56.0,8.0,192.482,105.556,219.384,5.89455
3,5613370,0.332433,0.042543,0.078552,-0.590885,-0.188884,0.771281,0.114562,0.094776,-0.119752,...,0.231099,5613370,1,78.0,63.0,11.0,131.303,101.211,141.049,6.86871
4,3588634,0.271653,0.439848,0.130819,-0.608746,-0.632132,0.489525,0.169388,0.047047,-0.061784,...,0.43835,3588634,1,80.0,78.0,9.0,,,,


In [5]:
print("y train null values")
print(y_train.isnull().sum(), "\n")

print("y test null values")
print(y_test.isnull().sum())


y train null values
age_imaging_visit      91
sex                     0
QRS_duration            3
QRS_num              3866
dtype: int64 

y test null values
age_imaging_visit     132
sex                     0
QRS_duration            3
QRS_num              5125
dtype: int64


In [6]:
print(X_train.isnull().sum())


0      0
1      0
2      0
3      0
4      0
      ..
571    0
572    0
573    0
574    0
575    0
Length: 576, dtype: int64


In [7]:
y_train.loc[:, 'age_imaging_visit'] = y_train['age_imaging_visit'].fillna(y_train['age_imaging_visit'].mean())
y_test.loc[:, 'age_imaging_visit'] = y_test['age_imaging_visit'].fillna(y_test['age_imaging_visit'].mean())

reg = LinearRegression().fit(X_train, y_train['age_imaging_visit'])
preds = reg.predict(X_test)

print("Root mean squared error for the prediction of Age given ECG embeds:")
print(mean_squared_error(preds, y_test['age_imaging_visit']))
print("Mean QRS duration:", y_test['age_imaging_visit'].mean())
print("Std QRS duration:", y_test['age_imaging_visit'].std())

print("R^2 for the prediction of Age given ECG embeds:")
print(r2_score(preds, y_test['age_imaging_visit']))


Root mean squared error for the prediction of Age given ECG embeds:
44.23421609887496
Mean QRS duration: 65.40849776208786
Std QRS duration: 7.824673665757267
R^2 for the prediction of Age given ECG embeds:
-1.4342849500929646


In [8]:
y_train.loc[:, 'age_imaging_visit'] = y_train['age_imaging_visit'].fillna(y_train['age_imaging_visit'].mean())
y_val.loc[:, 'age_imaging_visit'] = y_train['age_imaging_visit'].fillna(y_train['age_imaging_visit'].mean())
y_test.loc[:, 'age_imaging_visit'] = y_test['age_imaging_visit'].fillna(y_test['age_imaging_visit'].mean())

# regr = MLPRegressor(hidden_layer_sizes=(200, 100, 30), max_iter=200, tol=0.1, random_state=0,
#                    warm_start=5, early_stopping=False)

regr = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 10), max_iter=1000, tol=1e-4, random_state=0,
                    warm_start=False, early_stopping=False, n_iter_no_change=100)
regr.fit(X_train, y_train['age_imaging_visit'])
preds = regr.predict(X_test)

print("Root mean squared error for the prediction of Age given ECG embeds:")
print(sqrt(mean_squared_error(preds, y_test['age_imaging_visit'])))
print("Mean Age duration:", y_test['age_imaging_visit'].mean())
print("Std Age duration:", y_test['age_imaging_visit'].std())

print("R^2 for the prediction of Age given ECG embeds:")
print(r2_score(preds, y_test['age_imaging_visit']))

Root mean squared error for the prediction of Age given ECG embeds:
4.7667452938909936
Mean Age duration: 65.40849776208786
Std Age duration: 7.824673665757267
R^2 for the prediction of Age given ECG embeds:
0.6261756510575587


In [9]:
# regr = RandomForestRegressor(max_depth=100, random_state=0)

# regr.fit(X_train, y_train['age_imaging_visit'])

# preds = regr.predict(X_test)

# print("Root mean squared error for the prediction of Age given ECG embeds:")
# print(sqrt(mean_squared_error(preds, y_test['age_imaging_visit'])))
# print("Mean QRS duration:", y_test['age_imaging_visit'].mean())
# print("Std QRS duration:", y_test['age_imaging_visit'].std())

# print("R^2 for the prediction of Age given ECG embeds:")
# print(r2_score(preds, y_test['age_imaging_visit']))

In [10]:
y_train.loc[:, 'QRS_duration'] = y_train['QRS_duration'].fillna(y_train['QRS_duration'].mean())
y_test.loc[:, 'QRS_duration'] = y_test['QRS_duration'].fillna(y_test['QRS_duration'].mean())

regr = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 10), max_iter=1000, tol=1e-4, random_state=0,
                    warm_start=False, early_stopping=False, n_iter_no_change=100)
regr.fit(X_train, y_train['QRS_duration'])

preds = regr.predict(X_test)

print("Root mean squared error for the prediction of QRS duration given ECG embeds:")
print(sqrt(mean_squared_error(preds, y_test['QRS_duration'])))
print("Mean QRS duration:", y_test['QRS_duration'].mean())
print("Std QRS duration:", y_test['QRS_duration'].std())

print("R^2 for the prediction of QRS duration given ECG embeds:")
print(r2_score(preds, y_test['QRS_duration']))

Root mean squared error for the prediction of QRS duration given ECG embeds:
4.802911383006246
Mean QRS duration: 88.36630815407703
Std QRS duration: 14.768375833986578
R^2 for the prediction of QRS duration given ECG embeds:
0.8954128235027152




In [11]:
y_train.loc[:, 'QRS_num'] = y_train['QRS_num'].fillna(y_train['QRS_num'].mean())
y_test.loc[:, 'QRS_num'] = y_test['QRS_num'].fillna(y_test['QRS_num'].mean())

regr = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 10), max_iter=1000, tol=1e-4, random_state=0,
                    warm_start=False, early_stopping=False, n_iter_no_change=100)
regr.fit(X_train, y_train['QRS_num'])
preds = regr.predict(X_test)

print("Root mean squared error for the prediction of QRS num given ECG embeds:")
print(sqrt(mean_squared_error(preds, y_test['QRS_num'])))
print("Mean QRS number:", y_test['QRS_num'].mean())
print("Std QRS number:", y_test['QRS_num'].std())

print("\nR^2 for the prediction of QRS num given ECG embeds:")
print(r2_score(preds, y_test['QRS_num']))

Root mean squared error for the prediction of QRS num given ECG embeds:
0.6215910166726276
Mean QRS number: 10.11508739650414
Std QRS number: 1.5113059323954954

R^2 for the prediction of QRS num given ECG embeds:
0.8244351364041665


In [12]:
reg = LogisticRegression(max_iter=1500).fit(X_train, y_train['sex'])
preds = reg.predict(X_test)

regr = MLPClassifier(hidden_layer_sizes=(200, 100, 50, 10), max_iter=1000, tol=1e-4, random_state=0,
                    warm_start=False, early_stopping=False, n_iter_no_change=100)
regr.fit(X_train, y_train['sex'])
preds = regr.predict(X_test)

# print("ROC AUC for the prediction of sex given ECG embeds:")
# print(roc_auc_score(preds, y_test['sex']))

print("Perc female:", 1 - (y_test['sex'].sum() / len(y_test['sex'])))


print("Precision score for the prediction of sex given ECG embeds:")
print(precision_score(y_test['sex'], preds))

Perc female: 0.5129728040012504
Precision score for the prediction of sex given ECG embeds:
0.9623248039089624


Spatial Features

In [13]:
# train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_pre_CL.csv').dropna()
# val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_pre_CL.csv').dropna()

# train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data.csv').dropna()
# val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data.csv').dropna()

# train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_CL.csv').dropna()
# val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_CL.csv').dropna()

train_df = pd.read_csv('embeds_train_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data_ALL_pheno2.csv').dropna()
val_df = pd.read_csv('embeds_val_ECG_latents_w_pred_vars_emb_size_576_CL_3D_data_ALL_pheno2.csv').dropna()

# TODO: change to CL
X_train = train_df.drop(columns=['LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick',
                                'f.eid', 'ID', 'age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num'])

y_train = train_df[['LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick']]

X_test = val_df.drop(columns=['LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick',
                            'f.eid', 'ID', 'age_imaging_visit', 'sex', 'QRS_duration', 'QRS_num'])

y_test = val_df[['LV_diast_vol', 'LV_myoc_mass', 'RV_diast_vol', 'LV_myoc_thick']]

In [14]:
def spatial_feature_eval(feature):
    print(f"Mean {feature}:", y_test[feature].mean())
    print(f"Std {feature}:", y_test[feature].std())
    y_train.loc[:, feature] = y_train[feature].fillna(y_train[feature].mean())
    y_test.loc[:, feature] = y_test[feature].fillna(y_test[feature].mean())

    # regr = RandomForestRegressor(max_depth=100, random_state=0)

    # regr.fit(X_train, y_train[feature])

    # preds = regr.predict(X_test)

    regr = MLPRegressor(hidden_layer_sizes=(200, 100, 50, 10), max_iter=1000, tol=1e-4, random_state=0,
                    warm_start=False, early_stopping=False, n_iter_no_change=100)
    regr.fit(X_train, y_train[feature])
    preds = regr.predict(X_test)

    print(f"\nRoot mean squared error for the prediction of {feature} given ECG embeds:")
    print(sqrt(mean_squared_error(preds, y_test[feature])))

    print(f"R^2 for the prediction of {feature} given ECG embeds:")
    print(r2_score(preds, y_test[feature]))


In [15]:
spatial_feature_eval("LV_diast_vol")

Mean LV_diast_vol: 147.73510185983494
Std LV_diast_vol: 34.28898729375779



Root mean squared error for the prediction of LV_diast_vol given ECG embeds:
16.92875713750101
R^2 for the prediction of LV_diast_vol given ECG embeds:
0.7526543439496789


In [16]:
spatial_feature_eval("LV_myoc_mass")

Mean LV_myoc_mass: 86.0057822638256
Std LV_myoc_mass: 22.44655196815607

Root mean squared error for the prediction of LV_myoc_mass given ECG embeds:
9.839774091767236
R^2 for the prediction of LV_myoc_mass given ECG embeds:
0.8003871213441318


In [17]:
spatial_feature_eval("RV_diast_vol")

Mean RV_diast_vol: 156.58841402882132
Std RV_diast_vol: 37.27921346888898

Root mean squared error for the prediction of RV_diast_vol given ECG embeds:
18.49358543209988
R^2 for the prediction of RV_diast_vol given ECG embeds:
0.7489395254108975


In [18]:
spatial_feature_eval("LV_myoc_thick")

Mean LV_myoc_thick: 5.693367071067865
Std LV_myoc_thick: 0.7750010119174464

Root mean squared error for the prediction of LV_myoc_thick given ECG embeds:
0.3549033720183608
R^2 for the prediction of LV_myoc_thick given ECG embeds:
0.7813184742408726
