In [22]:
import json
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pickle
from datasets.preprocessing_tabular.tabular_utils import *

# --- Paths -------------------------------------------------------------------
# The three values below that read "... you have" are placeholders and must be
# replaced with real locations before the notebook can run.
# IMAGING_SUBJ_PATH, BIOMARKER_TABULAR_DIR and ALL_CARDIAC_FEATURES_PATH are not
# referenced by any cell visible in this notebook.
IMAGING_SUBJ_PATH = "./datasets/data_files/image_files/recon_cmr_subject_paths_50k.pkl"
BIOMARKER_TABULAR_DIR="Phenotype data you have"
ALL_CARDIAC_FEATURES_PATH = "Cardiac features you have"
# CSV that is read with pd.read_csv as the input table of the pipeline.
CLEANED_FEATURES_PATH = "Clean tabular data you have"
# Output: selected columns, written before any imputation or scaling.
RAW_TABULAR_DATA_PATH = "./datasets/data_files/tabular_files/raw_tab.csv"
# Output: numerical columns imputed/scaled and then re-masked to NaN where the
# raw value was missing; categorical columns untouched.
RAW_SCALED_TABULAR_DATA_PATH = "./datasets/data_files/tabular_files/raw_scaled_tab.csv"
# Output: final table (eid + numerical + single- and multi-categorical columns).
# NOTE(review): the file name says "nonorm" while NORMALIZATION below is True —
# confirm the name matches the flags used for the run.
PREPROCESSED_TABULAR_DATA_PATH = "./datasets/data_files/tabular_files/preprocessed_tab_nonorm_noonehot.csv"

# --- Switches ----------------------------------------------------------------
# When True, numerical columns are z-scored with StandardScaler and the fitted
# scaler is pickled.
NORMALIZATION = True
# When True, multi-categorical columns are expanded to one-hot columns; when
# False, columns flagged as 1-based in feature_data are shifted down by one.
ONE_HOT_ENCODE = False

# Save feature names

In [17]:
# Destination of the JSON dump of the feature selection defined below.
SAVE_PATH = "datasets/data_files/tabular_files/feature_names.json"
# Feature selection, grouped by how each column is treated downstream:
#   "numerical"          - copied as-is, later mean-imputed (and z-scored if NORMALIZATION).
#   "single_categorical" - passed through clean_categorical.
#   "multi_categorical"  - maps column name -> [num_classes, one_based].
#                          num_classes matches the per-column class counts used in
#                          df_to_one_hot_encode_df; when one_based is True and
#                          ONE_HOT_ENCODE is False, cardiac_features_to_vector_no_onehot_df
#                          subtracts 1 from every non-null value.
feature_data = {
    # NOTE(review): 'LVEDV (mL)' is not in this list although LVESV/LVSV and
    # RVEDV are — confirm the omission is intentional.
    "numerical": [
        'Age when attended assessment centre-2.0',
        'Pulse wave Arterial Stiffness index-2.0',
        'Systolic blood pressure-2.mean',
        'Diastolic blood pressure-2.mean',
        'Pulse rate-2.mean',
        'Body fat percentage-2.0',
        'Whole body fat mass-2.0',
        'Whole body fat-free mass-2.0',
        'Whole body water mass-2.0',
        'Body mass index (BMI)-2.0',
        'Cooked vegetable intake-2.0',
        'Salad / raw vegetable intake-2.0',
        'Cardiac operations performed',
        'Total mass-2.0',
        'Basal metabolic rate-2.0',
        'Impedance of whole body-2.0',
        'Waist circumference-2.0',
        'Hip circumference-2.0',
        'Standing height-2.0',
        'Height-2.0',
        'Sitting height-2.0',
        'Weight-2.0',
        'Ventricular rate-2.0',
        'P duration-2.0',
        'QRS duration-2.0',
        'PQ interval-2.0',
        'RR interval-2.0',
        'PP interval-2.0',
        'Cardiac output-2.0',
        'Cardiac index-2.0',
        'Average heart rate-2.0',
        'Body surface area-2.0',
        'Duration of walks-2.0',
        'Duration of moderate activity-2.0',
        'Duration of vigorous activity-2.0',
        'Time spent watching television (TV)-2.0',
        'Time spent using computer-2.0',
        'Time spent driving-2.0',
        'Heart rate during PWA-2.0',
        'Systolic brachial blood pressure during PWA-2.0',
        'Diastolic brachial blood pressure during PWA-2.0',
        'Peripheral pulse pressure during PWA-2.0',
        'Central systolic blood pressure during PWA-2.0',
        'Central pulse pressure during PWA-2.0',
        'Number of beats in waveform average for PWA-2.0',
        'Central augmentation pressure during PWA-2.0',
        'Augmentation index for PWA-2.0',
        'Cardiac output during PWA-2.0',
        'End systolic pressure during PWA-2.0',
        'End systolic pressure index during PWA-2.0',
        'Stroke volume during PWA-2.0',
        'Mean arterial pressure during PWA-2.0',
        'Cardiac index during PWA-2.0',
        'Sleep duration-2.0',
        'Exposure to tobacco smoke at home-2.0',
        'Exposure to tobacco smoke outside home-2.0',
        'Pack years of smoking-2.0',
        'Pack years adult smoking as proportion of life span exposed to smoking-2.0',
        'LVESV (mL)',
        'LVSV (mL)',
        'LVEF (%)',
        'LVCO (L/min)',
        'LVM (g)',
        'RVEDV (mL)',
        'RVESV (mL)',
        'RVSV (mL)',
        'RVEF (%)',
        'LAV max (mL)',
        'LAV min (mL)',
        'LASV (mL)',
        'LAEF (%)',
        'RAV max (mL)',
        'RAV min (mL)',
        'RASV (mL)',
        'RAEF (%)',
    ],
    
    # Columns cleaned with clean_categorical and kept as a single column each.
    "single_categorical": [
        'Worrier / anxious feelings-2.0',
        'Shortness of breath walking on level ground-2.0',
        'Sex-0.0',
        'Diabetes diagnosis',
        'Heart attack diagnosed by doctor',
        'Angina diagnosed by doctor',
        'Stroke diagnosed by doctor',
        'High blood pressure diagnosed by doctor',
        'Cholesterol lowering medication regularly taken',
        'Blood pressure medication regularly taken',
        'Insulin medication regularly taken',
        'Hormone replacement therapy medication regularly taken',
        'Oral contraceptive pill or minipill medication regularly taken',
        'Pace-maker-2.0',
        'Ever had diabetes (Type I or Type II)-0.0',
        'Long-standing illness, disability or infirmity-2.0',
        'Tense / \'highly strung\'-2.0',
        'Ever smoked-2.0'
        ],


    # column name -> [num_classes, one_based]; insertion order is the column
    # order produced downstream, so do not reorder casually.
    "multi_categorical": {
        'Sleeplessness / insomnia-2.0': [3, True],
        'Frequency of heavy DIY in last 4 weeks-2.0': [7, False],
        'Alcohol intake frequency.-2.0': [6, True],
        'Processed meat intake-2.0': [6, False],
        'Beef intake-2.0': [6, False],
        'Pork intake-2.0': [6, False],
        'Lamb/mutton intake-2.0': [6, False],
        'Overall health rating-2.0': [4, True],
        'Alcohol usually taken with meals-2.0': [3, False],
        'Alcohol drinker status-2.0': [3, False],
        'Frequency of drinking alcohol-0.0': [5, False],
        'Frequency of consuming six or more units of alcohol-0.0': [5, True],
        'Amount of alcohol drunk on a typical drinking day-0.0': [6, False],
        'Falls in the last year-2.0': [3, True],
        'Weight change compared with 1 year ago-2.0': [3, False],
        'Number of days/week walked 10+ minutes-2.0': [8, False],
        'Number of days/week of moderate physical activity 10+ minutes-2.0': [8, False],
        'Number of days/week of vigorous physical activity 10+ minutes-2.0': [8, False],
        'Usual walking pace-2.0': [3, True],
        'Frequency of stair climbing in last 4 weeks-2.0': [6, False],
        'Frequency of walking for pleasure in last 4 weeks-2.0': [7, False],
        'Duration walking for pleasure-2.0': [8, False],
        'Frequency of strenuous sports in last 4 weeks-2.0': [7, False],
        'Duration of strenuous sports-2.0': [8, False],
        'Duration of light DIY-2.0': [8, False],
        'Duration of heavy DIY-2.0': [8, False],
        'Frequency of other exercises in last 4 weeks-2.0': [7, False],
        'Duration of other exercises-2.0': [8, False],
        'Current tobacco smoking-2.0': [3, False],
        'Past tobacco smoking-2.0': [4, True],
        'Smoking/smokers in household-2.0': [3, False],
        'Smoking status-2.0': [3, False]
    }
}
# Persist the selection as indented JSON so it can be reloaded elsewhere.
with open(SAVE_PATH, 'w') as f:
    json.dump(feature_data, f, indent=4)

# Generate clean table based on the selected features

In [6]:
def cardiac_features_to_vector_no_onehot_df(df, feature_data):
    """Collect the selected columns of ``df`` as a list of Series, without one-hot encoding.

    The output order is: ``eid``, every numerical feature, every single
    categorical feature, every multi categorical feature.

    Args:
        df: DataFrame holding an ``eid`` column and every column named in
            ``feature_data``.
        feature_data: dict with keys ``"numerical"`` (list of column names),
            ``"single_categorical"`` (list of column names) and
            ``"multi_categorical"`` (dict mapping column name to a
            ``[num_classes, one_based]`` pair).

    Returns:
        ``(vec, indices)`` where ``vec`` is the list of Series in output order and
        ``indices`` maps ``"numerical"``, ``"categorical_single"`` and
        ``"categorical_multi"`` to the positions of those groups inside ``vec``.
        ``indices["numerical"]`` starts at 0, i.e. it includes the ``eid`` column.
        A group with no features maps to an empty list.

    Note:
        Reads the module-level ``ONE_HOT_ENCODE`` flag: multi categorical columns
        flagged as one-based are shifted down by one only when that flag is False.
    """
    vec = [df['eid']]
    indices = {}

    # Numerical (position 0 is eid, so this range deliberately starts at 0).
    vec.extend(df[name] for name in feature_data["numerical"])
    numerical_end = len(vec)
    indices["numerical"] = list(range(numerical_end))

    # Single categorical
    vec.extend(df[name].apply(clean_categorical) for name in feature_data["single_categorical"])
    single_end = len(vec)
    # Boundaries are taken from len(vec) rather than from the last element of the
    # previous group, so an empty group yields [] instead of raising IndexError.
    indices["categorical_single"] = list(range(numerical_end, single_end))

    # Multiple categorical
    for name, data in feature_data["multi_categorical"].items():
        feature_values = df[name].apply(clean_categorical)
        use_base = data[1]
        if use_base and not ONE_HOT_ENCODE:
            # Shift 1-based codes to 0-based, leaving missing values untouched.
            feature_values = feature_values.apply(lambda x: x - 1 if pd.notnull(x) else x)
        vec.append(feature_values)
    indices["categorical_multi"] = list(range(single_end, len(vec)))
    return vec, indices

In [7]:
# (column name, number of classes, one_based) for every multi-categorical column,
# in output order. one_based is None where one_hot_encode should be called
# without that argument (i.e. with its own default).
_MULTI_CATEGORICAL_ONE_HOT_SPECS = [
    ('Sleeplessness / insomnia-2.0', 3, True),
    ('Frequency of heavy DIY in last 4 weeks-2.0', 7, None),
    ('Alcohol intake frequency.-2.0', 6, True),
    ('Processed meat intake-2.0', 6, None),
    ('Beef intake-2.0', 6, None),
    ('Pork intake-2.0', 6, None),
    ('Lamb/mutton intake-2.0', 6, None),
    ('Overall health rating-2.0', 4, True),
    ('Alcohol usually taken with meals-2.0', 3, None),
    ('Alcohol drinker status-2.0', 3, None),
    ('Frequency of drinking alcohol-0.0', 5, None),
    ('Frequency of consuming six or more units of alcohol-0.0', 5, True),
    ('Amount of alcohol drunk on a typical drinking day-0.0', 6, False),
    ('Falls in the last year-2.0', 3, True),
    ('Weight change compared with 1 year ago-2.0', 3, None),
    ('Number of days/week walked 10+ minutes-2.0', 8, None),
    ('Number of days/week of moderate physical activity 10+ minutes-2.0', 8, None),
    ('Number of days/week of vigorous physical activity 10+ minutes-2.0', 8, None),
    ('Usual walking pace-2.0', 3, True),
    ('Frequency of stair climbing in last 4 weeks-2.0', 6, None),
    ('Frequency of walking for pleasure in last 4 weeks-2.0', 7, None),
    ('Duration walking for pleasure-2.0', 8, None),
    ('Frequency of strenuous sports in last 4 weeks-2.0', 7, None),
    ('Duration of strenuous sports-2.0', 8, None),
    ('Duration of light DIY-2.0', 8, None),
    ('Duration of heavy DIY-2.0', 8, None),
    ('Frequency of other exercises in last 4 weeks-2.0', 7, None),
    ('Duration of other exercises-2.0', 8, None),
    ('Current tobacco smoking-2.0', 3, None),
    ('Past tobacco smoking-2.0', 4, True),
    ('Smoking/smokers in household-2.0', 3, None),
    ('Smoking status-2.0', 3, None),
]


def df_to_one_hot_encode_df(df):
    """One-hot encode every multi-categorical column of ``df``.

    Each value of each listed column is passed to ``one_hot_encode`` with that
    column's class count (and ``one_based`` where the spec table sets it).

    Args:
        df: DataFrame containing every column named in
            ``_MULTI_CATEGORICAL_ONE_HOT_SPECS``; a missing column raises KeyError.

    Returns:
        ``(one_hot_df, num_classes)``: ``one_hot_df`` has one column per input
        column (same names, same order as the spec table) whose cells hold
        whatever ``one_hot_encode`` returns, with the index reset to 0..n-1;
        ``num_classes`` is the list of class counts in that same order.
    """
    vec = []
    num_classes = []
    for column, n_classes, one_based in _MULTI_CATEGORICAL_ONE_HOT_SPECS:
        encode_kwargs = {"num_classes": n_classes}
        if one_based is not None:
            encode_kwargs["one_based"] = one_based
        # Bind the kwargs as a default argument so each lambda keeps its own
        # settings instead of seeing the last loop iteration's values.
        vec.append(df[column].apply(lambda col, kw=encode_kwargs: one_hot_encode(value=col, **kw)))
        # Derived from the same table, so the returned counts cannot drift from
        # the counts actually used for encoding.
        num_classes.append(n_classes)
    one_hot_df = pd.concat(vec, axis=1)
    one_hot_df = one_hot_df.reset_index(drop=True)
    return one_hot_df, num_classes

In [None]:
# Load the cleaned feature table from disk.
raw_data_df = pd.read_csv(CLEANED_FEATURES_PATH)

# Pick the selected columns (plus eid) and remember where each feature group sits.
df_vec, indices = cardiac_features_to_vector_no_onehot_df(raw_data_df, feature_data)

# Stitch the per-column Series into one frame with a fresh 0..n-1 index.
df = pd.concat(df_vec, axis=1).reset_index(drop=True)

# Show which feature groups were produced.
print(indices.keys())

# Persist the selection before any imputation or scaling is applied.
df.to_csv(RAW_TABULAR_DATA_PATH, index=False)

### Impute NaN with mean values / most frequent values and apply z-score standardization
### z-score standard normalization

In [15]:
# Impute numerical missing values with the mean, optionally z-score them, and
# save a "raw scaled" table in which originally-missing values stay NaN.
#
# NOTE(review): `np` is not imported in this notebook's import cell; it is
# presumably provided by the star import from tabular_utils — confirm.

# Keep the scaler next to the other tabular outputs (same directory as the CSVs
# configured at the top of the notebook).
SCALER_PATH = "./datasets/data_files/tabular_files/scaler.pkl"

eid_column = df["eid"]

# indices["numerical"][0] is the eid column, so it is skipped here.
numerical_positions = indices["numerical"][1:]
raw_numerical_df = df.iloc[:, numerical_positions]
# Un-imputed categorical columns (single + multi). Not used further in this cell.
raw_categorical_df = df.iloc[:, indices["categorical_single"] + indices["categorical_multi"]]
singlecategorical_df = df.iloc[:, indices["categorical_single"]]
multicategorical_df = df.iloc[:, indices["categorical_multi"]]

# keep_empty_features=True keeps all-NaN columns in the output, so the result
# always has one column per input column and the column labels below line up
# (same setting as the categorical imputer used later in the notebook).
imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean", keep_empty_features=True)
numerical_df = pd.DataFrame(
    imp_mean.fit_transform(raw_numerical_df),
    columns=raw_numerical_df.columns,
    index=raw_numerical_df.index,
)

# z-score standard normalization
if NORMALIZATION:
    scaler = StandardScaler()
    normalized_df = scaler.fit_transform(numerical_df)
    numerical_df = pd.DataFrame(normalized_df, columns=numerical_df.columns, index=numerical_df.index)
    # Save the fitted scaler so the same transform can be re-applied later.
    with open(SCALER_PATH, 'wb') as file:
        pickle.dump(scaler, file)

# Save the scaled version of the raw data. No data imputation!
# Every cell that was missing in the raw table is masked back to NaN, so only
# the scaling survives; categorical columns are written exactly as loaded.
raw_scaled_numerical_df = numerical_df.where(raw_numerical_df.notna())
raw_scaled_df = pd.concat([eid_column, raw_scaled_numerical_df, singlecategorical_df, multicategorical_df], axis=1)
raw_scaled_df.to_csv(RAW_SCALED_TABULAR_DATA_PATH, index=False)

In [None]:

# Impute categorical missing values with the most frequent value
categorical_indices = indices["categorical_single"] + indices["categorical_multi"]
categorical_df = df.iloc[:, categorical_indices]
imp_most_freq = SimpleImputer(missing_values=np.nan, strategy="most_frequent", keep_empty_features=True)
imp_most_freq.fit(categorical_df)
# NOTE: this writes the imputed values back into `df`, so re-running an earlier
# cell that slices `df` after this point will see imputed categoricals.
df.iloc[:, categorical_indices] = imp_most_freq.transform(categorical_df)

# Re-slice the categorical groups AFTER imputation. Slices taken from `df`
# before the assignment above are independent copies that still contain the
# NaNs, so using them would silently discard the imputation.
singlecategorical_df = df.iloc[:, indices["categorical_single"]]
multicategorical_df = df.iloc[:, indices["categorical_multi"]]

# One hot encode and expand the multiple categorical columns
if ONE_HOT_ENCODE:
    multi_c_df, num_classes = df_to_one_hot_encode_df(multicategorical_df)
    expanded_vecs = []
    for col_name in multi_c_df.columns:
        encoded_rows = list(multi_c_df[col_name])
        # Width of the encoding, read positionally from the first row.
        encoding_width = len(multi_c_df[col_name].iloc[0])
        expanded_vec = pd.DataFrame(encoded_rows,
                                    columns=[f"{col_name}_{i}" for i in range(encoding_width)])
        expanded_vecs.append(expanded_vec)
    multicategorical_df = pd.concat(expanded_vecs, axis=1)

# Combine eid, imputed (and optionally scaled) numerical columns and the
# imputed categorical columns into the final table.
preprocessed_df = pd.concat([eid_column, numerical_df, singlecategorical_df, multicategorical_df], axis=1)
preprocessed_df.to_csv(PREPROCESSED_TABULAR_DATA_PATH, index=False)
print(preprocessed_df.shape)