- Version: 1.0
- Last updated: 2023-12-08
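- Short description: This notebook merges the cleaned donor information, group weights and wide-format PAPQ datasets (joined on `name` and `donor_id`) into a single feature table and exports it as CSV and Parquet.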

## Install and Import

In [1]:
# Import modules
import warnings
from datetime import date
import pandas as pd

## Notebook Details

In [2]:
# Parameters for logging and notebook exports.
# The Snapshot cell appends ".ipynb" to this name to locate the notebook for
# nbconvert and embeds the name in the exported HTML file name, so it has to
# match this notebook's file name.
notebook_name = "02.01_merge_cleaned_datasets"  # only file name without extension

## Configuration

In [3]:
# Pandas: set the row, column, width and column-width display options to None
# so the wide merged frames previewed below are not truncated.
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Warnings: install a blanket "ignore" filter. Note that this hides every
# warning category (deprecations included) for the rest of the session.
warnings.simplefilter("ignore")

In [4]:
# Constants
# Every path below is relative; ROOT_PATH is prepended where a path is used.
ROOT_PATH = "../"

# Data folders, derived from the common data directory
PATH_DATA = "data/"
PATH_DATA_RAW = PATH_DATA + "raw/"
PATH_DATA_INT = PATH_DATA + "interim/"
PATH_DATA_PRO = PATH_DATA + "processed/"

# Logs and models
PATH_LOG = "logs/"
PATH_MOD = "models/"

# Reports and their sub-folders
PATH_REP = "reports/"
PATH_FIG = PATH_REP + "figures/"
PATH_HTML = PATH_REP + "html/"

## Load Data

In [5]:
def load_data(path):
    """Read a parquet file whose location is given relative to ``ROOT_PATH``.

    Parameters
    ----------
    path : str
        File path that is appended to ``ROOT_PATH`` by plain string
        concatenation (no separator is inserted).

    Returns
    -------
    The object returned by ``pd.read_parquet`` for the combined path.
    """
    parquet_path = ROOT_PATH + path
    return pd.read_parquet(parquet_path)

# Load the three cleaned interim datasets through the load_data helper
# (which prepends ROOT_PATH) instead of repeating the read_parquet call inline.
df_di = load_data(PATH_DATA_INT + "donor_information_cleaned.parquet")
df_gw = load_data(PATH_DATA_INT + "group_weights_cleaned.parquet")
df_papq_wide = load_data(PATH_DATA_INT + "papq_wide_cleaned.parquet")

## Main Part

In [6]:
# Inner-join the donor information with the group weights, matching on "name"
merged_data = df_di.merge(df_gw, how="inner", on="name")

# Preview the first rows of the joined frame
merged_data.head()

Unnamed: 0,donor_id,name,age,sex,apo_e4_allele,education_years,age_at_first_tbi,longest_loc_duration,cerad,num_tbi_w_loc,dsm_iv_clinical_diagnosis,control_set,nincds_arda_diagnosis,ever_tbi_w_loc,race,hispanic,act_demented,braak,nia_reagan,age_bin,age_clean,age_at_first_tbi_bin,education_years_stages_bin,education_years_quartiles_bin,longest_loc_duration_bin,longest_loc_duration_clean,act_demented_clean,sex_clean,apo_e4_allele_clean,ever_tbi_w_loc_clean,group_weight
0,326765665,H14.09.078,87,M,N,16,0,Unknown or N/A,0,0,No Dementia,31,No Dementia,N,White,Not Hispanic,No Dementia,1,1,87-89,87.0,,16-18 years,14-16 years,Unknown or N/A,0.0,0,1,0,0,47.75
1,326765656,H14.09.069,95-99,M,N,17,12,1-2 min,2,1,No Dementia,26,No Dementia,Y,White,Not Hispanic,No Dementia,5,2,95-99,97.0,early_years (1-30),16-18 years,16-21 years,10 sec - 5 min,90.0,0,1,0,1,11.54
2,326765654,H14.09.067,85,M,Y,10,72,< 10 sec,3,1,Vascular,25,"Dementia, Type Unknown",Y,White,Not Hispanic,Dementia,4,2,81-86,85.0,late_years (61-90),5-11 years,5-12 years,< 10 sec,5.5,1,1,1,1,5.66
3,467056391,H15.09.103,90-94,F,N,11,87,< 10 sec,0,1,No Dementia,52,No Dementia,Y,White,Not Hispanic,No Dementia,4,0,90-94,92.0,late_years (61-90),5-11 years,5-12 years,< 10 sec,5.5,0,0,0,1,17.31
4,309335447,H14.09.010,100+,M,Y,16,0,Unknown or N/A,3,0,Alzheimer's Disease Type,28,Possible Alzheimer'S Disease,N,White,Not Hispanic,Dementia,4,2,100+,101.0,,16-18 years,14-16 years,Unknown or N/A,0.0,1,1,1,0,12.31


In [7]:
merged_data.shape

(107, 31)

In [8]:
# Merge the datasets
# The donor/group-weight join is rebuilt from the loaded frames instead of
# reassigning merged_data from itself. This keeps the cell idempotent:
# re-running it no longer joins the df_papq_wide columns a second time, which
# would create duplicated *_x / *_y columns.
merged_data = (
    df_di
    .merge(df_gw, on="name", how="inner")
    .merge(df_papq_wide, on="donor_id", how="inner")
)

# Display the first few rows of the merged dataset
merged_data.head()

Unnamed: 0,donor_id,name,age,sex,apo_e4_allele,education_years,age_at_first_tbi,longest_loc_duration,cerad,num_tbi_w_loc,dsm_iv_clinical_diagnosis,control_set,nincds_arda_diagnosis,ever_tbi_w_loc,race,hispanic,act_demented,braak,nia_reagan,age_bin,age_clean,age_at_first_tbi_bin,education_years_stages_bin,education_years_quartiles_bin,longest_loc_duration_bin,longest_loc_duration_clean,act_demented_clean,sex_clean,apo_e4_allele_clean,ever_tbi_w_loc_clean,group_weight,ihc_a_syn_FWM,ihc_a_syn_HIP,ihc_a_syn_PCx,ihc_a_syn_TCx,ihc_tau2_ffpe_FWM,ihc_tau2_ffpe_HIP,ihc_tau2_ffpe_PCx,ihc_tau2_ffpe_TCx,ihc_at8_ffpe_FWM,ihc_at8_ffpe_HIP,ihc_at8_ffpe_PCx,ihc_at8_ffpe_TCx,ihc_at8_FWM,ihc_at8_HIP,ihc_at8_PCx,ihc_at8_TCx,ihc_ptdp_43_ffpe_FWM,ihc_ptdp_43_ffpe_HIP,ihc_ptdp_43_ffpe_PCx,ihc_ptdp_43_ffpe_TCx,ihc_a_beta_ffpe_FWM,ihc_a_beta_ffpe_HIP,ihc_a_beta_ffpe_PCx,ihc_a_beta_ffpe_TCx,ihc_a_beta_FWM,ihc_a_beta_HIP,ihc_a_beta_PCx,ihc_a_beta_TCx,ihc_iba1_ffpe_FWM,ihc_iba1_ffpe_HIP,ihc_iba1_ffpe_PCx,ihc_iba1_ffpe_TCx,ihc_gfap_ffpe_FWM,ihc_gfap_ffpe_HIP,ihc_gfap_ffpe_PCx,ihc_gfap_ffpe_TCx,ptau_ng_per_mg_FWM,ptau_ng_per_mg_HIP,ptau_ng_per_mg_PCx,ptau_ng_per_mg_TCx,vegf_pg_per_mg_FWM,vegf_pg_per_mg_HIP,vegf_pg_per_mg_PCx,vegf_pg_per_mg_TCx,ab42_over_ab40_ratio_FWM,ab42_over_ab40_ratio_HIP,ab42_over_ab40_ratio_PCx,ab42_over_ab40_ratio_TCx,tnf_a_pg_per_mg_FWM,tnf_a_pg_per_mg_HIP,tnf_a_pg_per_mg_PCx,tnf_a_pg_per_mg_TCx,tau_ng_per_mg_FWM,tau_ng_per_mg_HIP,tau_ng_per_mg_PCx,tau_ng_per_mg_TCx,il_10_pg_per_mg_FWM,il_10_pg_per_mg_HIP,il_10_pg_per_mg_PCx,il_10_pg_per_mg_TCx,isoprostane_pg_per_mg_FWM,isoprostane_pg_per_mg_HIP,isoprostane_pg_per_mg_PCx,isoprostane_pg_per_mg_TCx,il_6_pg_per_mg_FWM,il_6_pg_per_mg_HIP,il_6_pg_per_mg_PCx,il_6_pg_per_mg_TCx,il_1b_pg_per_mg_FWM,il_1b_pg_per_mg_HIP,il_1b_pg_per_mg_PCx,il_1b_pg_per_mg_TCx,ptau_over_tau_ratio_FWM,ptau_over_tau_ratio_HIP,ptau_over_tau_ratio_PCx,ptau_over_tau_ratio_TCx,il_4_pg_per_mg_FWM,il_4_pg_per_mg_HIP,il_4_pg_per_mg_PCx,il_4_pg_per_mg_TCx,r
antes_pg_per_mg_FWM,rantes_pg_per_mg_HIP,rantes_pg_per_mg_PCx,rantes_pg_per_mg_TCx,ab40_pg_per_mg_FWM,ab40_pg_per_mg_HIP,ab40_pg_per_mg_PCx,ab40_pg_per_mg_TCx,a_syn_pg_per_mg_FWM,a_syn_pg_per_mg_HIP,a_syn_pg_per_mg_PCx,a_syn_pg_per_mg_TCx,ifn_g_pg_per_mg_FWM,ifn_g_pg_per_mg_HIP,ifn_g_pg_per_mg_PCx,ifn_g_pg_per_mg_TCx,mcp_1_pg_per_mg_FWM,mcp_1_pg_per_mg_HIP,mcp_1_pg_per_mg_PCx,mcp_1_pg_per_mg_TCx,bdnf_pg_per_mg_FWM,bdnf_pg_per_mg_HIP,bdnf_pg_per_mg_PCx,bdnf_pg_per_mg_TCx,mip_1a_pg_per_mg_FWM,mip_1a_pg_per_mg_HIP,mip_1a_pg_per_mg_PCx,mip_1a_pg_per_mg_TCx,il_7_pg_per_mg_FWM,il_7_pg_per_mg_HIP,il_7_pg_per_mg_PCx,il_7_pg_per_mg_TCx,ab42_pg_per_mg_FWM,ab42_pg_per_mg_HIP,ab42_pg_per_mg_PCx,ab42_pg_per_mg_TCx,total_scan
0,326765665,H14.09.078,87,M,N,16,0,Unknown or N/A,0,0,No Dementia,31,No Dementia,N,White,Not Hispanic,No Dementia,1,1,87-89,87.0,,16-18 years,14-16 years,Unknown or N/A,0.0,0,1,0,0,47.75,-1.0,4.6e-05,-1.0,5.6e-05,-1.0,0.003905,-1.0,0.003677,-1.0,0.013143,-1.0,0.001854,-1.0,0.012439,-1.0,6.1e-05,-1.0,0.001035,-1.0,0.001987,-1.0,0.008041,-1.0,0.006379,-1.0,0.007813,-1.0,0.00181,-1.0,0.045469,-1.0,0.025448,-1.0,0.064207,-1.0,-1.0,-1.0,0.746922,-1.0,0.121403,-1.0,0.0,-1.0,1.64,-1.0,0.182779,-1.0,27.658229,-1.0,8.1,-1.0,0.8,-1.0,1.28179,-1.0,1.66869,-1.0,0.0,-1.0,0.12,-1.0,-1.0,-1.0,2.664223,-1.0,74.04,-1.0,0.94,-1.0,0.24,-1.0,0.0,-1.0,0.582718,-1.0,0.072753,-1.0,0.0,-1.0,1.6,-1.0,12.78,-1.0,81.88,-1.0,0.545775,-1.0,0.157781,-1.0,0.296666,-1.0,0.18,-1.0,1.94,-1.0,0.84,-1.0,0.58,-1.0,18.68,-1.0,4.03619,-1.0,0.18,-1.0,3.7,-1.0,15.38,-1.0,0.6,-1.0,11.0,-1.0,0.099756,-1.0,4.36395,2
1,326765656,H14.09.069,95-99,M,N,17,12,1-2 min,2,1,No Dementia,26,No Dementia,Y,White,Not Hispanic,No Dementia,5,2,95-99,97.0,early_years (1-30),16-18 years,16-21 years,10 sec - 5 min,90.0,0,1,0,1,11.54,3.4e-05,7.1e-05,6.3e-05,9.3e-05,0.001563,0.01144,0.003407,0.006103,0.005634,0.036654,0.017135,0.026569,0.002274,0.056922,0.014112,0.009599,0.001702,0.000888,0.001886,0.001137,0.00306,0.010088,0.033776,0.03033,0.012777,0.016031,0.061333,0.055102,0.055185,0.049652,0.047596,0.050715,0.089,0.048178,0.046832,0.041647,1.395498,4.712879,1.719367,1.355935,1.24,0.76,1.18,2.08,394.006795,342.148288,230.613801,211.125225,0.0,13.08,2.7,1.14,1.088977,1.40592,1.049501,1.467668,0.0,0.66,0.18,0.32,-1.0,-1.0,2.511954,-1.0,4.83,22.88,3.12,2.7,0.0,0.28,0.0,0.0,1.281475,3.352169,1.63827,0.92387,3.37,0.0,0.3,0.0,7.02,10.14,13.38,19.1,1.333781,1.350075,2.732625,2.1658,0.067883,0.467025,0.086297,0.57,0.95,0.0,0.28,0.06,13.75,2.5,12.1,30.94,2.867955,4.79779,8.21508,0.57,0.0,39.32,6.18,5.32,3.93,0.3,29.52,23.9,525.518875,461.92585,630.181038,457.255013,4
2,326765654,H14.09.067,85,M,Y,10,72,< 10 sec,3,1,Vascular,25,"Dementia, Type Unknown",Y,White,Not Hispanic,Dementia,4,2,81-86,85.0,late_years (61-90),5-11 years,5-12 years,< 10 sec,5.5,1,1,1,1,5.66,0.00044,0.00193,0.001507,0.003973,0.001519,0.010791,0.002416,0.003542,0.001692,0.019839,0.001893,0.021183,4.7e-05,0.038885,0.000161,0.000189,0.000969,0.001444,0.001825,0.001821,0.003821,0.014584,0.016357,0.022213,0.01,0.016216,0.066307,0.06581,0.050263,0.075951,0.049983,0.048856,0.032765,0.029312,0.025223,0.022923,0.332819,1.816172,0.734323,0.672854,3.42,0.5,1.36,2.48,869.786538,129.829985,398.561763,169.806888,0.0,11.26,2.68,0.0,0.904703,1.139455,1.014441,1.569319,0.46,2.48,0.46,1.2,-1.0,-1.0,2.068608,2.760603,0.5,17.6,0.94,0.1,0.0,0.1,0.0,0.0,0.367877,1.593896,0.72387,0.428755,0.52,0.0,3.02,0.0,12.3,7.58,12.32,15.28,0.351,1.918287,1.250438,2.8665,0.060729,0.426804,0.135326,0.17,0.0,0.0,0.02,0.0,40.4,4.54,29.3,20.86,3.50782,5.06069,3.86952,0.17,5.62,49.36,6.14,4.28,34.48,0.04,38.06,19.08,305.295075,249.051238,498.376575,486.751444,4
3,467056391,H15.09.103,90-94,F,N,11,87,< 10 sec,0,1,No Dementia,52,No Dementia,Y,White,Not Hispanic,No Dementia,4,0,90-94,92.0,late_years (61-90),5-11 years,5-12 years,< 10 sec,5.5,0,0,0,1,17.31,0.000132,5.6e-05,3.7e-05,4.6e-05,0.00077,0.002497,0.000831,0.000965,0.001412,0.026111,0.00149,0.00376,9.5e-05,0.01191,7.6e-05,5.4e-05,0.001028,0.00173,0.000984,0.000881,0.003223,0.009737,0.00695,0.011834,0.000432,0.002754,0.00926,0.009037,0.081018,0.05866,0.065651,0.086925,0.030904,0.064947,0.016388,0.022866,1.624779,2.540756,2.397251,2.26296,0.68,1.36,0.68,1.87,0.44503,1.525649,1.307275,0.479263,0.5,28.05,0.0,1.62,1.591404,1.483437,1.512623,1.580459,0.42,0.78,0.0,3.19,-1.0,-1.0,2.674447,4.945019,0.64,164.21,0.22,1.15,0.0,0.0,0.0,0.0,1.020972,1.71275,1.584831,1.431837,0.3,0.0,0.3,2.86,34.62,17.16,43.04,62.78,0.37975,0.246837,0.3038,0.493675,0.024101,0.075057,0.037077,0.024418,0.28,0.67,0.0,1.64,5.76,5.15,2.2,17.59,3.472415,37.540205,7.85177,0.024418,7.54,21.59,10.56,14.78,17.42,0.0,17.66,15.0,0.169,0.376588,0.39715,0.2366,4
4,309335447,H14.09.010,100+,M,Y,16,0,Unknown or N/A,3,0,Alzheimer's Disease Type,28,Possible Alzheimer'S Disease,N,White,Not Hispanic,Dementia,4,2,100+,101.0,,16-18 years,14-16 years,Unknown or N/A,0.0,1,1,1,0,12.31,6.1e-05,9.9e-05,6.1e-05,6.2e-05,0.003454,0.005172,0.00501,0.004301,0.003673,0.016716,0.004296,0.023869,0.047266,0.050732,0.003227,0.001401,0.002526,0.003779,0.003384,0.002607,0.004146,0.005239,0.014006,0.032423,0.008276,0.013382,0.041812,0.032829,0.030934,0.055457,0.032725,0.057374,0.024208,0.051301,0.032866,0.030665,0.345002,2.243292,0.706289,0.703624,2.14,0.0,3.64,2.08,1000.0,205.671818,556.536067,290.443666,0.0,8.7,0.3,1.98,0.98069,1.347626,1.192077,1.33177,0.34,0.38,0.26,0.0,-1.0,-1.0,3.712133,2.345858,0.68,20.84,0.62,0.32,0.0,0.1,0.0,0.0,0.351795,1.664625,0.592486,0.528337,0.0,0.0,1.0,0.0,12.44,6.72,31.98,18.8,0.222125,2.12565,0.97735,1.2649,0.105482,0.213645,0.090082,0.09,0.0,0.0,0.08,0.06,11.94,1.28,17.06,18.78,4.25096,4.03619,6.89792,0.09,6.36,11.14,10.7,4.04,15.54,0.16,15.42,14.82,284.863688,437.1863,543.930525,367.382194,4


In [9]:
merged_data.shape

(107, 148)

## Exports

In [10]:
# Export the merged frame to the interim data folder in two formats.
# Both files share the same path stem and differ only in their extension.
# to_csv is called with pandas defaults, so the index is written as well.
file = "features_merged"
export_stem = ROOT_PATH + PATH_DATA_INT + file
merged_data.to_csv(export_stem + ".csv")
merged_data.to_parquet(export_stem + ".parquet")

## Watermark

In [11]:
# Load the watermark IPython extension so the %watermark magic can be used.
%load_ext watermark

In [12]:
# Record the execution timestamp plus Python, IPython and machine details.
%watermark

Last updated: 2023-12-08T16:01:47.514024+01:00

Python implementation: CPython
Python version       : 3.9.16
IPython version      : 8.16.1

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit



In [13]:
# Record the versions of the packages imported in this notebook.
%watermark --iversions

pandas: 2.1.1



-----


## Snapshot

In [14]:
today = date.today()
output_file = f"{ROOT_PATH}{PATH_HTML}{today}_{notebook_name}.html"
input_file = f"{notebook_name}.ipynb"
print(input_file)
!jupyter nbconvert --to html {input_file} --output {output_file}

02.01_merge_cleaned_datasets.ipynb


[NbConvertApp] Converting notebook 02.01_merge_cleaned_datasets.ipynb to html
[NbConvertApp] Writing 312161 bytes to ../reports/html/2023-12-08_02.01_merge_cleaned_datasets.html
