# 04. Merge NGS and Clinical Data

This notebook merges the Tat NGS data with the corresponding clinical data for each patient visit.

<p><b>Input:</b></p>
<ul>
<li>Processed REDCap clinical data CSV file (full_clinical.csv)
<li>Long-form profiles of NGS data, as percentage (tat_freq_profile.csv)
<li>Long-form profiles of NGS data, as raw counts (tat_count_profile.csv)
</ul>
<p><b>Output:</b></p>
<ul>
<li>Merged data files, stratified according to matched neurological testing data (GDS or TMHDS) and sequencing information format (relative abundance or raw counts)
</ul>

# Import requirements and modules

In [1]:
import pandas as pd
import numpy as np

# Import data

In [3]:
# Long-form Tat NGS profiles: percentages (freq) and raw counts.
seq_abundance_df = pd.read_csv('../data/tat_profiles/tat_freq_profile.csv')
seq_counts_df = pd.read_csv('../data/tat_profiles/tat_count_profile.csv')

# Clinical table read from the processed_clinical directory.
clinical_df = pd.read_csv('../data/processed_clinical/full_clinical.csv')

# Report (rows, columns) for each table, one per line, in load order.
print(
    seq_abundance_df.shape,
    seq_counts_df.shape,
    clinical_df.shape,
    sep='\n',
)

(9784, 25)
(9784, 25)
(3055, 52)


# Merge Data

In [9]:
def merge_data_sources(df1, df2):
    """Inner-join two tables on their shared 'Patient' and 'Visit' columns.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Tables that each contain 'Patient' and 'Visit' columns. The joined
        result must contain an 'AAPos' column, because it is used as the
        primary sort key.

    Returns
    -------
    pandas.DataFrame
        Rows whose ('Patient', 'Visit') pair appears in both inputs, with
        the columns of ``df1`` followed by the non-key columns of ``df2``,
        ordered by 'AAPos', then 'Patient', then 'Visit'. The index labels
        produced by the join are kept (not reset) after sorting.
    """
    join_keys = ['Patient', 'Visit']
    joined = df1.merge(df2, how='inner', on=join_keys)
    # Sort into a new frame rather than mutating the join result in place.
    return joined.sort_values(['AAPos'] + join_keys)
    
# Attach the clinical table to each NGS profile:
#   merged_df1 -> relative abundance, merged_df2 -> raw counts.
merged_df1 = merge_data_sources(df1=clinical_df, df2=seq_abundance_df)
merged_df2 = merge_data_sources(df1=clinical_df, df2=seq_counts_df)

# Print both shapes, one per line.
print(merged_df1.shape, merged_df2.shape, sep='\n')

(9684, 75)
(9684, 75)


# Drop missing values 

In [12]:
# Relative-abundance data: keep only rows where the named score column(s)
# are present. dropna uses how='any' by default, so listing both columns
# keeps rows where TMHDS and GDS are both non-missing.
GDSmerged_abundance_df = merged_df1.dropna(subset=['GDS'])
TMHDSmerged_abundance_df = merged_df1.dropna(subset=['TMHDS'])
TMHDSGDSmerged_abundance_df = merged_df1.dropna(subset=['TMHDS', 'GDS'])
print(
    GDSmerged_abundance_df.shape,
    TMHDSmerged_abundance_df.shape,
    TMHDSGDSmerged_abundance_df.shape,
    sep='\n',
)

# Raw-count data: same three subsets.
GDSmerged_counts_df = merged_df2.dropna(subset=['GDS'])
TMHDSmerged_counts_df = merged_df2.dropna(subset=['TMHDS'])
TMHDSGDSmerged_counts_df = merged_df2.dropna(subset=['TMHDS', 'GDS'])
print(
    GDSmerged_counts_df.shape,
    TMHDSmerged_counts_df.shape,
    TMHDSGDSmerged_counts_df.shape,
    sep='\n',
)

(9684, 75)
(9354, 75)
(9354, 75)
(9684, 75)
(9354, 75)
(9354, 75)


# Save the merged long-form data

In [14]:
# Each merged frame paired with its destination CSV, listed in write order:
# the three proportional (abundance) subsets, then the three raw-count subsets.
merged_outputs = [
    (GDSmerged_abundance_df, '../data/merged_data/NGS_GDS_abundance.csv'),
    (TMHDSmerged_abundance_df, '../data/merged_data/NGS_TMHDS_abundance.csv'),
    (TMHDSGDSmerged_abundance_df, '../data/merged_data/NGS_BOTH_abundance.csv'),
    (GDSmerged_counts_df, '../data/merged_data/NGS_GDS_counts.csv'),
    (TMHDSmerged_counts_df, '../data/merged_data/NGS_TMHDS_counts.csv'),
    (TMHDSGDSmerged_counts_df, '../data/merged_data/NGS_BOTH_counts.csv'),
]

# Write every frame without its index column.
for out_df, out_path in merged_outputs:
    out_df.to_csv(out_path, index=False)