In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Merge the per-SNP PheWAS result files from GWAS Atlas into one table.
# Only files whose name starts with 'rs' are read; the part of the file name
# before the first '.' is stored in a 'snp' column so every row keeps track of
# the file it came from.
phewas_dir = 'results/PheWAS/gwas_altas'
# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore later keep='first' tie-breaking) reproducible across runs.
phewas_results = sorted(os.listdir(phewas_dir))
snp_frames = []
for file in phewas_results:
    if not file.startswith('rs'):
        # Skip anything else in the folder (e.g. the merged CSVs written by
        # later cells into this same directory).
        continue
    df = pd.read_csv(os.path.join(phewas_dir, file))
    rsid = file.split('.')[0]
    df['snp'] = rsid
    snp_frames.append(df)
if not snp_frames:
    raise FileNotFoundError(f"no 'rs*' result files found in {phewas_dir}")
# Concatenate once instead of growing the frame inside the loop. This also
# avoids relying on the first directory entry being an 'rs*' file.
df_altas = pd.concat(snp_frames, ignore_index=True)
# sort by domain, trait, and p-value
df_altas = df_altas.sort_values(by=['Domain', 'Trait', 'P-value'], ascending=[True, True, True])

In [None]:
# Show the number of rows per value of the 'Domain' column in the merged table.
df_altas['Domain'].value_counts()

In [None]:
# Collapse the fine-grained domains into broader groups:
#   Endocrine, Nutritional                                -> Metabolic
#   Cell, Hematological                                   -> Cell and Hematological
#   Activities, Aging, Mortality, Social Interactions     -> Lifestyle Factor
#   Environmental                                         -> Environment
#   Psychiatric, Neurological, Cognitive                  -> Neuropsychiatric
#   Connective Tissue, Skeletal                           -> Body Structures
#   Ophthalmological, "Ear, Nose, Throat", Dermatological -> Sensory Systems
# Domains that are not listed (including Metabolic, Environment and
# Body Structures, which already carry the target name) are left unchanged.

# Back up the raw domain only once, so that re-running this cell does not
# overwrite the backup with already-merged values.
if 'domain_original' not in df_altas.columns:
    df_altas['domain_original'] = df_altas['Domain'].copy()
df_altas['Domain'] = df_altas['domain_original'].replace({
    'Endocrine': 'Metabolic',
    'Nutritional': 'Metabolic',
    'Activities': 'Lifestyle Factor',
    'Aging': 'Lifestyle Factor',
    'Mortality': 'Lifestyle Factor',
    # Same spelling as the three entries above; a differently-cased label
    # would show up as a separate category in value_counts / groupby.
    'Social Interactions': 'Lifestyle Factor',
    'Environmental': 'Environment',
    'Psychiatric': 'Neuropsychiatric',
    'Neurological': 'Neuropsychiatric',
    'Cognitive': 'Neuropsychiatric',
    'Connective Tissue': 'Body Structures',
    'Skeletal': 'Body Structures',
    "Ophthalmological": "Sensory Systems",
    "Ear, Nose, Throat": "Sensory Systems",
    "Dermatological": "Sensory Systems",
    'Cell': 'Cell and Hematological',
    'Hematological': 'Cell and Hematological',
})
print(df_altas['Domain'].value_counts())
# save the results
df_altas.to_csv('results/PheWAS/gwas_altas/gwas_altas_merged.csv', index=False)
print(len(df_altas))

In [None]:
# Keep a single row per (Domain, Trait) pair. Rows are ordered by ascending
# P-value within each pair first, so the row that survives is the one with the
# smallest P-value.
df_altas_rmdup = (
    df_altas
    .sort_values(by=['Domain', 'Trait', 'P-value'])  # ascending is the default
    .drop_duplicates(subset=['Domain', 'Trait'], keep='first')
)
print(len(df_altas_rmdup))
print(df_altas_rmdup['Domain'].value_counts())
df_altas_rmdup.to_csv(
    'results/PheWAS/gwas_altas/gwas_altas_merged_rmdup.csv',
    index=False,
)

In [None]:
# Keep only rows with P-value below 5e-8 (presumably the conventional
# genome-wide significance cutoff), then drop repeated
# (Domain, Trait, PMID, snp) rows, retaining the one with the smallest P-value.
df_altas_sig = (
    df_altas
    .loc[df_altas['P-value'] < 5e-8]
    .sort_values(by=['Domain', 'Trait', 'P-value'])  # ascending is the default
    .drop_duplicates(subset=['Domain', 'Trait', 'PMID', 'snp'], keep='first')
)
# Write the significant, de-duplicated associations to disk.
df_altas_sig.to_csv(
    'results/PheWAS/gwas_altas/gwas_altas_sig.csv',
    index=False,
)