# Perform the METAL meta-analysis

In this notebook we perform the meta-analysis of aggregated *All of Us* and UK Biobank GWAS results.

Note that this work is part of a larger project to [Demonstrate the Potential for Pooled Analysis of All of Us and UK Biobank Genomic Data](https://github.com/all-of-us/ukb-cross-analysis-demo-project). Specifically, this notebook covers the portion of the project that is the meta-analysis of **siloed** GWAS results.

# Setup

<div class="alert alert-block alert-warning">
    <b>Cloud Environment</b>: This notebook was written for use on the <i>All of Us</i> Workbench.
    <ul>
        <li>Use "Recommended Environment" <kbd><b>General Analysis</b></kbd> which creates compute type <kbd><b>Standard VM</b></kbd> with reasonable defaults for CPU, RAM, and disk.</li>
        <li>This notebook takes several minutes to run interactively. You can also run it in the background via <kbd>run_notebook_in_the_background</kbd> for the sake of provenance and reproducibility.</li>
    </ul>
</div>

In [None]:
from datetime import datetime
import os
import pandas as pd
import time

In [None]:
# Record the wall-clock start time; the final timing cell subtracts it from
# the end time to report how long the notebook took to run.
start = datetime.now()
print(start)

## Install METAL

In [None]:
%%bash

# Install METAL if it is not already installed.
#
# Stop at the first failing command so that a failed download is reported as
# such, instead of surfacing later as an error from tar.
set -o errexit -o nounset -o pipefail

if [ ! -f ./generic-metal/metal ] ; then

    # --fail makes curl exit non-zero on an HTTP error rather than saving the
    # server's error page as metal.tar.gz.
    curl --fail -L -o metal.tar.gz "http://csg.sph.umich.edu/abecasis/Metal/download/Linux-metal.tar.gz"
    tar -xf metal.tar.gz

fi

## Define Constants

In [None]:
# Papermill parameters. See https://papermill.readthedocs.io/en/latest/usage-parameterize.html

#---[ Inputs ]---
# Cloud Storage locations of the All of Us regenie result files, keyed by the
# lipid name that run_metal() is called with.
AOU_GWAS_RESULTS = {
    'HDL': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/aou/regenie/20230403/aou_alpha3_lipids_regenie_step2_HDL_norm.regenie',
    'LDL': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/aou/regenie/20230403/aou_alpha3_lipids_regenie_step2_LDL_adjusted_norm.regenie',
    'TC': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/aou/regenie/20230403/aou_alpha3_lipids_regenie_step2_TC_adjusted_norm.regenie',
    'TG': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/aou/regenie/20230403/aou_alpha3_lipids_regenie_step2_TG_adjusted_norm.regenie'
}

# Cloud Storage locations of the UK Biobank regenie result files, using the
# same lipid keys as AOU_GWAS_RESULTS.
UKB_GWAS_RESULTS = {
    'HDL': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/ukb/regenie/20230403/ukb_200kwes_lipids_regenie_step2_HDL_mg_dl_norm.regenie',
    'LDL': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/ukb/regenie/20230403/ukb_200kwes_lipids_regenie_step2_LDL_adj_mg_dl_norm.regenie',
    'TC': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/ukb/regenie/20230403/ukb_200kwes_lipids_regenie_step2_TC_adj_mg_dl_norm.regenie',
    'TG': 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/ukb/regenie/20230403/ukb_200kwes_lipids_regenie_step2_TG_log_mg_dl_norm.regenie'
}

# LDL METAL output from an earlier run. It is loaded later in the notebook as
# the baseline that the newly generated LDL results are compared against.
METAL_RESULTS_BEFORE_INDEL_ID_FIXES = 'gs://fc-secure-e5c31994-13bb-4e3e-b7d7-db6effadc54f/data/metaanalysis/20230404/METAANALYSIS_LDL_1.tbl'

#---[ Outputs ]---
# Create a timestamp for a folder of results generated today.
DATESTAMP = time.strftime('%Y%m%d')
# Destination folder for the METAANALYSIS_* files copied at the end of the notebook.
# NOTE(review): os.getenv() returns None when WORKSPACE_BUCKET is unset, which
# would make this path start with "None/" -- confirm the variable is always
# defined in the environment this notebook runs in.
METAL_OUTPUTS = f'{os.getenv("WORKSPACE_BUCKET")}/data/metaanalysis/{DATESTAMP}/'

## Transfer inputs to local disk

In [None]:
# Copy every All of Us result file listed in AOU_GWAS_RESULTS into the current directory.
!gsutil -m cp {' '.join(AOU_GWAS_RESULTS.values())} .

In [None]:
# Copy every UK Biobank result file listed in UKB_GWAS_RESULTS into the current directory.
!gsutil -m cp {' '.join(UKB_GWAS_RESULTS.values())} .

In [None]:
# List the local .regenie files with their sizes to confirm the copies above.
!ls -lh *.regenie

## Fix ID column

Fix UKB ids to use `chr_` format and also expand the ids for indels.

In [None]:
# Show the first lines of the All of Us HDL results, for comparison with the UKB file shown next.
!head aou_alpha3_lipids_regenie_step2_HDL_norm.regenie

In [None]:
# Show the first lines of the UK Biobank HDL results.
!head ukb_200kwes_lipids_regenie_step2_HDL_mg_dl_norm.regenie

In [None]:
# Show a few UKB rows whose line contains ':I:' (the abbreviated form that
# expand_indel_ids handles in its 'I' branch).
!grep ':I:' ukb_200kwes_lipids_regenie_step2_HDL_mg_dl_norm.regenie | head

In [None]:
# Show a few UKB rows whose line contains ':D:' (the abbreviated form that
# expand_indel_ids handles in its 'D' branch).
!grep ':D:' ukb_200kwes_lipids_regenie_step2_HDL_mg_dl_norm.regenie | head

In [None]:
# Name of the column added to the UKB results to hold the rewritten ids. It is
# also used as the suffix of the rewritten files and as the METAL marker
# column for the UKB input.
FIXED_ID_COL = 'FIXED_ID'

In [None]:
BASES = set(['A', 'C', 'G', 'T'])

def expand_indel_ids(row):
    """Build a `chr<chrom>_<pos>_<ref>_<alt>` id for one row of GWAS results.

    The row's `ID` is split on ':' and its first four fields are used:
    chromosome, position and two allele fields. Two layouts are handled:

    * Both allele fields consist only of the letters A/C/G/T: they are copied
      into the new id unchanged.
    * The third field is `I` or `D` and the fourth is a length (values such
      as `3.01` are rounded): ref and alt are taken from `ALLELE0` and
      `ALLELE1`. For `I` the shorter allele is ref, for `D` the longer allele
      is ref, and the length difference must equal the length in the id.

    Args:
        row: Mapping-like object with string values under 'ID', 'ALLELE0' and
            'ALLELE1', e.g. a row passed by `DataFrame.apply(..., axis=1)`.

    Returns:
        The rewritten id, or None when the id cannot be rewritten. In that
        case a two-line diagnostic is printed and no exception escapes, so one
        bad row does not abort a whole `DataFrame.apply`.
    """
    try:
        id_fields = row['ID'].split(':')
        if len(id_fields) < 4:
            # Without this guard a short id raised IndexError, which was not
            # handled below and propagated to the caller.
            raise ValueError('unhandled id format')
        prefix = f'chr{id_fields[0]}_{id_fields[1]}'

        # ID holds bases for the ref allele.
        if set(id_fields[2]).issubset(BASES):
            # Explicit raise rather than `assert`, so the check still runs
            # under `python -O`.
            if not set(id_fields[3]).issubset(BASES):
                raise ValueError(f'{row["ID"]}: {row["ALLELE0"]} {row["ALLELE1"]}')
            # Just return ID in chr#_ format.
            return f'{prefix}_{id_fields[2]}_{id_fields[3]}'

        # ID holds abbreviated notation for indels.
        change_len = round(float(id_fields[3]))  # Some values were 3.01.
        indel_type = id_fields[2]
        if indel_type not in ('I', 'D'):
            raise ValueError('unhandled id format')

        allele0 = row['ALLELE0']
        allele1 = row['ALLELE1']
        if len(allele0) == len(allele1):
            # Equal-length alleles cannot describe an insertion or deletion.
            # Previously ref/alt were left unassigned here and the resulting
            # UnboundLocalError escaped the handlers below.
            raise ValueError(
                f'{row["ID"]}: alleles have equal length, expected an indel: '
                f'{allele0} {allele1}')

        shorter, longer = sorted((allele0, allele1), key=len)
        if len(longer) - len(shorter) != change_len:
            raise ValueError(f'{row["ID"]}: {row["ALLELE0"]} {row["ALLELE1"]}')

        # An insertion goes from the shorter to the longer allele; a deletion
        # goes from the longer to the shorter one.
        if indel_type == 'I':
            ref, alt = shorter, longer
        else:
            ref, alt = longer, shorter

        # Return expanded form of ID in chr#_ format.
        return f'{prefix}_{ref}_{alt}'

    except ValueError as e:
        print(f'failed to fix id {row["ID"]}: {row["ALLELE0"]} {row["ALLELE1"]}')
        print(e)
        return None


In [None]:
# For each UKB results file already copied to local disk: read the
# space-separated table, append the rewritten id column, and save the result
# as CSV next to the original with FIXED_ID_COL appended to the file name.
for ukb_results_uri in UKB_GWAS_RESULTS.values():
    local_name = os.path.basename(ukb_results_uri)
    (
        pd.read_csv(local_name, sep=' ')
        .assign(**{FIXED_ID_COL: lambda results: results.apply(expand_indel_ids, axis=1)})
        .to_csv(f'{local_name}_{FIXED_ID_COL}', index=False)
    )

In [None]:
# Show the first lines of the rewritten UKB HDL file to check the new id column.
!head ukb_200kwes_lipids_regenie_step2_HDL_mg_dl_norm.regenie_{FIXED_ID_COL}

# METAL - Meta Analysis

In [None]:
def run_metal(lipid):
    """Write a METAL script for one lipid and run METAL on it.

    Args:
        lipid: Key present in both AOU_GWAS_RESULTS and UKB_GWAS_RESULTS
            (the notebook calls this with 'HDL', 'LDL', 'TC' and 'TG').

    Side effects:
        Prints the generated script, writes it to `METAL_<lipid>.txt` in the
        current directory, and invokes `./generic-metal/metal` on that file.
        The input files are referenced by base name, so they must already be
        in the current directory. Nothing is returned.
    """
    aou_file = os.path.basename(AOU_GWAS_RESULTS[lipid])
    # Use the file with the IDs in the 'chr' format.
    ukb_file = os.path.basename(UKB_GWAS_RESULTS[lipid]) + '_' + FIXED_ID_COL


    # The script describes the two inputs in turn: the All of Us file is read
    # as whitespace-separated with marker column ID, and the rewritten UKB
    # file as comma-separated with marker column FIXED_ID_COL.
    # The OUTFILE line passes the prefix METAANALYSIS_<lipid>_ and the suffix
    # .tbl; a later cell reads the LDL output as METAANALYSIS_LDL_1.tbl.
    # NOTE(review): METAL's ALLELE command is documented as taking the effect
    # allele first, while regenie reports BETA for ALLELE1 -- confirm that
    # "ALLELE ALLELE0 ALLELE1" is the intended order.
    # NOTE(review): the script names a "Pvalue" column; the input headers are
    # not visible here -- confirm that column exists in both inputs.
    metal_parameters = f'''
SCHEME STDERR
AVERAGEFREQ ON
MINMAXFREQ ON

MARKER ID
ALLELE ALLELE0 ALLELE1
EFFECT BETA
STDERR SE
PVALUE Pvalue
FREQ A1FREQ
SEPARATOR WHITESPACE
PROCESS {aou_file}

MARKER {FIXED_ID_COL}
ALLELE ALLELE0 ALLELE1
EFFECT BETA
STDERR SE
PVALUE Pvalue
FREQ A1FREQ
SEPARATOR COMMA
PROCESS {ukb_file}

OUTFILE METAANALYSIS_{lipid}_ .tbl
ANALYZE

QUIT
'''
    print(f'Metal parameters:\n{metal_parameters}')

    # Keep the script on disk so the exact parameters of each run can be inspected.
    metal_parameters_filename = f'METAL_{lipid}.txt'
    with open(metal_parameters_filename, 'w') as param_file:
        param_file.write(metal_parameters)

    # IPython shell escape; the exit status of METAL is not checked here.
    !./generic-metal/metal {metal_parameters_filename}

In [None]:
# Meta-analyze the HDL results.
run_metal('HDL')

In [None]:
# Meta-analyze the LDL results; the checks later in the notebook read this run's output.
run_metal('LDL')

In [None]:
# Meta-analyze the TC results.
run_metal('TC')

In [None]:
# Meta-analyze the TG results.
run_metal('TG')

In [None]:
# List the METAL output files, newest first, with sizes.
!ls -lth METAANALYSIS_*

# Check that fixed ids are a subset of the unfixed ids

In [None]:
# Load the LDL results of the earlier run (tab-separated) as the baseline.
prior_results = pd.read_csv(METAL_RESULTS_BEFORE_INDEL_ID_FIXES, sep='\t')

prior_results.head(n=5)

In [None]:
# Load the LDL results produced by this run (tab-separated).
current_results = pd.read_csv('METAANALYSIS_LDL_1.tbl', sep='\t')

current_results.head(n=5)

In [None]:
# Distinct marker names in the baseline results, and how many there are.
prior_ids = set(prior_results['MarkerName'])
len(prior_ids)

In [None]:
# Distinct marker names in this run's results, and how many there are.
current_ids = set(current_results['MarkerName'])
len(current_ids)

In [None]:
# This run must contain strictly fewer distinct markers than the baseline run.
assert len(current_ids) < len(prior_ids)

In [None]:
# Marker names that appear in this run but not in the baseline, and their count.
newer_ids = current_ids.difference(prior_ids)
len(newer_ids)

In [None]:
# Keep only the rows of this run whose marker name is new relative to the baseline.
is_newer = current_results['MarkerName'].isin(newer_ids)
newer_results = current_results.loc[is_newer]

newer_results.shape

In [None]:
# Display the new marker names for manual inspection.
# NOTE(review): this prints the whole set; consider showing a sample if it is large.
newer_ids

In [None]:
# Count the non-null values in every other column for each distinct Direction
# value among the new markers.
newer_results.groupby('Direction').count()

In [None]:
# The two Direction strings that the assertion in the next cell accepts for
# the new markers. This cell only displays them.
'?+', '?-'

In [None]:
# Every marker that is new relative to the baseline must carry one of the two
# Direction strings below (presumably '?' marks the first processed study,
# All of Us, as having no data for that marker -- confirm against the METAL
# documentation). A subset test is used rather than strict equality so the
# check does not fail merely because only one of the two signs occurs, or
# because there are no new markers at all.
ukb_only_directions = {'?+', '?-'}
observed_directions = set(newer_results['Direction'].to_list())
assert observed_directions <= ukb_only_directions, 'check that all "new" ids are from UKB only'

# Store outputs in the workspace bucket

In [None]:
# Copy all METAL output files to the dated output folder defined by METAL_OUTPUTS.
!gsutil -m cp METAANALYSIS_* {METAL_OUTPUTS}

In [None]:
# List the output folder to confirm the copy.
!gsutil ls -lh {METAL_OUTPUTS}

In [None]:
# Report the finish time and the total elapsed time since `start` was recorded.
end = datetime.now()
print(end)
print(end - start)

# Provenance

In [None]:
# Record the system date and time of this run.
!date