## Index
- [Intersect the two datasets](#1.-Intersecting-HGDP+1kG-unrelateds-with-GGV)
- [Apply the gnomAD RF model to the HGDP+1kG+GGV intersect](#2.-Use-gnomAD-RF-and-apply-to-HGDP+1kG+GGV-intersect)
- [Build an RF model using HGDP+1kG and apply it to a new dataset (GGV)](#3.-Build-a-RF-with-1kG-+-HGDP-(unrelateds)-using-genetic-region-labels,-apply-it-to-a-new-dataset-GGV)

In [1]:
import hail as hl
import pickle
import pandas as pd
from gnomad.sample_qc.ancestry import assign_population_pcs, pc_project
from sklearn.ensemble import RandomForestClassifier
from typing import Tuple
from bokeh.io import show, output_notebook, output_file
from bokeh.layouts import column, row
from bokeh.plotting import figure
from bokeh.models.widgets import Panel, Tabs
from bokeh.models import ColumnDataSource, Legend, TableColumn, DataTable
from bokeh.transform import factor_cmap
# Direct Bokeh output to the notebook so the later show(...) calls render inline.
output_notebook()

# 1. Intersecting HGDP+1kG unrelateds with GGV

In [2]:
# Reference panel: the large HGDP+1KG MatrixTable.
# (Alternative input used later in this notebook:
#  'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/unrelated.mt')
hgdp_tgp_mt_path = 'gs://hgdp-1kg/hgdp_tgp/intermediate_files/pre_running_varqc.mt'
mt_unrel = hl.read_matrix_table(hgdp_tgp_mt_path, _n_partitions=500)
n_hgdp_tgp_variants = mt_unrel.count_rows()
print(f'Number of variants in HGDP+1KG before intersecting: {n_hgdp_tgp_variants}')

# Target dataset: GGV, a sparse MT produced by combining GVCFs.
ggv_mt_path = 'gs://gnomaf/gambian-genomes/COMBINED_GVCFS/gambian_genomes_merged_gvcfs.mt'
mt_ggv = hl.read_matrix_table(ggv_mt_path, _n_partitions=500)

# Hail keeps the non-variant sites (rows carrying only the REF allele) in the
# combined MT, so restrict to rows that list at least one ALT allele.
# NOTE(review): if this MT is still sparse, hom-ref calls that live inside
# reference blocks may be missing at these rows - confirm whether it needs densifying.
has_alt_allele = hl.len(mt_ggv.alleles) > 1
mt_ggv = mt_ggv.filter_rows(has_alt_allele)
n_ggv_variants = mt_ggv.count_rows()
print(f'Number of variant sites only in GGV before intersecting: {n_ggv_variants}')

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.2
SparkUI available at http://qc-notebook4-m.c.diverse-pop-seq-ref.internal:32861
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.95-513139587f57
LOGGING: writing to /home/hail/hail-20220812-2034-0.2.95-513139587f57.log


Number of variants in HGDP+1KG before intersecting: 155648020
Number of variant sites only in GGV before intersecting: 61164017


### In order to combine two datasets with `union_cols`, three requirements must be met:

1. The row keys must match.

2. The column key schemas and column schemas must match.

3. The entry schemas must match.

In [3]:
# Strip HGDP+1KG down to the fields shared with GGV so the two schemas line up:
# column field `s`, row fields `locus`/`alleles`/`rsid`, entry field `GT`.
# Keys are dropped first so the selects define the full schema, then restored.
mt_unrel_unkeyed = mt_unrel.key_cols_by().key_rows_by()
mt_unrel_clean = (
    mt_unrel_unkeyed
    .select_cols('s')
    .select_rows('locus', 'alleles', 'rsid')
    .select_entries('GT')
    .key_cols_by('s')
    .key_rows_by('locus', 'alleles')
)

# Sample IDs of the reference panel, used later to split plots into ref vs. GGV.
hgdp_tgp_samples = mt_unrel_clean.s.collect()

In [4]:
# Reduce GGV to `s`, `locus`/`alleles`/`rsid` and a single `GT` entry field.
mt_ggv_unkeyed = mt_ggv.key_cols_by().key_rows_by()
ggv_slim = mt_ggv_unkeyed.select_cols('s').select_rows('locus', 'alleles', 'rsid')

# GGV stores its calls as LGT; expose them under the name GT.
# NOTE(review): LGT is copied as-is - presumably safe only where local and
# global allele indices coincide (e.g. biallelic rows); confirm.
mt_ggv_clean = (
    ggv_slim
    .select_entries(GT=ggv_slim.LGT)
    .key_cols_by('s')
    .key_rows_by('locus', 'alleles')
)

# Keep the GGV sample IDs so we can later check how the RF model classified them.
ggv_samples = mt_ggv_clean.s.collect()

In [5]:
# Append the GGV samples as new columns. 'inner' is the default row join, spelled
# out here because it is what makes this an intersection: only variants present
# in both datasets are kept.
hgdp_tgp_ggv_intersect = mt_unrel_clean.union_cols(
    mt_ggv_clean,
    row_join_type='inner',
)

In [6]:
# Persist the intersected MatrixTable so downstream cells read from disk instead
# of recomputing the join. A plain checkpoint() (overwrite=False) fails once the
# output exists, which breaks "Restart & Run All"; reuse a completed write instead.
# NOTE: delete the path manually if the inputs change, otherwise the cached copy is used.
intersect_mt_path = 'gs://hgdp-1kg/hgdp_tgp_ggv_intersect.mt'
if hl.hadoop_exists(f'{intersect_mt_path}/_SUCCESS'):
    # A finished write is marked by the _SUCCESS file inside the .mt directory.
    hgdp_tgp_ggv_intersect = hl.read_matrix_table(intersect_mt_path)
else:
    hgdp_tgp_ggv_intersect = hgdp_tgp_ggv_intersect.checkpoint(intersect_mt_path)

2022-08-12 20:41:50 Hail: INFO: Coerced sorted dataset
2022-08-12 20:44:10 Hail: INFO: Coerced sorted dataset
2022-08-12 20:50:45 Hail: INFO: wrote matrix table with 26452039 rows and 4513 columns in 996 partitions to gs://hgdp-1kg/hgdp_tgp_ggv_intersect.mt


In [7]:
# Re-open the intersected MatrixTable from the path it was checkpointed to.
intersect_mt_path = 'gs://hgdp-1kg/hgdp_tgp_ggv_intersect.mt'
hgdp_tgp_ggv_intersect = hl.read_matrix_table(intersect_mt_path)

In [8]:
# Row count of the HGDP+1KG / GGV intersection.
n_intersect_variants = hgdp_tgp_ggv_intersect.count_rows()
print(f'Number of variants after intersecting HGDP+1KG with GGV: {n_intersect_variants}')

Number of variants after intersecting HGDP+1KG with GGV: 26452039


# 2. Use gnomAD RF and apply to HGDP+1kG+GGV intersect

In [9]:
# gnomAD v3.1 PCA loadings (per-variant `loadings` and `pca_af`)
gnomad_loadings_path = 'gs://gcp-public-data--gnomad/release/3.1/pca/gnomad.v3.1.pca_loadings.ht'
loadings_ht = hl.read_table(gnomad_loadings_path)

# Project the combined HGDP+1KG+GGV genotypes onto the gnomAD PC space;
# the result is a table of per-sample `scores`.
ht = hl.experimental.pc_project(
    call_expr=hgdp_tgp_ggv_intersect.GT,
    loadings_expr=loadings_ht.loadings,
    af_expr=loadings_ht.pca_af,
)

2022-08-12 20:51:02 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [10]:
# Size of the gnomAD loadings table (one row per variant used in the gnomAD PCA).
n_loading_variants = loadings_ht.count()
print(f'Number of variants in gnomAD loadings: {n_loading_variants}')

Number of variants in gnomAD loadings: 76399


In [11]:
# Attach the gnomAD loadings and allele frequency to each row of the intersect;
# rows whose key is absent from the loadings table get missing values.
gnomad_loading_row = loadings_ht[hgdp_tgp_ggv_intersect.row_key]
hgdp_tgp_ggv_intersect = hgdp_tgp_ggv_intersect.annotate_rows(
    pca_loadings=gnomad_loading_row.loadings,
    pca_af=gnomad_loading_row.pca_af,
)

In [12]:
# Count the variants present in both the gnomAD loadings and the intersect.
# The more loading variants are missing from our data, the less accurate the
# classification will be.
row_has_gnomad_loadings = (
    hl.is_defined(hgdp_tgp_ggv_intersect.pca_loadings)
    & hl.is_defined(hgdp_tgp_ggv_intersect.pca_af)
)
gnomad_loadings_data_interset_count = (
    hgdp_tgp_ggv_intersect.filter_rows(row_has_gnomad_loadings).count_rows()
)

In [13]:
# Report how many gnomAD loading variants survive in the intersected dataset.
overlap_message = (
    f'Number of variants common between HGDP+1KG+GGV & gnomAD RF: {gnomad_loadings_data_interset_count}'
)
print(overlap_message)

Number of variants common between HGDP+1KG+GGV & gnomAD RF: 39557


In [14]:
# Load the fitted gnomAD v3.1 random-forest model.
# SECURITY: pickle.load executes arbitrary code from the file - only ever point
# this at a trusted source such as the public gnomAD release bucket used here.
gnomad_rf_path = 'gs://gcp-public-data--gnomad/release/3.1/pca/gnomad.v3.1.RF_fit.pkl'
with hl.hadoop_open(gnomad_rf_path, 'rb') as rf_file:
    fit = pickle.load(rf_file)



In [15]:
# Keep only as many PCs as the RF model was trained on (6 for v2, 16 for v3.1).
# NOTE(review): `n_features_` is deprecated in newer scikit-learn releases in
# favour of `n_features_in_` - confirm against the installed version.
num_pcs = fit.n_features_
leading_scores = ht.scores[:num_pcs]
ht = ht.annotate(scores=leading_scores)

# Assign population labels from the projected PCs with the pre-trained gnomAD RF.
# PCs are addressed 1-based.
one_based_pcs = list(range(1, num_pcs + 1))
ht, rf_model = assign_population_pcs(ht, pc_cols=one_based_pcs, fit=fit)

2022-08-12 20:53:12 Hail: INFO: Coerced sorted dataset
INFO (gnomad.sample_qc.ancestry 230): Found the following sample count after population assignment: oth: 2989, amr: 366, afr: 1108, sas: 49, nfe: 1


In [16]:
# Explode the `pca_scores` array into PC1..PC<num_pcs> columns, convert to pandas,
# and upper-case the population labels so they match the plotting colour keys.
pc_score_fields = {f'PC{i}': ht.pca_scores[i - 1] for i in range(1, num_pcs + 1)}
gnomad_rf_output = (
    ht.transmute(**pc_score_fields)
    .to_pandas()
    .assign(pop=lambda scores_df: scores_df['pop'].str.upper())
)

2022-08-12 20:53:44 Hail: INFO: Coerced sorted dataset


### Plotting

In [17]:
# Colours per population label. Besides the HGDP+1KG region labels, this also
# covers 'NFE' and 'SAS': the gnomAD RF emits those labels (see its sample-count
# log), and without an entry here those samples were silently left off the plots.
# They reuse the EUR and CSA colours respectively.
color_map = {'AFR': "#984EA3", 'EAS': "#4DAF4A", 'EUR': "#377EB8", 'CSA': "#FF7F00",
             'AMR': "#E41A1C", 'MID': "#A65628", 'OCE': "#000000", 'OTH': "#F0E442",
             'NFE': "#377EB8", 'SAS': "#FF7F00"}
# Used for any population label that has no entry in color_map.
fallback_color = '#999999'

tabs1 = []

# Split the gnomAD-RF output into reference (HGDP+1KG) and GGV samples.
ref_samples_df1 = gnomad_rf_output[gnomad_rf_output['s'].isin(hgdp_tgp_samples)]
ggv_samples_df1 = gnomad_rf_output[gnomad_rf_output['s'].isin(ggv_samples)]


def plot_pca(ref_df=None, data_df=None, pc1=None, pc2=None):
    """Draw three PC scatter plots for one pair of principal components.

    Args:
        ref_df: DataFrame of reference samples with a 'pop' column and PC columns.
        data_df: DataFrame of projected (GGV) samples with the same columns.
        pc1: Name of the column plotted on the x axis (e.g. 'PC1').
        pc2: Name of the column plotted on the y axis (e.g. 'PC2').

    Returns:
        Tuple ``(pref, pdata, pcomb)``: reference coloured by population, data
        coloured by population, and data coloured by population drawn on top of
        the reference in grey.
    """
    def new_figure(title):
        # All three panels share size, background, legend placement and axis labels.
        fig = figure(width=600, height=500, background_fill_color='#fafafa', title=title)
        fig.add_layout(Legend(), 'right')
        fig.xaxis.axis_label = pc1
        fig.yaxis.axis_label = pc2
        return fig

    def scatter_by_pop(fig, df):
        # Only draw populations that actually occur, so the legend has no empty
        # entries; known labels keep color_map order, unknown ones follow sorted.
        present = set(df['pop'].dropna().unique())
        ordered = [pop for pop in color_map if pop in present]
        ordered += sorted(present.difference(color_map))
        for pop in ordered:
            pop_df = df[df['pop'] == pop]
            fig.circle(pop_df[pc1].tolist(), pop_df[pc2].tolist(),
                       size=3, color=color_map.get(pop, fallback_color),
                       alpha=0.8, legend_label=pop)

    pref = new_figure('HGDP+1KG')
    pdata = new_figure('GGV')
    pcomb = new_figure('HGDP+1KG+GGV')

    # Grey backdrop of all reference samples on the combined panel.
    pcomb.circle(ref_df[pc1].tolist(), ref_df[pc2].tolist(), size=3, color='grey', alpha=0.3)

    scatter_by_pop(pref, ref_df)
    scatter_by_pop(pdata, data_df)
    scatter_by_pop(pcomb, data_df)

    return pref, pdata, pcomb


# One tab per consecutive PC pair: PC1vPC2, PC3vPC4, ...
for i in range(1, num_pcs, 2):
    xpc = f'PC{i}'
    ypc = f'PC{i + 1}'

    p1, p2, p3 = plot_pca(ref_df=ref_samples_df1, data_df=ggv_samples_df1, pc1=xpc, pc2=ypc)

    tab = Panel(child=column(row(p1, p2), row(p3)), title=f'{xpc}v{ypc}')
    tabs1.append(tab)

In [18]:
# Render the gnomAD-RF PCA panels, one tab per PC pair.
gnomad_rf_tabs = Tabs(tabs=tabs1)
show(gnomad_rf_tabs)


# 3. Build a RF with 1kG + HGDP (unrelateds) using genetic region labels, apply it to a new dataset GGV

In [20]:
def intersect_ref(ref_mt: hl.MatrixTable = None, data_mt: hl.MatrixTable = None) -> Tuple[hl.MatrixTable, hl.MatrixTable]:
    """Restrict a reference and a data MatrixTable to the rows they share.

    Rows are matched on each table's row key. The (rows, cols) shape of each
    filtered table is printed, which triggers a count of both.

    Args:
        ref_mt: Reference MatrixTable.
        data_mt: Data MatrixTable to be compared against the reference.

    Returns:
        ``(ref_in_data, data_in_ref)`` - the reference filtered to rows found in
        the data, followed by the data filtered to rows found in the reference.
    """
    # Data rows whose key also appears in the reference.
    shared_data_mt = data_mt.semi_join_rows(ref_mt.rows())
    print('sites in ref and data, inds in data: {}'.format(shared_data_mt.count()))

    # Reference rows whose key also appears in the data.
    shared_ref_mt = ref_mt.semi_join_rows(data_mt.rows())
    print('sites in ref and data, inds in ref: {}'.format(shared_ref_mt.count()))

    return shared_ref_mt, shared_data_mt


def run_ref_pca(mt: hl.MatrixTable = None, npcs: int = 20) -> Tuple[hl.Table, hl.Table]:
    """Run an HWE-normalized PCA on the reference genotypes.

    Args:
        mt: MatrixTable with a ``GT`` entry field.
        npcs: Number of principal components to compute.

    Returns:
        ``(pca_loadings, pca_scores)`` - the loadings table annotated with
        ``pca_af`` (mean alt-allele count / 2 per variant), and the scores table
        with the ``scores`` array replaced by ``PC1..PC<npcs>`` fields.
    """
    # Eigenvalues are returned by Hail but not used here.
    _, sample_scores, variant_loadings = hl.hwe_normalized_pca(
        mt.GT, k=npcs, compute_loadings=True
    )

    # Per-variant allele frequency, joined onto the loadings by row key.
    variant_af = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2).rows()
    variant_loadings = variant_loadings.annotate(
        pca_af=variant_af[variant_loadings.key].pca_af
    )

    # Individual-level PCs as separate fields.
    pc_fields = {f'PC{i}': sample_scores.scores[i - 1] for i in range(1, npcs + 1)}
    sample_scores = sample_scores.transmute(**pc_fields)

    return variant_loadings, sample_scores


def merge_data_with_ref(ref_scores: hl.Table = None,
        ref_info: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv',
        data_scores: hl.Table = None) -> hl.Table:
    """Label the reference PC scores with SuperPop and stack the data scores below.

    Args:
        ref_scores: Table of reference PC scores with a sample field ``s``.
        ref_info: Path to a delimited sample-info file that has ``Sample`` and
            ``SuperPop`` columns.
        data_scores: Table of projected PC scores for the new dataset.

    Returns:
        A Hail Table (not a pandas DataFrame; call ``.to_pandas()`` on it if
        needed) holding the union of the labelled reference rows and the data rows.
    """
    print('Merging data with ref')
    # Keep the path argument intact; the imported table gets its own name.
    ref_info_ht = hl.import_table(ref_info, impute=True, key='Sample')
    ref_merge = ref_scores.annotate(SuperPop=ref_info_ht[ref_scores.s].SuperPop)

    print('merging data and ref data')
    # unify=True lets the union proceed although data_scores has no SuperPop field.
    data_ref = ref_merge.union(data_scores, unify=True)
    print('Done merging data with ref')

    return data_ref


In [21]:
# use pruned MT to speed up things
mt_unrel = hl.read_matrix_table('gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/unrelated.mt',
                               _n_partitions=500)

In [22]:
# intersect_ref returns (reference restricted to shared sites, data restricted to
# shared sites), in that order.
shared_site_mts = intersect_ref(ref_mt=mt_unrel, data_mt=mt_ggv)
hgdp_tgp_in_ggv_mt, ggv_in_hgdp_tgp_mt = shared_site_mts

sites in ref and data, inds in data: (211741, 394)
sites in ref and data, inds in ref: (211741, 3380)


In [23]:
# PCA on the reference panel, restricted to the sites shared with GGV.
n_reference_pcs = 20
ref_pca_loadings, ref_pca_scores = run_ref_pca(
    mt=hgdp_tgp_in_ggv_mt,
    npcs=n_reference_pcs,
)

2022-08-12 21:00:23 Hail: INFO: hwe_normalize: found 211741 variants after filtering out monomorphic sites.
2022-08-12 21:01:44 Hail: INFO: pca: running PCA with 20 components...
2022-08-12 21:05:55 Hail: INFO: Coerced sorted dataset


In [24]:
# Project the GGV samples onto the reference PCs.
# The gnomAD pc_project function requires the genotype to be encoded as GT, not LGT.
n_projected_pcs = 20
ggv_in_hgdp_tgp_mt = ggv_in_hgdp_tgp_mt.select_entries(GT=ggv_in_hgdp_tgp_mt.LGT)

data_projections_ht = pc_project(
    mt=ggv_in_hgdp_tgp_mt,
    loadings_ht=ref_pca_loadings,
    loading_location='loadings',
    af_location='pca_af',
)

# Turn the `scores` array into PC1..PC20 fields, matching the reference scores table.
projected_pc_fields = {
    f'PC{i}': data_projections_ht.scores[i - 1]
    for i in range(1, n_projected_pcs + 1)
}
data_scores = data_projections_ht.transmute(**projected_pc_fields)

In [25]:
# Stack the labelled reference scores and the projected GGV scores into one Hail
# Table (ref_info is left at its default path).
data_ref = merge_data_with_ref(ref_scores=ref_pca_scores, data_scores=data_scores)

# Convert to pandas; this frame is the input to assign_population_pcs below.
data_ref_df = data_ref.to_pandas()

Merging data with ref


2022-08-12 21:06:28 Hail: INFO: Reading table to impute column types
2022-08-12 21:06:31 Hail: INFO: Finished type imputation
  Loading field 'Sample' as type str (imputed)
  Loading field 'SuperPop' as type str (imputed)
  Loading field 'Project' as type str (imputed)


merging data and ref data
Done merging data with ref


2022-08-12 21:10:00 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-12 21:10:01 Hail: INFO: Coerced sorted dataset
2022-08-12 21:10:01 Hail: INFO: Coerced sorted dataset


In [26]:
# Train a random forest on the samples that carry a SuperPop label and classify
# every row of data_ref_df using the 20 PCs.
# Note: `ht` is a pandas DataFrame from here on (pandas in, pandas out).
pc_feature_names = [f'PC{i}' for i in range(1, 21)]
ht, rf_model = assign_population_pcs(
    data_ref_df,
    pc_cols=pc_feature_names,
    known_col="SuperPop",
)

INFO (gnomad.sample_qc.ancestry 230): Found the following sample count after population assignment: EUR: 662, oth: 330, EAS: 718, AMR: 387, CSA: 669, AFR: 843, OCE: 27, MID: 138


Random forest feature importances are as follows: [0.18639292 0.1884225  0.15411622 0.12593267 0.10580455 0.05179108
 0.05079407 0.03028211 0.02223709 0.01260428 0.01717959 0.0051473
 0.01624743 0.00080003 0.01039461 0.00760396 0.00246636 0.00647793
 0.00432485 0.00098046]
Estimated error rate for RF model is 0.004437869822485174


### Plotting

In [27]:
# Colours per label assigned by the HGDP+1KG-trained RF ('oth' = unassigned).
color_map = {'AFR': "#984EA3", 'EAS': "#4DAF4A", 'EUR': "#377EB8", 'CSA': "#FF7F00",
             'AMR': "#E41A1C", 'MID': "#A65628", 'OCE': "#000000", 'oth': "#F0E442"}
# Used for any population label that has no entry in color_map.
fallback_color = '#999999'
# Number of PCs to plot; must not exceed the number computed for the reference PCA.
n_plot_pcs = 20

tabs2 = []

# Read the assignments from `ht`, the frame returned by assign_population_pcs.
# NOTE(review): the input frame `data_ref_df` only carries a 'pop' column if the
# function wrote into its argument in place, and may not reflect the final
# (thresholded) labels - confirm against the installed gnomad version.
ref_samples_df2 = ht[ht['s'].isin(hgdp_tgp_samples)]
ggv_samples_df2 = ht[ht['s'].isin(ggv_samples)]


def plot_projected_pca(ref_df=None, data_df=None, pc1=None, pc2=None):
    """Plot projected samples, coloured by assigned population, over the reference.

    Named differently from the earlier ``plot_pca`` (which returns three figures)
    so that this cell no longer shadows it.

    Args:
        ref_df: DataFrame of reference samples with the PC columns.
        data_df: DataFrame of projected samples with a 'pop' column and PC columns.
        pc1: Name of the column plotted on the x axis.
        pc2: Name of the column plotted on the y axis.

    Returns:
        A single Bokeh figure.
    """
    pcomb = figure(width=600, height=500, background_fill_color='#fafafa', title='Reference+Projected')
    pcomb.add_layout(Legend(), 'right')
    pcomb.xaxis.axis_label = pc1
    pcomb.yaxis.axis_label = pc2

    # Grey backdrop: every reference sample.
    pcomb.circle(ref_df[pc1].tolist(), ref_df[pc2].tolist(), size=3, color='grey', alpha=0.8)

    # Only draw populations that actually occur, so the legend has no empty
    # entries; known labels keep color_map order, unknown ones follow sorted.
    present = set(data_df['pop'].dropna().unique())
    ordered = [pop for pop in color_map if pop in present]
    ordered += sorted(present.difference(color_map))
    for pop in ordered:
        pop_df = data_df[data_df['pop'] == pop]
        pcomb.circle(pop_df[pc1].tolist(), pop_df[pc2].tolist(),
                     size=3, color=color_map.get(pop, fallback_color),
                     alpha=0.8, legend_label=pop)

    return pcomb


# One tab per consecutive PC pair: PC1vPC2, PC3vPC4, ...
for i in range(1, n_plot_pcs, 2):
    xpc = f'PC{i}'
    ypc = f'PC{i + 1}'

    plot1 = plot_projected_pca(ref_df=ref_samples_df2, data_df=ggv_samples_df2, pc1=xpc, pc2=ypc)

    tab = Panel(child=plot1, title=f'{xpc}v{ypc}')
    tabs2.append(tab)

In [28]:
# Render the reference+projected PCA panels, one tab per PC pair.
projected_pca_tabs = Tabs(tabs=tabs2)
show(projected_pca_tabs)


In [29]:
# Number of GGV samples assigned to each population label.
ggv_assigned_pops = ggv_samples_df2['pop']
ggv_assigned_pops.value_counts()

AFR    394
Name: pop, dtype: int64