In [1]:
# Import packages.
import intake
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Open the Ag1000G phase 3 data catalog.
cat = intake.open_catalog("https://malariagen.github.io/intake/gcs.yml")

In [3]:
def load_sample_metadata(sample_set, 
                         include_aim_species_calls=True, 
                         include_pca_species_calls=False, 
                         species_analysis="species_calls_20200422"):
    """Load sample metadata, optionally including species calls.

    Parameters
    ----------
    sample_set : str
        Sample set.
    include_aim_species_calls : bool
        If True, include AIM calls.
    include_pca_species_calls : bool
        If True, include PCA calls.
    species_analysis : str
        Species analysis.

    Returns
    -------
    df : pandas.DataFrame

    Notes
    -----
    If both AIMs and PCA are requested, species calls columns are appended with 
    "_aim" and "_pca" respectively.

    """

    if isinstance(sample_set, str):

        df = cat.ag3.samples(sample_set=sample_set).read()
        df["sample_set"] = sample_set

        if include_aim_species_calls:
            df_aim = cat.ag3[f"{species_analysis}_aim"](sample_set=sample_set).read()
            
        if include_pca_species_calls:
            df_pca = cat.ag3[f"{species_analysis}_pca"](sample_set=sample_set).read()

        df_species = None

        if include_aim_species_calls and include_pca_species_calls:
            df_species = df_aim.merge(df_pca, on="sample_id", lsuffix="_aim", rsuffix="_pca", sort=False)
            
        elif include_aim_species_calls:
            df_species = df_aim
            
        elif include_pca_species_calls:
            df_species = df_species

        if df_species is not None:
            df = df.merge(df_species, on="sample_id", sort=False)
    
        return df

    elif isinstance(sample_set, (list, tuple)):

        return pd.concat(
            [load_sample_metadata(
                sample_set=s, 
                include_aim_species_calls=include_aim_species_calls, 
                include_pca_species_calls=include_pca_species_calls, 
                species_analysis=species_analysis) 
             for s in sample_set],
            axis=0, sort=False)
    
    else:
        raise TypeError("Type of `sample_set` must be string or list of strings")
    

In [4]:
df_sample_sets = cat.ag3.sample_sets.read()

  import pandas.util.testing as tm


In [5]:
all_wild = [x for x in df_sample_sets.sample_set if x != "AG1000G-X"]
all_wild

['AG1000G-AO',
 'AG1000G-BF-A',
 'AG1000G-BF-B',
 'AG1000G-BF-C',
 'AG1000G-CD',
 'AG1000G-CF',
 'AG1000G-CI',
 'AG1000G-CM-A',
 'AG1000G-CM-B',
 'AG1000G-CM-C',
 'AG1000G-FR',
 'AG1000G-GA-A',
 'AG1000G-GH',
 'AG1000G-GM-A',
 'AG1000G-GM-B',
 'AG1000G-GM-C',
 'AG1000G-GN-A',
 'AG1000G-GN-B',
 'AG1000G-GQ',
 'AG1000G-GW',
 'AG1000G-KE',
 'AG1000G-ML-A',
 'AG1000G-ML-B',
 'AG1000G-MW',
 'AG1000G-MZ',
 'AG1000G-TZ',
 'AG1000G-UG']

In [6]:
df_meta = load_sample_metadata(sample_set=all_wild)
# call species
df_meta['species'] = 'unknown'
df_meta.loc[df_meta.species_gambcolu_arabiensis == 'arabiensis', 'species'] = 'arabiensis'
df_meta.loc[df_meta.species_gambcolu_arabiensis == 'intermediate', 'species'] = 'intermediate_arabiensis_gambiae'
df_meta.loc[(df_meta.species_gambcolu_arabiensis == 'gamb_colu') & (df_meta.species_gambiae_coluzzii == 'gambiae'), 'species'] = 'gambiae'
df_meta.loc[(df_meta.species_gambcolu_arabiensis == 'gamb_colu') & (df_meta.species_gambiae_coluzzii == 'coluzzii'), 'species'] = 'coluzzii'
df_meta.loc[(df_meta.species_gambcolu_arabiensis == 'gamb_colu') & (df_meta.species_gambiae_coluzzii == 'intermediate'), 'species'] = 'intermediate_gambiae_coluzzii'
# special-case locations for scattered sample sets
df_meta.loc[df_meta.sample_set == 'AG1000G-ML-B', 'location'] = 'misc'
df_meta.loc[df_meta.sample_set == 'AG1000G-CM-B', 'location'] = 'misc'
df_meta.loc[df_meta.sample_set == 'AG1000G-CM-C', 'location'] = 'misc'
df_meta.loc[df_meta.sample_set == 'AG1000G-FR', 'location'] = 'misc'
df_size = pd.DataFrame(df_meta.fillna("").groupby(by=["country", "location", "species", "year"]).size(), columns=['n_samples'])


In [7]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_size)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n_samples
country,location,species,year,Unnamed: 4_level_1
Angola,Luanda,coluzzii,2009,81
Burkina Faso,Bana,arabiensis,2014,1
Burkina Faso,Bana,coluzzii,2012,42
Burkina Faso,Bana,coluzzii,2014,47
Burkina Faso,Bana,gambiae,2012,22
Burkina Faso,Bana,gambiae,2014,15
Burkina Faso,Bana,intermediate_gambiae_coluzzii,2012,1
Burkina Faso,Monomtenga,gambiae,2004,13
Burkina Faso,Pala,arabiensis,2014,2
Burkina Faso,Pala,coluzzii,2012,11


In [8]:
df_meta.head()

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,species
0,AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.945,0.001,gamb_colu,coluzzii,coluzzii
1,AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.933,0.001,gamb_colu,coluzzii,coluzzii
2,AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.937,0.002,gamb_colu,coluzzii,coluzzii
3,AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.938,0.002,gamb_colu,coluzzii,coluzzii
4,AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.926,0.001,gamb_colu,coluzzii,coluzzii


In [9]:
df_meta.set_index('sample_id', inplace=True)

In [10]:
df_meta.loc['AN0131-C']

partner_sample_id                  CM0901776
contributor                    Nora Besansky
country                             Cameroon
location                               Mayos
year                                    2009
month                                      9
latitude                               4.341
longitude                             13.558
sex_call                                   M
sample_set                      AG1000G-CM-A
aim_fraction_colu                      0.021
aim_fraction_arab                      0.003
species_gambcolu_arabiensis        gamb_colu
species_gambiae_coluzzii             gambiae
species                              gambiae
Name: AN0131-C, dtype: object

In [11]:
df_meta.loc['AB0252-C']

partner_sample_id                     BF11-29
contributor                       Austin Burt
country                          Burkina Faso
location                       Souroukoudinga
year                                     2012
month                                       7
latitude                               11.235
longitude                              -4.535
sex_call                                    F
sample_set                       AG1000G-BF-A
aim_fraction_colu                       0.012
aim_fraction_arab                       0.002
species_gambcolu_arabiensis         gamb_colu
species_gambiae_coluzzii              gambiae
species                               gambiae
Name: AB0252-C, dtype: object

In [50]:
samples = [
    'AN0131-C', # used previously
    'AB0252-C', # used previously
    'AN0326-C', # only 1 lane
    'AN0280-Cx', # 4 lanes
    'AA0052-C', # GH coluzzii
    'AR0078-C', # AO coluzzii
    'AJ0037-C', # GW intermediate
    'BL0358-C', # TZ intermediate
    'AC0010-C', # UG arabiensis 
    'AZ0156-C', # MW arabiensis
]

In [51]:
!head ~/github/malariagen/vector-ops/tracking/AG1000G-AO/agam.fofn.tsv

path	sample	library	study	ena_run
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#37.bam	AR0047-C	7206399	1087-AN-HAPMAP-DONNELLY	ERR317254
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#38.bam	AR0091-C	7206411	1087-AN-HAPMAP-DONNELLY	ERR317255
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#39.bam	AR0049-C	7206423	1087-AN-HAPMAP-DONNELLY	ERR317256
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#40.bam	AR0051-C	7206435	1087-AN-HAPMAP-DONNELLY	ERR317257
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#41.bam	AR0061-C	7206352	1087-AN-HAPMAP-DONNELLY	ERR317258
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#42.bam	AR0078-C	7206364	1087-AN-HAPMAP-DONNELLY	ERR317259
/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/9790_4#43.bam	AR0080-C	7206376	1087-AN-HAPMAP-DONNELLY	ERR317260
/lustre/scratch118/malaria/team112/pipelines/setu

In [52]:
import glob

In [53]:
df_fofn = pd.concat(
    [pd.read_csv(f, sep='\t') for f in glob.glob('github/malariagen/vector-ops/tracking/*/agam.fofn.tsv')], 
    ignore_index=True
)
df_fofn.head()

Unnamed: 0,path,sample,library,study,ena_run
0,/lustre/scratch118/malaria/team112/pipelines/s...,AV0069-CW,18762288,1129-AG-MULTI-VERNICK,ERR2643782
1,/lustre/scratch118/malaria/team112/pipelines/s...,AV0126-CW,18762300,1129-AG-MULTI-VERNICK,ERR2643783
2,/lustre/scratch118/malaria/team112/pipelines/s...,AV0209-CW,18762312,1129-AG-MULTI-VERNICK,ERR2643784
3,/lustre/scratch118/malaria/team112/pipelines/s...,AV0212-CW,18762324,1129-AG-MULTI-VERNICK,ERR2643785
4,/lustre/scratch118/malaria/team112/pipelines/s...,AV0235-CW,18762336,1129-AG-MULTI-VERNICK,ERR2643786


In [54]:
df_fofn['sample'].isin(samples)

0        False
1        False
2        False
3        False
4        False
         ...  
17676    False
17677    False
17678    False
17679    False
17680    False
Name: sample, Length: 17681, dtype: bool

In [55]:
df_fofn['path'][0]

'/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/22172_6#1.cram'

In [56]:
df_fofn['path'].str.replace(
    '/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/(\\d+)_(\\d+)#(\\d+)\.(\w+)', 
    '/seq/\\1/\\1_\\2#\\3.\\4', 
    regex=True)

0         /seq/22172/22172_6#1.cram
1         /seq/22172/22172_6#2.cram
2         /seq/22172/22172_6#3.cram
3         /seq/22172/22172_6#4.cram
4         /seq/22172/22172_6#5.cram
                    ...            
17676    /seq/22125/22125_7#23.cram
17677    /seq/22125/22125_8#23.cram
17678    /seq/22125/22125_6#28.cram
17679    /seq/22125/22125_7#28.cram
17680    /seq/22125/22125_8#28.cram
Name: path, Length: 17681, dtype: object

In [57]:
df_fofn.groupby('sample').size().sort_values()

sample
AN0326-C     1
AN0368-C     1
AN0367-C     1
AN0366-C     1
AN0364-C     1
            ..
AN0230-Cx    4
AN0291-Cx    4
AN0259-Cx    4
AN0170-Cx    4
AN0280-Cx    4
Length: 5909, dtype: int64

In [58]:
df_out = (
    df_fofn
    .loc[df_fofn['sample'].isin(samples)]
    .copy()
)
df_out['irods_path'] = df_out['path'].str.replace(
    '/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/input/(\\d+)_(\\d+)#(\\d+)\.(\w+)', 
    '/seq/\\1/\\1_\\2#\\3.\\4', 
    regex=True)
df_out = (
    df_out[['sample', 'ena_run', 'irods_path']]
    .rename({'sample': 'sample_id', 'ena_run': 'run_ena'}, axis=1)
)
df_out

Unnamed: 0,sample_id,run_ena,irods_path
1980,BL0358-C,ERR1638499,/seq/20307/20307_2#27.cram
1981,BL0358-C,ERR1638535,/seq/20307/20307_3#27.cram
1982,BL0358-C,ERR1638571,/seq/20307/20307_4#27.cram
2664,AA0052-C,ERR376538,/seq/10798/10798_1#6.cram
2715,AA0052-C,ERR387788,/seq/10843/10843_1#6.cram
2720,AA0052-C,ERR387800,/seq/10843/10843_2#6.cram
3443,AN0326-C,ERR1373082,/seq/18874/18874_6#12.cram
4100,AN0131-C,ERR317337,/seq/9812/9812_4#48.bam
4101,AN0131-C,ERR340933,/seq/10209/10209_3#48.bam
4102,AN0131-C,ERR340945,/seq/10209/10209_4#48.bam


In [59]:
import sys

In [63]:
df_out.to_csv(sys.stdout, index=False, sep='\t')

sample_id	run_ena	irods_path
BL0358-C	ERR1638499	/seq/20307/20307_2#27.cram
BL0358-C	ERR1638535	/seq/20307/20307_3#27.cram
BL0358-C	ERR1638571	/seq/20307/20307_4#27.cram
AA0052-C	ERR376538	/seq/10798/10798_1#6.cram
AA0052-C	ERR387788	/seq/10843/10843_1#6.cram
AA0052-C	ERR387800	/seq/10843/10843_2#6.cram
AN0326-C	ERR1373082	/seq/18874/18874_6#12.cram
AN0131-C	ERR317337	/seq/9812/9812_4#48.bam
AN0131-C	ERR340933	/seq/10209/10209_3#48.bam
AN0131-C	ERR340945	/seq/10209/10209_4#48.bam
AN0280-Cx	ERR491177	/seq/12169/12169_6#23.bam
AN0280-Cx	ERR502102	/seq/12274/12274_2#23.bam
AN0280-Cx	ERR506153	/seq/12298/12298_3#23.bam
AN0280-Cx	ERR506165	/seq/12298/12298_4#23.bam
AB0252-C	ERR338386	/seq/10061/10061_1#68.bam
AB0252-C	ERR338398	/seq/10061/10061_2#68.bam
AB0252-C	ERR327108	/seq/9953/9953_2#68.bam
AZ0156-C	ERR1584022	/seq/19979/19979_1#20.cram
AZ0156-C	ERR1584058	/seq/19979/19979_2#20.cram
AZ0156-C	ERR1584094	/seq/19979/19979_3#20.cram
AR0078-C	ERR317259	/seq/9790/9790_4#42.bam
AR0078-C	ERR31