In [2]:
# imports

import numpy as np
import pandas as pd


# Data Sources

This notebook documents the compilation of metadata products used as input to the analyses included in the project.
- All external data were obtained from publicly available sources. URLs and download instructions are included to aid in reproducing the provenance of these data. 
- All data generated by the Armbrust lab were posted to publicly available resources. URLs and download instructions are included for these data as well.
- The included code combines and cleans data sets, documents the reconciliation of any discrepancies, and produces cleaned metadata files that are used as inputs for project analyses.

## Reference Genomes & CyCOGs

All reference genomes in this project were derived from the Scientific Data publication [Single cell genomes of Prochlorococcus, Synechococcus, and sympatric microbes from diverse marine environments](https://www.nature.com/articles/sdata2018154) (Berube et al., 2018). The genome sequences and annotations used in this publication are hosted by JGI on the [IMG/ProPortal](https://img.jgi.doe.gov/cgi-bin/proportal/main.cgi) and can be downloaded using the [IMG Genome Cart tool](https://img.jgi.doe.gov/cgi-bin/mer/main.cgi?section=GenomeCart&page=genomeCart).

### Figshare Files

Download the following files from the [figshare site associated with Berube et al. (2018)](https://doi.org/10.6084/m9.figshare.c.4037048.v1):
- File 4: [cycogs-genomes.tsv](https://figshare.com/articles/dataset/File_4_CyCOG_taxa/6007166)
- File 5: [cycogs.tsv](https://figshare.com/articles/dataset/File_5_CyCOG_definitions/6007169)
- File 14: [genome_assembly_summary_20180718.tsv](https://figshare.com/articles/dataset/File_14_Genome_assembly_summary/6281519)



# CACHE

I don't think I need this stuff, but stashing here in case

### Publication Files
- [Table 1](https://www.nature.com/articles/sdata2018154/tables/2)
- [Table 2](https://www.nature.com/articles/sdata2018154/tables/3)


### JGI IMG/ProPortal Files

Download the metadata associated with all ProPortal reference genomes from the [IMG/ProPortal Genome Browser](https://img.jgi.doe.gov/cgi-bin/proportal/main.cgi?section=ProPortal&page=genomeList&class=datamart)
- Use the `Table Configuration` section (bottom of the page) to display the following fields: 
    - Domain
    - Sequencing Status
    - Study Name
    - Genome Name / Sample Name
    - Sequencing Center
    - IMG Genome ID (IMG Taxon ID)
    - Genus
    - Ecosystem Subtype
    - Culture Type
    - Depth In Meters
    - Latitude
    - Longitude
    - Proportal Clade
    - Proportal Ecotype
    - Genome Size (Number of total bases)
    - Gene Count (Number of total genes)
    - CheckM2 Completeness
- Click `Select All` and `Export` to download the metadata
    - Name the file `proportal-metadata.tsv` to make it recognizable by the following scripts
    

In [47]:
# Scratch cell: collect the distinct clade labels in sorted order and echo them
# as an identity mapping (label -> label).
# NOTE(review): in the visible notebook `genome_df` is first assigned in the
# "Clean Genome Metadata" cell further down, so this cell relies on kernel state
# from an earlier session -- confirm whether it should be kept.
lister = genome_df['clade'].sort_values().unique()

{label: label for label in lister}

{'5.1A-1': '5.1A-1',
 '5.1A-I': '5.1A-I',
 '5.1A-II': '5.1A-II',
 '5.1A-III': '5.1A-III',
 '5.1A-III/XV': '5.1A-III/XV',
 '5.1A-IV': '5.1A-IV',
 '5.1B-VII': '5.1B-VII',
 'CDR2': 'CDR2',
 'HLI': 'HLI',
 'HLII': 'HLII',
 'HLIII': 'HLIII',
 'HLVI': 'HLVI',
 'LLI': 'LLI',
 'LLII/III': 'LLII/III',
 'LLIV': 'LLIV',
 'unclassified': 'unclassified',
 nan: nan}

In [14]:
# Side-by-side view of the identifier, completeness, and classification fields,
# ordered by `usage_notes` descending.
review_columns = [
    'IID', 'GROUP', 'IMG_ID', 'Completeness', 'checkm_completeness', 'usage_notes',
    'ecotype', 'clade',
    'selection_criteria', 'checkm_contamination', 'annotation',
]
genome_df[review_columns].sort_values(by='usage_notes', ascending=False)

Unnamed: 0,IID,GROUP,IMG_ID,Completeness,checkm_completeness,usage_notes,ecotype,clade,selection_criteria,checkm_contamination,annotation
87,AG-341-K05,Prochlorococcus,2716884261,8.62,8.62,likely virocell,Low light adapted (LL),LLI,wgs amplification,0.00,JGI Microbial Genome Annotation Pipeline (MGAP...
209,AG-363-L17,Prochlorococcus,2667527373,57.20,57.20,likely virocell,High light adapted (HL),HLVI,wgs amplification,0.54,JGI Microbial Genome Annotation Pipeline (MGAP...
332,AG-418-C09,Prochlorococcus,2716884766,91.30,91.30,likely virocell,High light adapted (HL),unclassified,ITS phylogeny,0.54,JGI Microbial Genome Annotation Pipeline (MGAP...
333,AG-418-C17,Prochlorococcus,2716884767,52.47,52.47,likely virocell,High light adapted (HL),HLII,ITS phylogeny,0.82,JGI Microbial Genome Annotation Pipeline (MGAP...
0,AG-311-D23,Prochlorococcus,2716884681,72.96,72.96,,Low light adapted (LL),LLI,wgs amplification,0.00,JGI Microbial Genome Annotation Pipeline (MGAP...
...,...,...,...,...,...,...,...,...,...,...,...
765,S-ShM2,Virus,651703106,0.00,,,,,,,
766,Syn19,Virus,651703107,0.00,,,,,,,
767,Syn5,Virus,641201056,0.00,,,,,,,
768,metaG-MbCM1,Virus,2595698410,0.00,,,,,,,


In [5]:
# List the column labels of `genome_df` (inspection only; nothing is modified).
genome_df.columns

Index(['IID', 'GROUP', 'IMG_ID', 'TYPE', 'JGI_GENOMEPORTAL_NAME',
       'Completeness', 'sag_id', 'phylogeny', 'ecotype', 'clade',
       'img_genome_id', 'usage_notes', 'selection_criteria',
       'completeness_software', 'completeness_score', 'checkm_completeness',
       'checkm_contamination', 'checkm_strain_heterogeneity',
       'checkm_genome_size--bp', 'checkm_scaffold_number', 'checkm_percent_gc',
       'checkm_predicted_genes', 'checkm_tree_placement_taxonomy',
       'checkm_marker_set_used_for_stats', '5S_rRNA_count', '16S_rRNA_count',
       '23S_rRNA_count', 'total_standard_tRNA_count',
       'non-redundant_standard_tRNA_count', 'tRNA_extraction_software',
       'analysis_project_type', 'taxa_id', 'assembly_software',
       'assembly_method_version', 'annotation', 'assembly_quality',
       'cell_isolation_approach', 'single_cell_lysis_approach',
       'wga_amp_approach', 'wga_amp_protocol', 'ncbi_biosample_accession',
       'sequencing_coverage', 'seq_meth', 'env

# Clean Genome Metadata

In [50]:
# clean genome metadata

# Join the CyCOG genome table to the genome assembly summary on the IMG genome ID.
# how='left' keeps every row of the CyCOG genome table, including rows with no
# matching assembly-summary record (their summary fields are left missing).
genome_df = pd.merge(
    left=pd.read_csv('metadata/cycogsgenomes.tsv', sep='\t'),
    right=pd.read_csv('metadata/genome_assembly_summary_20180718.tsv', sep='\t'),
    left_on='IMG_ID',
    right_on='img_genome_id',
    how='left'
)

# standardize field names
genome_df = genome_df.rename(
    columns={
        'IMG_ID': 'id',
        'IID': 'name',
        'GROUP': 'group',
        'TYPE': 'type',
        'Completeness': 'completeness',
    }
)

# add virocell field (True only where usage_notes is exactly 'likely virocell';
# missing usage_notes compare as False)
genome_df['virocell'] = genome_df['usage_notes'].eq('likely virocell')

# fix clades
# Only the labels that need correcting are listed. `replace` leaves every other
# label untouched, whereas `map` with a dict would silently turn any label that
# is absent from the dict into NaN.
genome_df['clade'] = genome_df['clade'].replace({
    '5.1A-1': '5.1A-I',        # digit "1" -> roman numeral "I"
    'CDR2': 'CRD2',            # relabel CDR2 as CRD2
    'unclassified': np.nan,    # treat "unclassified" as missing
})

# down select only needed fields & sort
# (group and clade ascending; type and completeness descending)
output_columns = ['id', 'name', 'type', 'group', 'ecotype', 'clade', 'virocell', 'completeness']
genome_df = (
    genome_df[output_columns]
    .sort_values(
        ['group', 'clade', 'type', 'completeness'],
        ascending=[True, True, False, False]
    )
    .reset_index(drop=True)
)

# save cleaned up genome dataframe
genome_df.to_csv('metadata/genome-metadata.csv', index=False)

genome_df


Unnamed: 0,id,name,type,group,ecotype,clade,virocell,completeness
0,2716884503,AG-679-M23,SAG,Prochlorococcus,High light adapted (HL),HLI,False,99.73
1,2716884367,AG-388-E21,SAG,Prochlorococcus,High light adapted (HL),HLI,False,98.91
2,2716884409,AG-418-M08,SAG,Prochlorococcus,High light adapted (HL),HLI,False,98.51
3,2716884731,AG-388-J02,SAG,Prochlorococcus,High light adapted (HL),HLI,False,98.37
4,2716884783,AG-442-D10,SAG,Prochlorococcus,High light adapted (HL),HLI,False,96.47
...,...,...,...,...,...,...,...,...
765,651703106,S-ShM2,ISOLATE,Virus,,,False,0.00
766,651703107,Syn19,ISOLATE,Virus,,,False,0.00
767,641201056,Syn5,ISOLATE,Virus,,,False,0.00
768,2595698410,metaG-MbCM1,ISOLATE,Virus,,,False,0.00


In [5]:
# save cleaned up genome dataframe

# Drop the 'Prefix' column when it is present. errors='ignore' keeps the cell
# re-runnable: without it, a second execution (or a frame that never had the
# column) raises KeyError.
# NOTE(review): the `genome_df` built by the visible "Clean Genome Metadata" cell
# has no 'Prefix' column, and this cell's recorded output shows a different
# schema -- this looks like a leftover from an earlier version of the notebook;
# confirm whether it should still write to these paths.
genome_df = genome_df.drop(columns='Prefix', errors='ignore')
genome_df.to_csv('../data/berube_pro_syn_set/genome_metadata.csv', index=False)
genome_df.to_csv('../../../data/mappings/2021-02-18/collated/berube_pro_syn_set/genome_metadata.csv', index=False)
genome_df


Unnamed: 0,BerubeProportalID,UpdatedProportalID,GenomeName,Genus,Ecotype,Clade,ReferenceType,IsolationLocation,Ecosystem,Latitude,Longitude,Depth(m),GenomeSize(bp),GeneCount,Completeness
0,2716884681,2716884681,AG-311-D23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,1466304,1796,72.96
1,2716884682,2716884682,AG-311-I02,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,195290,271,11.16
2,2716884683,2716884683,AG-311-I09,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,697970,812,47.41
3,2716884684,2716884684,AG-311-J05,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,623148,755,34.64
4,2716884685,2716884685,AG-311-J23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,1427538,1677,77.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,2623620330,640427149,WH7803,Synechococcus,,5.1B-V,ISOLATE,"Sargasso Sea, North Atlantic Ocean",Epipelagic,33.742300,-67.491300,25.0,2366980,2591,98.91
677,2623620868,638341215,WH7805,Synechococcus,,5.1B-VI,ISOLATE,North Atlantic Ocean,Pelagic,33.750000,-67.500000,,2620367,2937,99.73
678,2606217514,637000314,WH8102,Synechococcus,,5.1A-III,ISOLATE,Tropical Atlantic Ocean,Pelagic,22.495000,-65.600000,,2434428,2588,99.46
679,2606217259,637000214,MED4,Prochlorococcus,High light adapted (HL),HLI,ISOLATE,Mediterranean Sea,Marginal Sea,43.200000,6.866667,5.0,1657990,2069,99.46


# Clean Ortholog Metadata

In [6]:
# clean and join

# read in cycog data
filepath_cycogs = '../../../data/genomes/berube_et_al_2018/cycogs.tsv'
cycog_df = pd.read_csv(filepath_cycogs, sep='\t')
# numeric ortholog ID = `cycog_iid` with its first 6 characters removed
# (presumably a 'CyCOG_' prefix -- TODO confirm against cycogs.tsv)
cycog_df['OrthologID'] = cycog_df['cycog_iid'].str[6:].astype(int)

# make dictionary of cycog genes (gene -> ortholog ID)
# `cycog_genes` is a comma-separated list; a gene that appears under more than
# one CyCOG keeps the ID of the last row that lists it.
cog_dict = {
    gene: cog
    for cog, genes in zip(cycog_df['OrthologID'].tolist(), cycog_df['cycog_genes'])
    for gene in genes.split(',')
}

# make dataframe from cog dict, one row per gene
cogs_df = (
    pd.DataFrame.from_dict(cog_dict, dtype=int, orient='index', columns=['OrthologID'])
    .rename_axis('MappingName')
    .reset_index()
)

# clean up data columns
# Split "<GenomeName>_<GeneID>" on the LAST underscore only, because genome
# names can themselves contain underscores. `n` must be passed by keyword:
# pandas 2.x no longer accepts it positionally.
cogs_df[['GenomeName', 'GeneID']] = cogs_df['MappingName'].str.rsplit('_', n=1, expand=True)

# join in annotation information
cogs_df['Annotation'] = cogs_df['OrthologID'].map(
    cycog_df.set_index('OrthologID')['cycog_cns_product']
)
cogs_df


Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit
...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein


In [7]:
# save cleaned ortholog metadata

# Write the same table (without the index) to both output locations.
for ortholog_csv in (
    '../data/berube_pro_syn_set/ortholog_metadata.csv',
    '../../../data/mappings/2021-02-18/collated/berube_pro_syn_set/ortholog_metadata.csv',
):
    cogs_df.to_csv(ortholog_csv, index=False)
