In [1]:
# imports

import numpy as np
import pandas as pd


# Data Sources

This notebook documents the compilation of metadata products used as input to the analyses included in the project.
- All external data were obtained from publicly available sources. URLs and download instructions are included to aid in reproducing the provenance of these data. 
- All data generated by the Armbrust lab were posted to publicly available resources. URLs and download instructions are included for these data as well.
- The included code combines and cleans data sets, documents the reconciliation of any discrepancies, and produces cleaned metadata files that are used as inputs for project analyses.

## Reference Genomes & CyCOGs

All reference genomes in this project were derived from the Scientific Data publication [Single cell genomes of Prochlorococcus, Synechococcus, and sympatric microbes from diverse marine environments](https://www.nature.com/articles/sdata2018154) (Berube et al., 2018). The genome sequences and annotations used in this publication are hosted by JGI on the [IMG/ProPortal](https://img.jgi.doe.gov/cgi-bin/proportal/main.cgi) and can be downloaded using the [IMG Genome Cart tool](https://img.jgi.doe.gov/cgi-bin/mer/main.cgi?section=GenomeCart&page=genomeCart).

### Figshare Files

Download the following files from the [figshare site associated with Berube et al. (2018)](https://doi.org/10.6084/m9.figshare.c.4037048.v1) and save them in the `metadata/` directory, which is where the code cells below read them from:
- File 4: [cycogsgenomes.tsv](https://figshare.com/articles/dataset/File_4_CyCOG_taxa/6007166)
- File 5: [cycogs.tsv](https://figshare.com/articles/dataset/File_5_CyCOG_definitions/6007169)
- File 14: [genome_assembly_summary_20180718.tsv](https://figshare.com/articles/dataset/File_14_Genome_assembly_summary/6281519)



# CACHE

The material in this section is probably not needed for the analyses, but it is kept here in case it becomes useful.

### Publication Files
- [Table 1](https://www.nature.com/articles/sdata2018154/tables/2)
- [Table 2](https://www.nature.com/articles/sdata2018154/tables/3)


### JGI IMG/ProPortal Files

Download the metadata associated with all ProPortal reference genomes from the [IMG/ProPortal Genome Browser](https://img.jgi.doe.gov/cgi-bin/proportal/main.cgi?section=ProPortal&page=genomeList&class=datamart)
- Use the `Table Configuration` section (bottom of the page) to display the following fields: 
    - Domain
    - Sequencing Status
    - Study Name
    - Genome Name / Sample Name
    - Sequencing Center
    - IMG Genome ID (IMG Taxon ID)
    - Genus
    - Ecosystem Subtype
    - Culture Type
    - Depth In Meters
    - Latitude
    - Longitude
    - Proportal Clade
    - Proportal Ecotype
    - Genome Size (Number of total bases)
    - Gene Count (Number of total genes)
    - CheckM2 Completeness
- Click `Select All` and `Export` to download the metadata
    - Name the file `proportal-metadata.tsv` to make it recognizable by the following scripts
    

# Clean Genome Metadata

In [2]:
# clean genome metadata

# combine genome metadata from Berube et al. 2018 publication
genome_df = pd.merge(
    left=pd.read_csv('metadata/cycogsgenomes.tsv', sep='\t'), 
    right=pd.read_csv('metadata/genome_assembly_summary_20180718.tsv', sep='\t'), 
    left_on='IMG_ID',
    right_on='img_genome_id', 
    how='left'
)

# add updated clade assignments
genome_df['clade'] = genome_df['IMG_ID'].map(
    pd.read_csv('metadata/updated-genome-clades.csv', sep=',').set_index('img_genome_id')['clade']
)

# standardize field names
genome_df.rename(
    columns={
        'IMG_ID':'genomeid', 
        'IID':'genomename', 
        'GROUP':'group', 
        'TYPE':'type', 
        'Completeness':'completeness', 
    }, 
    inplace=True
)

# add virocell field
genome_df['virocell'] = (genome_df['usage_notes'] == 'likely virocell')

# down select only needed fields & sort
genome_df = genome_df[['genomeid', 'genomename', 'type', 'group', 'clade', 'virocell', 'completeness']].sort_values(
    ['group', 'clade', 'type', 'completeness'], ascending=[True, True, False, False]
).reset_index(drop=True)

# save cleaned up genome dataframe
genome_df.to_csv('metadata/genome-metadata.csv', index=False)

genome_df


Unnamed: 0,genomeid,genomename,type,group,clade,virocell,completeness
0,2716884698,AG-316-L16,SAG,Prochlorococcus,AMZ-II,False,20.69
1,2716884700,AG-316-N23,SAG,Prochlorococcus,AMZ-II,False,20.69
2,2716884701,AG-316-P23,SAG,Prochlorococcus,AMZ-II,False,12.07
3,2716884699,AG-316-L21,SAG,Prochlorococcus,AMZ-II,False,10.34
4,2716884642,AG-316-A05,SAG,Prochlorococcus,AMZ-II,False,6.90
...,...,...,...,...,...,...,...
765,651703106,S-ShM2,ISOLATE,Virus,,False,0.00
766,651703107,Syn19,ISOLATE,Virus,,False,0.00
767,641201056,Syn5,ISOLATE,Virus,,False,0.00
768,2595698410,metaG-MbCM1,ISOLATE,Virus,,False,0.00


# Clean Ortholog Metadata

In [3]:
# clean and join

# read in cycog data
filepath_cycogs = 'metadata/cycogs.tsv'
cycog_df = pd.read_csv(filepath_cycogs, sep='\t')
cycog_df['cycogid'] =[int(id_val[6:]) for id_val in cycog_df['cycog_iid']]

# make dictionary of cycog genes
cog_dict = {}
for _, row in cycog_df.iterrows():
    cog = int(row['cycog_iid'][6:])
    for gene in row['cycog_genes'].split(','):
        cog_dict[gene] = cog

# make dataframe from cog dict       
cogs_df = pd.DataFrame.from_dict(cog_dict, dtype=int, orient='index', columns=['cycog_iid'])

# clean up data columns
cogs_df.reset_index(inplace=True)
cogs_df.rename(columns={'index':'mappingname', 'cycog_iid':'cycogid'}, inplace=True)
cogs_df['genomename'], cogs_df['geneid'] = list(zip(*cogs_df['mappingname'].str.rsplit(pat='_', n=1)))

# join in annotation information
cogs_df['annotation'] = cogs_df.cycogid.map(cycog_df.set_index('cycogid')['cycog_cns_product'])

# save cleaned ortholog metadata
cogs_df.to_csv('metadata/ortholog-metadata.csv', index=False)

cogs_df


Unnamed: 0,mappingname,cycogid,genomename,geneid,annotation
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit
...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein
