In [1]:
# imports

import numpy as np
import pandas as pd


# Data Sources

This notebook documents the compilation of metadata products used as input to the analyses included in the project.
- All external data were obtained from publicly available sources. URLs and download instructions are included to aid in reproducing the provenance of these data. 
- All data generated by the Armbrust lab were posted to publicly available resources. URLs and download instructions are included for these data as well.
- The included code combines and cleans data sets, documents the reconciliation of any discrepancies, and produces cleaned metadata files that are used as inputs for project analyses.

## Reference Genomes & CyCOGs

All reference genomes in this project were derived from the Scientific Data publication [Single cell genomes of Prochlorococcus, Synechococcus, and sympatric microbes from diverse marine environments](https://www.nature.com/articles/sdata2018154) (Berube et al., 2018). The genome sequences and annotations used in this publication are hosted by JGI on the [IMG/ProPortal](https://img.jgi.doe.gov/cgi-bin/proportal/main.cgi) and can be downloaded using the [IMG Genome Cart tool](https://img.jgi.doe.gov/cgi-bin/mer/main.cgi?section=GenomeCart&page=genomeCart).

### Publication Files
- [Table 1](https://www.nature.com/articles/sdata2018154/tables/2)
- [Table 2](https://www.nature.com/articles/sdata2018154/tables/3)

### Figshare Files

Download the following files from the [figshare site associated with Berube et al. (2018)](https://doi.org/10.6084/m9.figshare.c.4037048.v1):
- File 4: [cycogs-genomes.tsv](https://figshare.com/articles/dataset/File_4_CyCOG_taxa/6007166)
- File 5: [cycogs.tsv](https://figshare.com/articles/dataset/File_5_CyCOG_definitions/6007169)
- File 6: [cyanobacteria_phylogeny_taxa.tsv](https://figshare.com/articles/dataset/File_6_Cyanobacteria_taxa/6007175)
- File 14: [genome_assembly_summary_20180718.tsv](https://figshare.com/articles/dataset/File_14_Genome_assembly_summary/6281519)

### JGI IMG/ProPortal Files

Download the metadata associated with all ProPortal reference genomes from the [IMG/ProPortal Genome Browser](https://img.jgi.doe.gov/cgi-bin/proportal/main.cgi?section=ProPortal&page=genomeList&class=datamart)
- Use the `Table Configuration` section (bottom of the page) to display the following fields: 
    - Domain
    - Sequencing Status
    - Study Name
    - Genome Name / Sample Name
    - Sequencing Center
    - IMG Genome ID (IMG Taxon ID)
    - Genus
    - Ecosystem Subtype
    - Culture Type
    - Depth In Meters
    - Latitude
    - Longitude
    - Proportal Clade
    - Proportal Ecotype
    - Genome Size (Number of total bases)
    - Gene Count (Number of total genes)
    - CheckM2 Completeness
- Click `Select All` and `Export` to download the metadata
    - Name the file `proportal-metadata.tsv` to make it recognizable by the following scripts
    
    
    




In [7]:
# Build the genome list used for bulk download of genome & annotation data
# from JGI IMG/ProPortal: one row per genome, columns `taxon_oid` and `name`.

# NOTE(review): the download instructions in this notebook call this file
# `cycogs-genomes.tsv`; confirm the local copy really is named as below.
cycog_genomes_df = pd.read_csv('cycogsgenomes.tsv', sep='\t')

# source column -> column name written to the output file
column_renames = {'IMG_ID': 'taxon_oid', 'JGI_GENOMEPORTAL_NAME': 'name'}
proportal_genomes_df = cycog_genomes_df[list(column_renames)].rename(columns=column_renames)

proportal_genomes_df.to_csv('proportal-genomes.tsv', sep='\t', index=False)

proportal_genomes_df


Unnamed: 0,taxon_oid,name
0,2716884681,Uncultured_Prochlorococcus_sp._AG-311-D23
1,2716884682,Uncultured_Prochlorococcus_sp._AG-311-I02
2,2716884683,Uncultured_Prochlorococcus_sp._AG-311-I09
3,2716884684,Uncultured_Prochlorococcus_sp._AG-311-J05
4,2716884685,Uncultured_Prochlorococcus_sp._AG-311-J23
...,...,...
765,651703106,Synechococcus_phage_S-ShM2
766,651703107,Synechococcus_phage_Syn19
767,641201056,Synechococcus_phage_Syn5
768,2595698410,Synechococcus_phage_metaG-MbCM1


# Clean Genome Metadata

In [2]:
# clean genome metadata
#
# Joins three tab-separated inputs into a single table of reference-genome
# metadata, renames the columns to the project's naming scheme, and tags each
# 'AG-' genome with its six-character name prefix.

filepath_genome_1 = '../data/berube_pro_syn_set/berube_pro_syn_set_compiled_crossref.tab'
filepath_genome_2 = '../data/berube_pro_syn_set/cycogsgenomes.tsv'
filepath_genome_3 = '../data/berube_pro_syn_set/taxontable2216_25-feb-2021.tab'

# read every input exactly once
crossref_df = pd.read_csv(filepath_genome_1, sep='\t')
cycog_taxa_df = pd.read_csv(filepath_genome_2, sep='\t')
clade_df = pd.read_csv(filepath_genome_3, sep='\t',
                       usecols=['taxon_oid', 'Proportal Clade', 'Proportal Ecotype'])

# cross-reference table <-> CyCOG genome table, keyed on the Berube et al. taxon ID
genome_df = pd.merge(left=crossref_df, right=cycog_taxa_df,
                     left_on='berube_taxon_oid', right_on='IMG_ID', how='inner')
# add clade / ecotype, keyed on the updated taxon ID (pandas default: inner join)
genome_df = pd.merge(left=genome_df, right=clade_df,
                     left_on='updated_taxon_oid', right_on='taxon_oid')

genome_df = genome_df.rename(columns={
    'berube_taxon_oid': 'BerubeProportalID', 'updated_taxon_oid': 'UpdatedProportalID',
    'IID': 'GenomeName', 'GROUP': 'Genus', 'TYPE': 'ReferenceType',
    'Genome Size   * assembled': 'GenomeSize(bp)', 'Gene Count   * assembled': 'GeneCount',
    'Geographic Location': 'IsolationLocation', 'GOLD Ecosystem Subtype': 'Ecosystem',
    'Depth In Meters': 'Depth(m)', 'Proportal Clade': 'Clade', 'Proportal Ecotype': 'Ecotype',
})

# keep only the curated columns; .copy() so the column added below is written
# to an independent frame rather than to a slice of the merged table
genome_df = genome_df[['BerubeProportalID', 'UpdatedProportalID', 'GenomeName', 'Genus', 'Ecotype', 'Clade',
                       'ReferenceType', 'IsolationLocation', 'Ecosystem', 'Latitude', 'Longitude', 'Depth(m)',
                       'GenomeSize(bp)', 'GeneCount', 'Completeness']].copy()

# find Berube et al. 2018 prefixes to clean up depth, lat, & lon values based on published data
# Prefix = first six characters of the genome name for names starting with 'AG-', NaN otherwise
is_ag_genome = genome_df['GenomeName'].str.startswith('AG-', na=False)
genome_df['Prefix'] = genome_df['GenomeName'].str[:6].where(is_ag_genome)
genome_df


Unnamed: 0,BerubeProportalID,UpdatedProportalID,GenomeName,Genus,Ecotype,Clade,ReferenceType,IsolationLocation,Ecosystem,Latitude,Longitude,Depth(m),GenomeSize(bp),GeneCount,Completeness,Prefix
0,2716884681,2716884681,AG-311-D23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,,1466304,1796,72.96,AG-311
1,2716884682,2716884682,AG-311-I02,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,,195290,271,11.16,AG-311
2,2716884683,2716884683,AG-311-I09,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,,697970,812,47.41,AG-311
3,2716884684,2716884684,AG-311-J05,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,,623148,755,34.64,AG-311
4,2716884685,2716884685,AG-311-J23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,,1427538,1677,77.90,AG-311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,2623620330,640427149,WH7803,Synechococcus,,5.1B-V,ISOLATE,"Sargasso Sea, North Atlantic Ocean",Epipelagic,33.742300,-67.491300,25.0,2366980,2591,98.91,
677,2623620868,638341215,WH7805,Synechococcus,,5.1B-VI,ISOLATE,North Atlantic Ocean,Pelagic,33.750000,-67.500000,,2620367,2937,99.73,
678,2606217514,637000314,WH8102,Synechococcus,,5.1A-III,ISOLATE,Tropical Atlantic Ocean,Pelagic,22.495000,-65.600000,,2434428,2588,99.46,
679,2606217259,637000214,MED4,Prochlorococcus,High light adapted (HL),HLI,ISOLATE,Mediterranean Sea,Marginal Sea,43.200000,6.866667,5.0,1657990,2069,99.46,


In [3]:
# check to make sure that existing lat/lon values from IMG match the published ones from Berube et al. 2018
# NOTE: it looks like for 6 biosamples the latitude is off by a couple of degrees. 
# However, it's small enough to make me think that it's just a typo
# I'll go with the published Berube et al. values

# down-select Berube SAGs from reference set: one row per distinct
# (Prefix, Latitude, Longitude, Depth(m)) combination among the 'AG-' genomes
is_ag_genome = genome_df['GenomeName'].str.startswith('AG-', na=False)
sag_df = (
    genome_df.loc[is_ag_genome, ['Prefix', 'Latitude', 'Longitude', 'Depth(m)']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# read in collection metadata from Berube et al. 2018 paper
# (coordinates are read as text so they can be repaired before numeric conversion)
collection_df = pd.read_excel('../data/berube_pro_syn_set/berube_collection_data.xls', sheet_name='Compiled', 
                              dtype={'Latitude':str, 'Longitude':str})
# The spreadsheet writes negative coordinates with a non-ASCII minus-like
# character (appears to be U+2212 MINUS SIGN), which float() cannot parse.
# Swap it for an ASCII hyphen-minus; regex=False makes the literal replacement
# explicit instead of relying on the pandas-version-dependent default.
for coord_col in ('Latitude', 'Longitude'):
    collection_df[coord_col] = collection_df[coord_col].str.replace('−', '-', regex=False).astype(float)
# join the two and check equality (_x = published collection data, _y = IMG values)
check_df = pd.merge(collection_df, sag_df, on=['Prefix'])
check_df['CheckLat'] = check_df['Latitude_x'].eq(check_df['Latitude_y'])
check_df['CheckLon'] = check_df['Longitude_x'].eq(check_df['Longitude_y'])
check_df


Unnamed: 0,Prefix,Biosample,Depth,Latitude_x,Longitude_x,Latitude_y,Longitude_y,Depth(m),CheckLat,CheckLon
0,AG-311,SWC-01,20.0,-20.08,-70.8,-20.08,-70.8,,True,True
1,AG-315,SWC-02,55.0,-20.08,-70.8,-20.08,-70.8,,True,True
2,AG-321,SWC-03,14.0,-23.46,-88.77,-23.46,-88.77,,True,True
3,AG-331,SWC-04,112.0,-23.46,-88.77,-23.46,-88.77,,True,True
4,AG-335,SWC-05,14.0,-26.25,-103.96,-26.25,-103.96,,True,True
5,AG-341,SWC-06,180.0,-26.25,-103.96,-26.25,-103.96,,True,True
6,AG-347,SWC-07,5.0,23.75,-158.0,22.75,-158.0,,False,True
7,AG-402,SWC-08,100.0,23.75,-158.0,22.75,-158.0,,False,True
8,AG-355,SWC-09,10.0,31.07,-64.17,31.67,-64.17,,False,True
9,AG-363,SWC-10,100.0,31.07,-64.17,31.67,-64.17,,False,True


In [4]:
# Replace Berube et al. SAG isolation lat, lon, depth values from IMG with published values

# rows that carry a Prefix are the ones whose values get overwritten
has_prefix = genome_df['Prefix'].notna()
# published collection data, looked up by Prefix
published_by_prefix = collection_df.set_index('Prefix')

# (column in genome_df, column in collection_df)
for target_col, published_col in (('Depth(m)', 'Depth'),
                                  ('Latitude', 'Latitude'),
                                  ('Longitude', 'Longitude')):
    genome_df.loc[has_prefix, target_col] = \
        genome_df.loc[has_prefix, 'Prefix'].map(published_by_prefix[published_col])

genome_df.loc[has_prefix, ['Prefix', 'Depth(m)', 'Latitude', 'Longitude']]


Unnamed: 0,Prefix,Depth(m),Latitude,Longitude
0,AG-311,20.0,-20.08,-70.8
1,AG-311,20.0,-20.08,-70.8
2,AG-311,20.0,-20.08,-70.8
3,AG-311,20.0,-20.08,-70.8
4,AG-311,20.0,-20.08,-70.8
...,...,...,...,...
533,AG-686,65.0,36.57,-158.0
534,AG-686,65.0,36.57,-158.0
535,AG-686,65.0,36.57,-158.0
536,AG-686,65.0,36.57,-158.0


In [5]:
# save cleaned up genome dataframe

# 'Prefix' is dropped before the table is written out.
# errors='ignore' keeps this cell re-runnable: genome_df is reassigned here, so
# on a second execution the column is already gone and a plain drop would
# raise KeyError.
genome_df = genome_df.drop(columns='Prefix', errors='ignore')

# write the same table to both destinations
for genome_metadata_path in (
    '../data/berube_pro_syn_set/genome_metadata.csv',
    '../../../data/mappings/2021-02-18/collated/berube_pro_syn_set/genome_metadata.csv',
):
    genome_df.to_csv(genome_metadata_path, index=False)
genome_df


Unnamed: 0,BerubeProportalID,UpdatedProportalID,GenomeName,Genus,Ecotype,Clade,ReferenceType,IsolationLocation,Ecosystem,Latitude,Longitude,Depth(m),GenomeSize(bp),GeneCount,Completeness
0,2716884681,2716884681,AG-311-D23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,1466304,1796,72.96
1,2716884682,2716884682,AG-311-I02,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,195290,271,11.16
2,2716884683,2716884683,AG-311-I09,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,697970,812,47.41
3,2716884684,2716884684,AG-311-J05,Prochlorococcus,High light adapted (HL),HLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,623148,755,34.64
4,2716884685,2716884685,AG-311-J23,Prochlorococcus,Low light adapted (LL),LLI,SAG,South Pacific Ocean,Pelagic,-20.080000,-70.800000,20.0,1427538,1677,77.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,2623620330,640427149,WH7803,Synechococcus,,5.1B-V,ISOLATE,"Sargasso Sea, North Atlantic Ocean",Epipelagic,33.742300,-67.491300,25.0,2366980,2591,98.91
677,2623620868,638341215,WH7805,Synechococcus,,5.1B-VI,ISOLATE,North Atlantic Ocean,Pelagic,33.750000,-67.500000,,2620367,2937,99.73
678,2606217514,637000314,WH8102,Synechococcus,,5.1A-III,ISOLATE,Tropical Atlantic Ocean,Pelagic,22.495000,-65.600000,,2434428,2588,99.46
679,2606217259,637000214,MED4,Prochlorococcus,High light adapted (HL),HLI,ISOLATE,Mediterranean Sea,Marginal Sea,43.200000,6.866667,5.0,1657990,2069,99.46


# Clean Ortholog Metadata

In [6]:
# clean and join
#
# Expands the CyCOG definition table (one row per ortholog group, with a
# comma-separated gene list) into one row per gene, and attaches each group's
# consensus product annotation.

# read in cycog data
filepath_cycogs = '../../../data/genomes/berube_et_al_2018/cycogs.tsv'
cycog_df = pd.read_csv(filepath_cycogs, sep='\t')
# numeric ortholog ID = cycog_iid with its first six characters removed
# (presumably a fixed 'CyCOG_'-style prefix -- verify against the file)
cycog_df['OrthologID'] = cycog_df['cycog_iid'].str[6:].astype(int)

# make dictionary of cycog genes: gene name -> ortholog ID
# (a gene listed under several groups keeps the last group seen)
cog_dict = {}
for ortholog_id, gene_list in zip(cycog_df['OrthologID'], cycog_df['cycog_genes']):
    for gene in gene_list.split(','):
        cog_dict[gene] = ortholog_id

# make dataframe from cog dict and clean up data columns
cogs_df = (
    pd.DataFrame.from_dict(cog_dict, dtype=int, orient='index', columns=['cycog_iid'])
    .reset_index()
    .rename(columns={'index': 'MappingName', 'cycog_iid': 'OrthologID'})
)

# MappingName is '<GenomeName>_<GeneID>'; genome names may themselves contain
# underscores, so split only on the LAST one. `n` must be passed by keyword:
# recent pandas versions reject it as a positional argument.
name_parts = cogs_df['MappingName'].str.rsplit('_', n=1, expand=True)
cogs_df['GenomeName'] = name_parts[0]
cogs_df['GeneID'] = name_parts[1]

# join in annotation information
cogs_df['Annotation'] = cogs_df['OrthologID'].map(cycog_df.set_index('OrthologID')['cycog_cns_product'])
cogs_df


Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit
...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein


In [7]:
# save cleaned ortholog metadata

# the identical table is written to both destinations
for ortholog_metadata_path in (
    '../data/berube_pro_syn_set/ortholog_metadata.csv',
    '../../../data/mappings/2021-02-18/collated/berube_pro_syn_set/ortholog_metadata.csv',
):
    cogs_df.to_csv(ortholog_metadata_path, index=False)
