In [1]:
%matplotlib inline

In [240]:
from Bio import SeqIO
import pandas as pd

import time
import random
import os


from Bio import Entrez
Entrez.email = "adam.hockenberry@u.texas.edu"

# Introduction

The purpose of this notebook is to essentially replicate the "marker gene" analysis from the OMA browser. See: https://omabrowser.org/oma/export_markers

The web portal for this particular feature limits the number of species that you can select and at various points seems to go down, so I have downloaded all of the essential information from their downloads page (see: https://omabrowser.org/oma/current/, downloaded on 08/2019) and am replicating the feature here.

As I was coding it up, it became clear that some of my solutions / algorithms are not the most efficient, especially in terms of RAM. For the purposes of my initial analyses none of these issues cause problems. That said, future releases of OMA, should they drastically increase in size, may become problematic for this exact code as it stands.

**First, I wanted to get the full species taxonomy from NCBI for each included species in the dataset**

In [3]:
# Load the OMA species table. The first three lines of the file are skipped
# and no header row is read, so columns get integer labels until renamed.
df = pd.read_csv(
    '../Data/oma-species.txt',
    sep='\t',
    header=None,
    skiprows=3,
)

In [4]:
# Preview the first five rows of the freshly loaded species table.
df.head(n=5)

Unnamed: 0,0,1,2,3,4
0,ACAM1,329726,Acaryochloris marina (strain MBIC 11017),Genome Reviews,"18-MAR-2008 (Rel. 88, Last updated, Version 2)"
1,ACCPU,522306,Accumulibacter phosphatis (strain UW-1),EBI,"15-MAY-2014 (Rel. 120, Last updated, Version 4)"
2,ACEAZ,574087,Acetohalobium arabaticum (strain ATCC 49924 / ...,EBI,"09-JAN-2014 (Rel. 119, Last updated, Version 5)"
3,ACEMN,891968,Acetomicrobium mobile (strain ATCC BAA-54 / DS...,EBI,"23-JUL-2013 (Rel. 117, Last updated, Version 3)"
4,ACEP3,634452,Acetobacter pasteurianus (strain NBRC 3283 / L...,Genome Reviews,"24-NOV-2009 (Rel. 114, Last updated, Version 2)"


In [5]:
# (n_rows, n_columns) of the species table.
df.shape

(2288, 5)

In [6]:
# Replace the positional column labels with readable names, then add an empty
# string column that will later hold each species' ';'-joined lineage.
column_names = ['OMA_code', 'TaxID', 'Scientific_name', 'Source', 'Release']
df.columns = column_names
df['Taxonomy_lineage'] = ''
df.head()

Unnamed: 0,OMA_code,TaxID,Scientific_name,Source,Release,Taxonomy_lineage
0,ACAM1,329726,Acaryochloris marina (strain MBIC 11017),Genome Reviews,"18-MAR-2008 (Rel. 88, Last updated, Version 2)",
1,ACCPU,522306,Accumulibacter phosphatis (strain UW-1),EBI,"15-MAY-2014 (Rel. 120, Last updated, Version 4)",
2,ACEAZ,574087,Acetohalobium arabaticum (strain ATCC 49924 / ...,EBI,"09-JAN-2014 (Rel. 119, Last updated, Version 5)",
3,ACEMN,891968,Acetomicrobium mobile (strain ATCC BAA-54 / DS...,EBI,"23-JUL-2013 (Rel. 117, Last updated, Version 3)",
4,ACEP3,634452,Acetobacter pasteurianus (strain NBRC 3283 / L...,Genome Reviews,"24-NOV-2009 (Rel. 114, Last updated, Version 2)",


**Slowly iterate through and add all lineage information to each record**

Commented here because this should only have to be run one time and will take a day or so to complete given my `time.sleep` call (which is to hopefully evade getting blocked)

In [16]:
# Populate df['Taxonomy_lineage'] with the ';'-joined NCBI lineage TaxIds of
# every species.
#
# This lookup used to be commented out after a single manual run, which left
# the column empty on a fresh kernel even though every later cell filters on
# it. It is now guarded by a cache: if the lineage-annotated table already
# exists on disk it is loaded, and NCBI is only queried when it is missing.
lineage_cache = '../Data/oma-species-full.tsv'

if os.path.exists(lineage_cache):
    # Same columns as df plus the filled-in lineage; keep the lineage as text.
    df = pd.read_csv(lineage_cache, sep='\t', dtype={'Taxonomy_lineage': str})
    df['Taxonomy_lineage'] = df['Taxonomy_lineage'].fillna('')
else:
    for index in df.index:
        # Skip rows already filled so an interrupted run can be resumed by
        # simply re-running this cell.
        if df.at[index, 'Taxonomy_lineage']:
            continue
        if index % 100 == 0:
            print(index)  # sparse progress marker for a very long loop
        taxid = df.at[index, 'TaxID']
        handle = Entrez.efetch(db="Taxonomy", id=str(taxid), retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            # Release the connection even if parsing the response fails.
            handle.close()
        assert len(records) == 1
        df.at[index, 'Taxonomy_lineage'] = ';'.join(
            entry['TaxId'] for entry in records[0]['LineageEx']
        )
        # Random pause between requests, to hopefully avoid getting blocked.
        time.sleep(random.randint(1, 45))
    # Persist the result so the slow lookup never has to be repeated.
    df.to_csv(lineage_cache, sep='\t', index=False)

In [18]:
# First five rows again, to check the lineage column.
df.head(n=5)

Unnamed: 0,OMA_code,TaxID,Scientific_name,Source,Release,Taxonomy_lineage
0,ACAM1,329726,Acaryochloris marina (strain MBIC 11017),Genome Reviews,"18-MAR-2008 (Rel. 88, Last updated, Version 2)",131567;2;1783272;1798711;1117;1890424;1890429;...
1,ACCPU,522306,Accumulibacter phosphatis (strain UW-1),EBI,"15-MAY-2014 (Rel. 120, Last updated, Version 4)",131567;2;1224;28216;119066;327159;327160
2,ACEAZ,574087,Acetohalobium arabaticum (strain ATCC 49924 / ...,EBI,"09-JAN-2014 (Rel. 119, Last updated, Version 5)",131567;2;1783272;1239;186801;53433;53434;28186...
3,ACEMN,891968,Acetomicrobium mobile (strain ATCC BAA-54 / DS...,EBI,"23-JUL-2013 (Rel. 117, Last updated, Version 3)",131567;2;508458;649775;649776;649777;49894;97477
4,ACEP3,634452,Acetobacter pasteurianus (strain NBRC 3283 / L...,Genome Reviews,"24-NOV-2009 (Rel. 114, Last updated, Version 2)",131567;2;1224;28211;204441;433;434;438;481145


In [19]:
# Last five rows, to check the lineage column at the end of the table too.
df.tail(n=5)

Unnamed: 0,OMA_code,TaxID,Scientific_name,Source,Release,Taxonomy_lineage
2283,ZYMMA,555217,Zymomonas mobilis subsp. mobilis (strain ATCC ...,EBI,"22-SEP-2011 (Rel. 110, Last updated, Version 2)",131567;2;1224;28211;204457;41297;541;542;120045
2284,ZYMMN,622759,Zymomonas mobilis subsp. mobilis (strain NCIMB...,EBI,"12-DEC-2013 (Rel. 119, Last updated, Version 3)",131567;2;1224;28211;204457;41297;541;542;120045
2285,ZYMMO,264203,Zymomonas mobilis subsp. mobilis (strain ATCC ...,EBI,"02-FEB-2014 (Rel. 119, Last updated, Version 10)",131567;2;1224;28211;204457;41297;541;542;120045
2286,ZYMMT,579138,Zymomonas mobilis subsp. pomaceae (strain ATCC...,EBI,"18-JUL-2013 (Rel. 117, Last updated, Version 3)",131567;2;1224;28211;204457;41297;541;542;120044
2287,ZYMTR,1047171,Zymoseptoria tritici,EnsemblGenomes,Ensembl Fungi 28; MG2; 29-JUL-2015,131567;2759;33154;4751;451864;4890;716545;1475...


**And now write that information to a new file**

In [202]:
# Writes the lineage-annotated species table as a tab-separated file without
# the index. Left commented out — presumably so that re-running the notebook
# cannot overwrite the saved file; confirm before re-enabling.
# df.to_csv('../Data/oma-species-full.tsv', sep='\t', index=False)

# Use new taxonomy to gather specific species sub-sets

For my purposes:
1. Basidiomycota (5204)
2. Ascomycota (4890)
3. Deuterostomia (33511)
4. Protostomia (33317)
5. Viridiplantae (33090)

And later I might find these to be interesting:
1. Alphaproteobacteria (28211)
2. Betaproteobacteria (28216)
3. Gammaproteobacteria (1236)
4. Deltaproteobacteria (28221)
5. Epsilonproteobacteria (29547)

In [34]:
# Count the species whose lineage contains the given taxon.
taxa_code = 1236
# Pad the lineage with ';' on both sides so that a taxon sitting at the very
# start or end of the lineage string is matched as well; the bare pattern
# ';<id>;' can only ever match entries in the middle.
lineage_padded = ';' + df['Taxonomy_lineage'] + ';'
# regex=False: the pattern is a literal substring, not a regular expression.
df[lineage_padded.str.contains(';{};'.format(taxa_code), regex=False)].shape

(361, 6)

**Finding OMA groups that fit my needs**

For each taxonomic group that I care about, take the huge `oma-groups.txt` file and keep only the rows (OMA groups) in which more than a fraction `XX` of the species that I care about appear. `XX` is currently set to 0.5, so this writes a separate file per taxon containing only the groups in which more than half of the species in that taxon appear.

In [148]:
# Select the OMA species codes that belong to one taxon and derive the minimum
# number of them that a group must contain to be kept.
taxa_code = 33090
# The original cell formatted an undefined name `toi` here, which raises
# NameError on a fresh kernel. `toi` is kept as an alias of `taxa_code`
# because other cells may refer to the taxon under that name.
toi = taxa_code
min_fraction = 0.5  # fraction of the taxon's species used for the threshold

# Pad with ';' so a taxon at the start or end of the lineage is matched too.
lineage_padded = ';' + df['Taxonomy_lineage'] + ';'
in_taxon = lineage_padded.str.contains(';{};'.format(taxa_code), regex=False)
taxa_of_interest = set(df.loc[in_taxon, 'OMA_code'])
n_taxa_thresh = round(len(taxa_of_interest) * min_fraction)

In [149]:
# Scan the full OMA groups file and keep, verbatim, every row in which more
# than `n_taxa_thresh` distinct species of interest are represented.
# NOTE(review): every tab-separated field after the group ID is treated as an
# entry ID whose first five characters are the species code. If the file has a
# non-member column in that range, confirm it cannot collide with a species
# code.
new_lines = []
with open('../Data/oma-groups.txt', 'r') as infile:
    # Iterate the file lazily instead of calling readlines(), so the whole
    # file is never held in memory; only the matching rows are retained.
    for line_number, line in enumerate(infile):
        if line_number < 3:
            continue  # the first three lines are skipped, as before
        species = {entry[:5] for entry in line.split('\t')[1:]}
        if len(taxa_of_interest & species) > n_taxa_thresh:
            new_lines.append(line)
print(len(new_lines))

8733


In [150]:
# Write the retained group rows to a per-taxon file. The file name is built
# from `taxa_code`, the taxon selected above; the original used an undefined
# name (`toi`), which fails on a fresh kernel.
with open('../Data/oma-groups-{}.txt'.format(taxa_code), 'w') as outfile:
    # Lines still carry their original newline, so they are written unchanged.
    outfile.writelines(new_lines)

**Read those filtered group files to find OMA groups that fit the criteria for all of the different taxa that I care about**

In [260]:
# Taxa whose filtered group files are combined. Smaller selections that were
# tried before:
#   required = ('5204', '4890')
#   required = ('5204', '4890', '33511', '33317')
required = ('5204', '4890', '33511', '33317', '33090')

# Map each taxon ID to the set of group IDs (first tab-separated field of each
# row) found in that taxon's filtered groups file.
taxa_dict = {}
for taxid in required:
    groups_path = '../Data/oma-groups-{}.txt'.format(taxid)
    with open(groups_path, 'r') as infile:
        taxa_dict[taxid] = {row.split('\t')[0] for row in infile}

In [261]:
# Report how many groups passed the filter for each required taxon.
for taxid in required:
    n_groups = len(taxa_dict[taxid])
    print(taxid, n_groups)

5204 3625
4890 4152
33511 14613
33317 3975
33090 8733


In [262]:
# Groups that passed the filter for every required taxon.
group_intersection = set.intersection(*taxa_dict.values())
print(len(group_intersection))

944


**JUST getting the taxa that I care about now for each of these OMA groups**

First get the species names that fit the bill

In [263]:
# Collect the OMA species codes that fall under any of the required taxa.
species_of_interest = []
# Pad the lineage with ';' on both sides so that a taxon at the very start or
# end of the lineage string is matched too; ';<id>;' alone only matches
# entries in the middle.
lineage_padded = ';' + df['Taxonomy_lineage'] + ';'
for taxid in required:
    in_taxon = lineage_padded.str.contains(';{};'.format(taxid), regex=False)
    species_of_interest.extend(df.loc[in_taxon, 'OMA_code'])
# Total vs. unique count: equal numbers mean no species was selected twice.
print(len(species_of_interest), len(set(species_of_interest)))

363 363


**Now isolate the full codes that I care about (species + geneid)**

In [264]:
# For every group shared by all required taxa, keep only the entry IDs whose
# five-character species prefix is one of the species of interest.
# NOTE(review): every field after the group ID is treated as an entry ID;
# confirm the file has no other column in that range.
groups_dict = {}
# A set gives constant-time membership tests; the original checked each entry
# ID against the list, rescanning it for every ID in every row.
species_lookup = set(species_of_interest)
with open('../Data/oma-groups-{}.txt'.format(required[0]), 'r') as infile:
    # Stream the file rather than materialising it with readlines().
    for line in infile:
        split_line = line.strip().split('\t')
        group = split_line[0]
        if group in group_intersection:
            groups_dict[group] = [
                entry for entry in split_line[1:] if entry[:5] in species_lookup
            ]

In [265]:
# Number of groups retained; len() of a dict counts its keys.
len(groups_dict)

944

In [266]:
# Peek at the first ten entry IDs of the second retained group.
second_group = list(groups_dict)[1]
groups_dict[second_group][:10]

['SCHCR04229',
 'SCHJY01101',
 'SCHOY04167',
 'SCHPO04533',
 'ARTOA00963',
 'PYROM06047',
 'TUBMM01011',
 'ASPAC05289',
 'ASPCL02675',
 'NEOFI09535']

In [267]:
# Reverse lookup: entry ID -> group ID. If an entry ID were listed under more
# than one group, the group seen last would win, as with plain assignment.
inverse_groups_dict = {
    geneid: group_id
    for group_id, members in groups_dict.items()
    for geneid in members
}

**Find the sequences (SLOWWWWW)**

We're doing a line-by-line iteration of a ~6GB file (~15,000,000 sequences) so this will take a while and that's to be expected.

This code is really not the best, but it runs in a reasonable few minutes, so all things considered it serves its purpose. All told, though, these manipulations eat up RAM and do a lot of brute-force searches that could/should surely be streamlined.

In [268]:
# Set up the FASTA parser over the full OMA sequence file.
seqs_path = '../Data/oma-seqs.fa'
seqs = SeqIO.parse(seqs_path, 'fasta')

In [269]:
# Bucket every sequence record that belongs to a retained group under that
# group's ID; records whose name is not a wanted entry ID are skipped.
# NOTE(review): `seqs` presumably yields each record only once, so re-create
# it before re-running this cell — confirm.
fastas_dict = {group_id: [] for group_id in groups_dict}
counter = 0  # number of sequence records scanned so far
for record in seqs:
    counter += 1
    # Explicit lookup instead of a bare `except: pass`, which would also
    # swallow KeyboardInterrupt and any genuine error during this long loop.
    group_id = inverse_groups_dict.get(record.name)
    if group_id is not None:
        fastas_dict[group_id].append(record)

In [270]:
# Sanity check: each group has as many sequence records as it has entry IDs.
for group_id, records in fastas_dict.items():
    expected = len(groups_dict[group_id])
    assert len(records) == expected

**Write out the groups**

In [271]:
# Output directory named after the required taxa, e.g. '5204_4890_...'.
save_dir = '../Data/OMA_orthologs/{}/'.format('_'.join(required))
# exist_ok=True creates the directory only if needed, without a separate
# existence check, and reuses `save_dir` instead of rebuilding the same path.
os.makedirs(save_dir, exist_ok=True)

In [272]:
# Write one FASTA file per group, named '<group ID>.fasta', into save_dir.
for group_id, records in fastas_dict.items():
    # Re-check completeness right before writing.
    assert len(records) == len(groups_dict[group_id])
    out_path = '{}{}.fasta'.format(save_dir, group_id)
    with open(out_path, 'w') as outfile:
        SeqIO.write(records, outfile, 'fasta')

# After all of this I should have fasta files for all the OMA groups / species sets that I want to carry forward for further analyses