In [None]:
import os
import sys
import pickle
import requests
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import trange
from tqdm.notebook import tqdm

In [None]:
# Locate the root of the enclosing git repository (searching upwards from the
# current directory) and make it importable, so the project-local `function`
# package can be loaded and data paths can be built relative to the repo root.
import git

repo = git.Repo('.', search_parent_directories=True)
work_path = Path(repo.working_tree_dir)

# Append the repo root to sys.path only once, even if this cell is re-run.
if not sys.path.count(str(work_path)):
    sys.path.append(str(work_path))

In [None]:
from function.omaseq import FetchOmaSeqBatch
from function.omaseq import TaxSeqFilter
from function.utilities import fasta_seq_aa_check
from function.utilities import get_fasta_filename
from function.alignment import Alignment

In [None]:
# Root directory for every FASTA file handled by this notebook; edit this line
# to store the files somewhere else.
oma_path = work_path.joinpath("1_prepare_training_data", "oma_all")

# a: FASTA files downloaded from OMA
a_oma_path = oma_path.joinpath("a_oma")

# b: FASTA files after the 20-amino-acid character check
b_seqaa_checked_path = oma_path.joinpath("b_seqaa_checked")

# c: homologs grouped (filtered) by taxonomy id
c_grouped_path = oma_path.joinpath("c_grouped")

# d: sequence-alignment FASTA files
d_alignment_path = oma_path.joinpath("d_alignment")

# a_oma: get human protein homologs (19909 in total) from OMA

In [None]:
# Get the human OMA groups and save them to 1-1_oma_group_ids.pkl.
#
# The generation code below is kept commented out for reference, so running
# this cell only performs the import. The following cell loads the pickle
# file, which therefore has to exist already; uncomment the code below to
# (re)build it.

from omadb import Client

# c = Client()
# human_genome = c.genomes.proteins('HUMAN',progress=True)
# human_genome = human_genome.as_dataframe()

# NOTE(review): group id 0 presumably marks proteins without an OMA group,
# which is why it is removed from the list -- confirm against the OMA docs.
# oma_group_ids = human_genome['oma_group'].unique().tolist()
# oma_group_ids.remove(0)

# with open(str(work_path / "1_prepare_training_data" / "1-1_oma_group_ids.pkl"), 'wb') as f:
#     pickle.dump(oma_group_ids, f)

In [None]:
# Read the list of OMA group ids back from 1-1_oma_group_ids.pkl.
# NOTE: unpickling can execute arbitrary code -- only load a file that was
# produced by this project.
with (work_path / "1_prepare_training_data" / "1-1_oma_group_ids.pkl").open('rb') as pkl_file:
    oma_group_ids = pickle.load(pkl_file)

In [None]:
# Fetch the homologs of every OMA group id and save them to FASTA via
# FetchOmaSeqBatch.get_oma_seq(group_id, a_oma_path).
# Groups whose download raises are reported together with the error and
# collected in failed_list so they can be inspected or retried afterwards.
fetchomaseq = FetchOmaSeqBatch()
failed_list = []

progress_bar = tqdm(oma_group_ids, leave=True)
for oma_group_id in progress_bar:
    # set_description() refreshes the bar by default, no explicit refresh needed
    progress_bar.set_description(str(oma_group_id))
    try:
        fetchomaseq.get_oma_seq(oma_group_id, a_oma_path)
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop this long-running
        # loop, and show the cause instead of silently discarding it.
        print("{} failed: {!r}".format(oma_group_id, err))
        failed_list.append(oma_group_id)

# b_seqaa_checked: check that every sequence contains only the 20 valid amino-acid characters

In [None]:
# Stage b: run fasta_seq_aa_check on every FASTA file downloaded in stage a
# and write the result to b_seqaa_checked_path under the same file name.
#
# Only a_oma_path is searched. Recursively globbing the whole oma_path would
# also match the outputs of stages b-d once they exist, so a re-run would
# feed already processed files back in as inputs (and could read and write
# the very same file in b_seqaa_checked_path).
fasta_pathlist = list(a_oma_path.rglob("*.fasta"))

progress_bar = tqdm(fasta_pathlist, leave=True)
for fasta_path in progress_bar:
    file_name = get_fasta_filename(fasta_path)
    # set_description() refreshes the bar by default
    progress_bar.set_description(file_name)

    checked_fasta_path = b_seqaa_checked_path / "{}.fasta".format(file_name)
    fasta_seq_aa_check(fasta_path, checked_fasta_path)

# c_grouped: filter sequences into new fasta files by tax id

In [None]:
# NCBI taxonomy id passed to TaxSeqFilter in the next cell.
# ref: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=7711
tax_id = 7711  # taxonomy Chordata

In [None]:
# Stage c: pass every checked FASTA file through TaxSeqFilter(tax_id) and
# write the result to c_grouped/<tax_id>/ under the same file name.
fasta_pathlist = list(Path(b_seqaa_checked_path).rglob("*.fasta"))
taxfilter = TaxSeqFilter(tax_id)

# The output directory is the same for every file, so create it once before
# the loop. parents=True also creates c_grouped itself when it is missing;
# a plain mkdir(exist_ok=True) raises FileNotFoundError in that case.
c_grouped_tax_path = c_grouped_path / str(tax_id)
c_grouped_tax_path.mkdir(parents=True, exist_ok=True)

progress_bar = tqdm(fasta_pathlist, leave=True)
for oma_fasta_path in progress_bar:
    file_name = get_fasta_filename(oma_fasta_path)
    # set_description() refreshes the bar by default
    progress_bar.set_description(file_name)

    grouped_fasta_path = c_grouped_tax_path / "{}.fasta".format(file_name)
    taxfilter.taxfilter(oma_fasta_path, grouped_fasta_path)

# d_alignment: sequence alignment

In [None]:
# Instantiate the Alignment helper (imported from function.alignment); its
# alignment_path() method is called in the next cell.
alignment = Alignment()

In [None]:
# Taxonomy id whose grouped FASTA files are aligned (same value as used for
# the taxonomy filtering step).
tax_id = 7711

# Input: grouped FASTA files of this taxonomy id.
c_grouped_path_tax = c_grouped_path.joinpath(str(tax_id))
# Output directory for the alignments.
# NOTE(review): "alied" looks like a misspelling of "aligned"; it is part of
# the directory name on disk, so it is left unchanged here.
d_alignment_path_tax = d_alignment_path.joinpath("{}_alied".format(str(tax_id)))

# Align everything found in the input directory. The return value is stored
# in failed_list, replacing the list of failed downloads from the OMA step.
failed_list = alignment.alignment_path(
    c_grouped_path_tax,
    d_alignment_path_tax,
    delete=False,
)