In [None]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import trange

import matplotlib as mpl
import matplotlib.pyplot as plt

from function.omaseq import TaxSeqFilter
from function.alignment import Alignment
from function.utilities import get_subset
from function.seqfilter import FastaExtreFilter
from function.omaseq import FetchOmaSeqBatch

# Parameters

In [None]:
#####CHANGE HERE#####
# UniProt accession of the query protein. It is passed to the OMA fetch and
# used as the FASTA file name ("<uniprot_id>.fasta") in every output directory.
uniprot_id = "Q13148"
#####CHANGE HERE#####

In [None]:
# Taxonomy ids to group by, from the broadest clade to the narrowest:
# chordata, vertebrata, tetrapoda, mammalia
tax_ids = [
    7711,
    7742,
    32523,
    40674,
]

# Settings forwarded to the "extre" filter step
gap_filter_num = 20
max_filter_seq = 3

# Output directories, one per pipeline stage ("alied" = aligned)
fasta_dir = Path("./output/fasta")
oma_path = fasta_dir / "a_oma"
grouped_path = fasta_dir / "b_grouped"
alied_path = fasta_dir / "c_alied"
extre_filtered_path = fasta_dir / "d_extre_filtered"

# Fetch OMA sequences

In [None]:
# Fetch the sequences for `uniprot_id` from the OMA browser
# (https://omabrowser.org/oma/home/) and store them under `oma_path`;
# the grouping step reads oma_path / "<uniprot_id>.fasta".
# NOTE(review): the original note calls these "paralogs" -- confirm which kind
# of homologs FetchOmaSeqBatch actually returns.
fetchomaseq = FetchOmaSeqBatch()
fetchomaseq.get_oma_seq(uniprot_id, oma_path)

# Group by taxonomy id

In [None]:
# Write one FASTA file per taxonomy id, each produced by running TaxSeqFilter
# on the same OMA download.
fasta_name = f"{uniprot_id}.fasta"
oma_fasta_path = oma_path / fasta_name  # identical input for every taxon

progress = trange(len(tax_ids), leave=True)
for idx in progress:
    tax_id = tax_ids[idx]
    # Show the taxon currently being processed on the progress bar.
    progress.set_description(str(tax_id))
    progress.refresh()

    # Output goes to <grouped_path>/<tax_id>/<uniprot_id>.fasta
    grouped_tax_path = grouped_path / str(tax_id)
    grouped_tax_path.mkdir(parents=True, exist_ok=True)
    grouped_fasta_path = grouped_tax_path / fasta_name

    taxfilter = TaxSeqFilter(tax_id)
    taxfilter.taxfilter(oma_fasta_path, grouped_fasta_path)

# Alignment

In [None]:
# Align each taxon's grouped FASTA file.
# Per the original note this uses Biopython's ClustalOmegaCommandline, so
# Clustal Omega must be installed: http://www.clustal.org/omega/
alignment = Alignment()
fasta_name = f"{uniprot_id}.fasta"

progress = trange(len(tax_ids), leave=True)
for idx in progress:
    tax_dir = str(tax_ids[idx])
    # Show the taxon currently being processed on the progress bar.
    progress.set_description(tax_dir)
    progress.refresh()

    # Input: <grouped_path>/<tax_id>/<uniprot_id>.fasta
    grouped_fasta_path = grouped_path / tax_dir / fasta_name

    # Output: <alied_path>/<tax_id>/<uniprot_id>.fasta ("alied" = aligned)
    alied_tax_path = alied_path / tax_dir
    alied_tax_path.mkdir(parents=True, exist_ok=True)
    alied_fasta_path = alied_tax_path / fasta_name

    alignment.alignment_single(grouped_fasta_path, alied_fasta_path)

# Extre filter

In [None]:
# Remove outlier homologs from each alignment. Per the original note: some
# sequences show a very long gap in the aligned ("alied") FASTA file where the
# other sequences have none; those are considered "special" and are removed.
fastaextrefilter = FastaExtreFilter()
fasta_name = f"{uniprot_id}.fasta"

progress = trange(len(tax_ids), leave=True)
for idx in progress:
    tax_dir = str(tax_ids[idx])
    # Show the taxon currently being processed on the progress bar.
    progress.set_description(tax_dir)
    progress.refresh()

    # Input: <alied_path>/<tax_id>/<uniprot_id>.fasta
    alied_fasta_path = alied_path / tax_dir / fasta_name

    # Output: <extre_filtered_path>/<tax_id>/<uniprot_id>.fasta
    extre_filtered_tax_path = extre_filtered_path / tax_dir
    extre_filtered_tax_path.mkdir(parents=True, exist_ok=True)
    extre_filtered_tax_fasta_path = extre_filtered_tax_path / fasta_name

    extre_index = fastaextrefilter.fasta_extre_filter(
        alied_fasta_path,
        extre_filtered_tax_fasta_path,
        gap_filter_num=gap_filter_num,
        max_filter_seq=max_filter_seq,
    )
    # Print the filter's return value for this taxon.
    print(extre_index)
    