In [1]:
import os
import re
from shutil import rmtree

import pandas as pd

In [2]:
def seqs_to_fasta(headers:list,seqs:list,outpath:str):
    """Write paired headers and sequences to ``outpath`` in FASTA format.

    Each pair becomes a ``>header`` line followed by the sequence on a single
    line. Pairs are formed with ``zip``, so if the two lists differ in length
    the surplus items of the longer one are ignored. ``outpath`` is opened in
    write mode, so any previous content is replaced.
    """
    with open(outpath, "w") as fasta_file:
        fasta_file.writelines(
            f">{name}\n{residues}\n" for name, residues in zip(headers, seqs)
        )


In [3]:
def muscle_align(inpath:str,outpath:str):
    
    """align inpath fasta file using muscle and save to outpath"""
    try:os.remove(outpath)
    except:pass
    !muscle -in $inpath -out $outpath -diags


In [4]:
def muscle_align_on_profile(inpath1:str,inpath2:str,outpath:str):
    
    """align inpath2 sequences onto inpath1 profile using muscle, and save to outpath"""
    
    try:os.remove(outpath)
    except:pass
    ! muscle -profile -in1 $inpath1 -in2 $inpath2 -out $outpath


In [5]:
def clustalo_align(inpath:str,outpath:str,force:bool=False):
    """align inpath fasta file using clustalo and save to outpath"""
    try:os.remove(outpath)
    except:pass
    if force:
        !clustalo -i $inpath -o $outpath --threads 12 -v --force
    else:
        !clustalo -i $inpath -o $outpath --threads 12 -v


In [6]:
def clustalo_align_on_profile(inpath1:str,inpath2:str,outpath:str,force:bool=False):
    """align inpath fasta file using clustalo and save to outpath"""
    try:os.remove(outpath)
    except:pass
    if force:
        !clustalo --profile1 $inpath1 --profile2 $inpath2 -o $outpath --threads 12 -v --is-profile --force
    else:
        !clustalo --profile1 $inpath1 --profile2 $inpath2 -o $outpath --threads 12 -v --is-profile


In [7]:
def parse_fasta(fasta_path):
    """Read a FASTA file into a dict mapping each header to its sequence.

    A record starts at a line whose first character is ``>``; the rest of
    that line (without the ``>``) is the header. All following lines up to the
    next header are concatenated, without newlines, to form the sequence.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        ``{header: sequence}`` in file order. If a header occurs more than
        once, the sequence of its last occurrence is kept. Lines that appear
        before the first header are ignored.
    """
    headers, seq_parts = [], []
    with open(fasta_path, "r") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if line.startswith(">"):
                # Only a leading ">" opens a record, so ">" inside a header is kept as text.
                headers.append(line[1:])
                seq_parts.append([])
            elif seq_parts:
                # Collect sequence lines as-is; the header text is never stripped
                # out of the sequence, even if it happens to occur in it.
                seq_parts[-1].append(line)

    return dict(zip(headers, ("".join(parts) for parts in seq_parts)))
    


In [8]:
def import_alignment_as_df(fasta_alignment_path):
    """Load a FASTA alignment file as a DataFrame of single characters.

    The file is read with ``parse_fasta``. Each record becomes one row,
    labelled by its header, and each character of the sequence is placed in
    its own column (column ``i`` holds the character at position ``i``).
    """
    records = parse_fasta(fasta_alignment_path)

    row_labels = list(records.keys())
    residue_rows = [list(sequence) for sequence in records.values()]

    return pd.DataFrame(residue_rows, index=row_labels)
