In [None]:
# investigate conservation of exonic regions across annotated genomes

In [1]:
import os
import re
import sys
import csv
import time
import shutil
import random
import sqlite3
import requests
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO, AlignIO

In [2]:
base_dir = Path.cwd().parent.parent
data_dir = base_dir / 'HIV_Atlas_Creation/data'

reference_data = {
    11676: {"accid": "K03455.1",
            "organism": "HIV-1",
            "fasta": data_dir / "reference/K03455.1.fasta",
            "gtf": data_dir / "reference/K03455.1.gtf",
            "description": 'Human immunodeficiency virus type 1 (HXB2), complete genome; HIV1/HTLV-III/LAV reference genome'}
}
reference_ids = set([x["accid"] for x in reference_data.values()])

sequence_dir = data_dir / 'sequences'
assert sequence_dir.exists(), f"sequence_dir does not exist: {sequence_dir}"

annotation_dir = data_dir / 'annotation'
assert annotation_dir.exists(), f"annotation_dir does not exist: {annotation_dir}"

outdir = base_dir / 'HIV_Atlas_Experiments/results/exon_conservation'
outdir.mkdir(exist_ok=True)

accession_list_fname = data_dir / 'complete_sequences.2022.accessions'
summary_file =data_dir / "download.summary.tsv"

annotation_log_fname = data_dir / 'annotation/log.txt'
# load the annotation log file to extract relevant data into the database tsv
log_df = pd.read_csv(annotation_log_fname)

complete_log_df = log_df[log_df["status"] == "complete"].reset_index(drop=True)
complete_log_df.head()

Unnamed: 0,accid,organism,status,genome_score,junction_score,transcript_score,miniprot_score,guide_score
0,MH705157.1,HIV-1,complete,0.871767,0.944444,0.79909,0.6616,0.559152
1,DQ396400.1,HIV-1,complete,0.901976,0.958333,0.845619,0.661978,0.720496
2,KU168256.1,HIV-1,complete,0.878658,0.9375,0.819815,0.55941,0.71985
3,MH705158.1,HIV-1,complete,0.874285,0.958333,0.790237,0.407212,0.753737
4,MH705151.1,HIV-1,complete,0.82361,0.914352,0.732868,0.391839,0.564241


In [3]:
soft_dir = Path().resolve().parent / 'soft/vira'  # Note: removed extra 'vira'
sys.path.append(str(soft_dir))

from vira.classes.txgroup import Transcriptome, Gene, Bundle
from vira.classes.transcript import Transcript, Object

In [4]:
mafft_bin = "mafft"

In [9]:
# file declarations
exons_data_fname = outdir / "exons.tsv"

In [5]:
# preapre genome sequence data
qry_genome = {rec.id: rec.seq for rec in SeqIO.parse(reference_data[11676]["fasta"], "fasta")}
assert len(qry_genome) == 1
qry_genome = qry_genome[list(qry_genome.keys())[0]]

# load acceptors and donors from the reference
qry_tome = Transcriptome()
qry_tome.build_from_file(reference_data[11676]["gtf"])
donors = {}
acceptors = {}
for qry_tx in qry_tome:
    for it in qry_tx.introns_it():
        donors[it[0]] = None
        acceptors[it[1]] = None

# load acceptor to donor pairing from the reference for each unique exon
exons = {}
for qry_tx in qry_tome:
    for exon in qry_tx.exons:
        acceptor = exon[0] if exon[0] in acceptors else 'start'
        donor = exon[1] if exon[1] in donors else 'end'
        exon_seq = qry_genome[exon[0]-1:exon[1]-1]
        exons[(acceptor, donor)] = {"query_coordinates": (exon[0], exon[1]), "query_sequence": exon_seq, "targets": {}}
        
        
with open(summary_file,"r") as inFP:
    reader = csv.reader(inFP, delimiter="\t")
    # skip header
    next(reader)
    for lno, (accid, is_ref_genome, organism, taxid, fasta, gff, is_valid, ref_start, ref_end) in enumerate(reader):
        if accid in reference_ids:
            continue

        if (annotation_dir / accid / f"{accid}.vira.gtf").exists():
            # load line from the annotation log to get the score
            if len(log_df[log_df["accid"]==accid]) == 0:
                print(f"Warning: {accid} not found in log file, skipping")
                continue
            
            log_row = log_df[log_df["accid"]==accid]
            assert len(log_row) == 1, f"Error: multiple rows found for {accid}"

            status = log_row["status"].values[0]
            if not status == "complete":
                print(f"Warning: {accid} is not complete, skipping")
                continue
            
            if not int(taxid) == 11676:
                continue

            acc_outdir = annotation_dir / accid
            trg_fasta_fname = sequence_dir / f"{accid}.fasta"
            trg_gtf_fname = acc_outdir / f"{accid}.vira.gtf"

            trg_genome = {rec.id: rec.seq for rec in SeqIO.parse(trg_fasta_fname, "fasta")}
            assert len(trg_genome) == 1
            trg_genome = trg_genome[list(trg_genome.keys())[0]]

            for k,v in exons.items():
                v["targets"][accid] = {"target_coordinates": None, "target_sequence": None}

            # load map of donors and acceptors between query and target
            junction_stats_fname = acc_outdir / f"{accid}.vira.junction_stats.tsv"
            with open(junction_stats_fname, "r") as inFP:
                reader = csv.reader(inFP, delimiter="\t")
                # skip header
                next(reader)
                for lno, (position, query_position, site_type, map_consistency, sequence, query_sequence) in enumerate(reader):
                    position = int(position)
                    query_position = int(query_position)
                    if site_type == "donor":
                        if query_position+1 in donors:
                            donors[query_position+1] = position
                    elif site_type == "acceptor":
                        if query_position in acceptors:
                            acceptors[query_position] = position

            # skip if any of the query donors or acceptors are note matched
            if None in acceptors.values() or None in donors.values():
                print(f"Skipping {accid} due to missing acceptors or donors")
                continue

            # for every query exon we need to extract the corresponding target exon
            # there may be additional introns in the middle of the target -  we shall treat them as gaps

            # hw do I get start/end coordinates though?
            # iterate over target transcriptome and get min/max transcript coordinates
            trg_tome = Transcriptome()
            trg_tome.build_from_file(trg_gtf_fname)
            trg_start = np.inf
            trg_end = -np.inf
            for trg_tx in trg_tome:
                trg_start = min(trg_start, trg_tx.start)
                trg_end = max(trg_end, trg_tx.end)

            # load sequence data for the target
            for (qry_exon_start,qry_exon_end), exon_data in exons.items():
                trg_exon_start = trg_start if qry_exon_start == 'start' else acceptors[qry_exon_start]
                trg_exon_end = trg_end if qry_exon_end == 'end' else donors[qry_exon_end]
                
                # get the sequence for the target exon
                exon_data["targets"][accid]["target_coordinates"] = (trg_exon_start, trg_exon_end)
                exon_data["targets"][accid]["target_sequence"] = trg_genome[trg_exon_start-1:trg_exon_end-1]
                
exons

{(5977, 'end'): {'query_coordinates': (5977, 9636),
  'query_sequence': Seq('GAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACTCATCAAGC...TTC'),
  'targets': {'MH705157.1': {'target_coordinates': (5976, 9687),
    'target_sequence': Seq('GAAGAAGCGGAGACCCCGACGAGGAACTTCTCAAGGCAGTAAGGATCATCAAAA...GTG')},
   'DQ396400.1': {'target_coordinates': (5417, 9110),
    'target_sequence': Seq('GAAGAAGCGGAGACGACGTCGAGGATCTCCTCCAAGCAGTAAGGATCATCAAAA...ATA')},
   'KU168256.1': {'target_coordinates': (5978, 9704),
    'target_sequence': Seq('GAAGAAGCGGAGACGCCGACGCGGAACTCCTCACAGCAGTAAGAATCATCAAAA...TTC')},
   'MH705158.1': {'target_coordinates': (5983, 9613),
    'target_sequence': Seq('GAAGAAGCGGAGACAGCGACGAGCAACTCCTCACAGCAGTAAGGATCATCAAAA...GAG')},
   'MH705151.1': {'target_coordinates': (5963, 9689),
    'target_sequence': Seq('GAAGAAGCGGAGACCCCGACGAAGAACTCCTCAGGGCAGTAAGGATCATCAAAA...AGT')},
   'MH705153.1': {'target_coordinates': (5811, 9499),
    'target_sequence': Seq('GAAGAAGCGGAAACCCCGACGAGGAACTCCT

In [10]:
# save exon data into a file such that we can load it later
with open(exons_data_fname, 'w', newline='') as outFP:
    writer = csv.writer(outFP, delimiter='\t')
    writer.writerow([
        'query_start', 'query_end', 'query_sequence',
        'target_name', 'target_start', 'target_end', 'target_sequence'
    ])
    for (query_start, query_end), query_data in exons.items():
        (qc_start,qc_end) = query_data['query_coordinates']
        query_seq = query_data['query_sequence']
        for target_name, target_data in query_data['targets'].items():
            (tc_start,tc_end) = target_data['target_coordinates']
            target_seq = target_data['target_sequence']
            writer.writerow([
                qc_start, qc_end, query_seq,
                target_name, tc_start, tc_end, target_seq
            ])

In [7]:
# write a fasta file of sequences for each exon to be aligned with mafft
for (acc,donor), exon_data in exons.items():
    out_ref_fname = outdir / f"{acc}_{donor}_{reference_data[11676]['accid']}.fasta"
    with open(out_ref_fname, "w") as outFP:
        outFP.write(f">{reference_data[11676]['accid']}\n{exon_data['query_sequence']}\n")
    outfname = outdir / f"{acc}_{donor}.fasta"
    with open(outfname, "w") as outFP:
        for target, target_data in exon_data["targets"].items():
            outFP.write(f">{target}\n{target_data['target_sequence']}\n")
            
    # run mafft
    # mafft --6merpair --addfragments othersequences referencesequence > output
    cmd = [mafft_bin, '--6merpair', '--addfragments', str(outfname), str(out_ref_fname)]
    msa_fname = outdir / f"{acc}_{donor}.msa.fasta"
    print(" ".join(cmd) + " > " + str(msa_fname))
    with open(msa_fname, 'w') as f:
        subprocess.run(cmd, stdout=f)

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5977_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5977_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5977_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 10199 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/start_744.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/start_744_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/start_744.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 290 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5961_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5961_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5961_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 10216 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_5464.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_5464_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_5464.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 137 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5955_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5955_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5955_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 10219 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5937_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5937_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5937_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 10251 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/4913_4963.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/4913_4963_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/4913_4963.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 89 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077          

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/start_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/start_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/start_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 18902 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/8379_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/8379_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/8379_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 3021 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077        

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5977_6046.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5977_6046_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5977_6046.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 163 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5961_6046.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5961_6046_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5961_6046.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 180 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5955_6046.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5955_6046_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5955_6046.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 183 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5937_6046.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5937_6046_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5937_6046.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 215 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5778_6046.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5778_6046_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5778_6046.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 467 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077         

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/4913_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/4913_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/4913_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 12112 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_6046.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_6046_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_6046.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 1151 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077        

mafft --6merpair --addfragments /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_end.fasta /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_end_K03455.1.fasta > /ccb/salz8-3/avaraby1/HIV_Atlas/HIV_Atlas_Experiments/results/exon_conservation/5390_end.msa.fasta


nadd = 2077
ppenalty_ex = -10
nthread = 0
blosum 62 / kimura 200
sueff_global = 0.100000
norg = 1
njobc = 2
generating a scoring matrix for nucleotide (dist=200) ... done


Making a distance matrix ..

There are 11187 ambiguous characters
    1 / 1
done.

fTEP 499 / 2077                    
STEP 500 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 600 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 700 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 800 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 900 / 2077                    
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
STEP 1000 / 2077       

In [11]:
exons_df = pd.read_csv(exons_data_fname, sep="\t")
exons_df.head()

Unnamed: 0,query_start,query_end,query_sequence,target_name,target_start,target_end,target_sequence
0,5977,9636,GAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACT...,MH705157.1,5976,9687,GAAGAAGCGGAGACCCCGACGAGGAACTTCTCAAGGCAGTAAGGAT...
1,5977,9636,GAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACT...,DQ396400.1,5417,9110,GAAGAAGCGGAGACGACGTCGAGGATCTCCTCCAAGCAGTAAGGAT...
2,5977,9636,GAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACT...,KU168256.1,5978,9704,GAAGAAGCGGAGACGCCGACGCGGAACTCCTCACAGCAGTAAGAAT...
3,5977,9636,GAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACT...,MH705158.1,5983,9613,GAAGAAGCGGAGACAGCGACGAGCAACTCCTCACAGCAGTAAGGAT...
4,5977,9636,GAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACT...,MH705151.1,5963,9689,GAAGAAGCGGAGACCCCGACGAAGAACTCCTCAGGGCAGTAAGGAT...


In [None]:
# for each exon
# for exach reference position
# compute the conservation score

def compute_consensus_conservation(msa_file):
    """
    Compute conservation scores based on the fraction of sequences matching 
    the reference (first sequence) at each position.
    
    Parameters:
    -----------
    msa_file : str
        Path to the MSA file in FASTA format
    
    Returns:
    --------
    numpy.ndarray
        Array of conservation scores for each position in the reference sequence
    """
    alignment = AlignIO.read(msa_file, "fasta")
    reference = str(alignment[0].seq)
    scores = []
    
    for i in range(len(reference)): # iterate over reference positions
        if reference[i] == '-':
            continue
        
        col = [str(record.seq)[i] for record in alignment]
        ref_nt = reference[i]

        num_entries = len(col)
        
        matches = sum(1 for nt in col if nt == ref_nt) # matchines
        score = matches / num_entries
        
        scores.append(score)
    
    return np.array(scores)

all_scores = {}

# write a fasta file of sequences for each exon to be aligned with mafft
for (acc,donor), exon_data in exons.items():
    msa_fname = outdir / f"{acc}_{donor}.msa.fasta"
    print(f"Computing conservation for {acc}_{donor}")
    scores = compute_consensus_conservation(msa_fname)
    all_scores[(acc,donor)] = scores

all_scores

Computing conservation for 5977_end
Computing conservation for start_744
Computing conservation for 5961_end


Computing conservation for 5390_5464
Computing conservation for 5955_end
Computing conservation for 5937_end
Computing conservation for 4913_4963
Computing conservation for start_end
Computing conservation for 8379_end
Computing conservation for 5977_6046
Computing conservation for 5961_6046
Computing conservation for 5955_6046
Computing conservation for 5937_6046
Computing conservation for 5778_6046
Computing conservation for 4913_end


In [None]:
# save the scores to a file
scores_fname = outdir / "exon_conservation_scores.tsv"
with open(scores_fname,"w") as outFP:
    for (acc,donor), scores in all_scores:
        outFP.write(f"{acc}\t{donor}\t{','.join(scores)}\n")

In [None]:
for (acc,donor), scores in all_scores:
    # Plot as a heatmap
    plt.figure(figsize=(12, 2))
    sns.heatmap([scores], cmap="Blues", cbar_kws={'label': 'Conservation'})
    plt.xlabel('Reference Position')
    plt.yticks([])
    plt.title(f"{acc}_{donor}")
    plt.tight_layout()
    plt.savefig("conservation_heatmap.png", dpi=300)