In [1]:
import os
import warnings

import T2T_ACE.alignment_utilities as au
import T2T_ACE.interval_parsing  as ip
import T2T_ACE.alignment_visualization_utilities as avu

from T2T_ACE.validator import get_flanking_pairs

warnings.filterwarnings( "ignore", module = "seaborn\..*" )
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )

In [2]:
class ReferenceLocations:
    """Filesystem locations of the reference FASTA files used by this notebook.

    Attributes:
        whole_genome: When True the whole-genome FASTA files are selected;
            when False (the default) the chr22-only files are selected.
        resource_dir: Directory that contains the FASTA files. Defaults to the
            original hard-coded location and can be overridden by setting the
            ``T2T_ACE_RESOURCES`` environment variable before this cell runs.
        hg002t2t, hg002t2tmat, hg002t2tpat, hg38: Paths (as strings) to the
            combined HG002, maternal HG002, paternal HG002 and hg38 FASTA files.
    """
    whole_genome = False
    # Trailing slashes are stripped so the joined paths never contain "//".
    resource_dir = os.environ.get("T2T_ACE_RESOURCES", "/Users/fleharty/resources").rstrip("/")
    if whole_genome:
        hg002t2t = f"{resource_dir}/hg002_v1.0.fasta.gz"
        hg002t2tmat = f"{resource_dir}/hg002mat_v1.0.fasta.gz"
        hg002t2tpat = f"{resource_dir}/hg002pat_v1.0.fasta.gz"
        # Note: the whole-genome hg38 file is an uncompressed .fasta.
        hg38 = f"{resource_dir}/Homo_sapiens_assembly38.fasta"
    else:
        hg002t2t = f"{resource_dir}/hg002_v1.0.chr22.fasta.gz"
        hg002t2tmat = f"{resource_dir}/hg002mat_v1.0.chr22.fasta.gz"
        hg002t2tpat = f"{resource_dir}/hg002pat_v1.0.chr22.fasta.gz"
        hg38 = f"{resource_dir}/Homo_sapiens_assembly38.chr22.fasta.gz"


In [3]:
# Load the HG002 reference from the path configured on ReferenceLocations; the
# resulting object is passed as `hg002t2t` to the get_flanking_pairs calls below.
hg002t2t = au.load_reference(ReferenceLocations.hg002t2t)

INFO:root:Loading reference from: /Users/fleharty/resources/hg002_v1.0.chr22.fasta.gz


In [4]:
# Load the hg38 reference from the path configured on ReferenceLocations; the
# resulting object is passed as `hg38` to the get_flanking_pairs calls below.
hg38 = au.load_reference(ReferenceLocations.hg38)

INFO:root:Loading reference from: /Users/fleharty/resources/Homo_sapiens_assembly38.chr22.fasta.gz


In [5]:
# SIMPLE SMALL HET DELETION
# NOTE: this is a chr1 interval. With ReferenceLocations.whole_genome = False
# only chr22 FASTA files are loaded, which is why the recorded run fails with
# "Chromosome chr1 not found in reference genome". Use whole-genome references
# to evaluate it.
simple_het_deletion = "chr1:103900576-103901103"
# Earlier evaluate_deletion call, currently disabled:
#ev = evaluate_deletion(ReferenceLocations.hg38, ReferenceLocations.hg002t2t, "chr1:103900576-103901103", hg38, hg002t2t, True)
get_flanking_pairs(simple_het_deletion, ReferenceLocations.hg38, hg38, hg002t2t)


ERROR:root:Error: Chromosome chr1 not found in reference genome.


KeyError: 'Chromosome chr1 not found in reference genome.'

In [None]:
# SIMPLE SMALL HOM DELETION
# Runs get_flanking_pairs for chr1:176509082-176509637 using the hg38 path, the
# loaded hg38 reference and the loaded HG002 reference.
# NOTE(review): chr1 interval - with ReferenceLocations.whole_genome = False only
# chr22 references are loaded, so this presumably raises the same KeyError as the
# het-deletion cell; confirm.
# Earlier evaluate_deletion call, currently disabled:
#ev = evaluate_deletion(ReferenceLocations.hg38, ReferenceLocations.hg002t2t, "chr1:176509082-176509637", hg38, hg002t2t, True)
get_flanking_pairs("chr1:176509082-176509637", ReferenceLocations.hg38, hg38, hg002t2t)


In [None]:
# COMPLEX DELETION chr1:247687159-247693213
# Open question: this interval has 6 matches to hg38 chr1 - why?
# NOTE(review): chr1 interval - requires whole-genome references; with the
# chr22-only files it presumably fails with a KeyError. Confirm.
# Earlier evaluate_deletion call, currently disabled:
#ev = evaluate_deletion(ReferenceLocations.hg38, ReferenceLocations.hg002t2t, "chr1:247687159-247693213", hg38, hg002t2t)
get_flanking_pairs("chr1:247687159-247693213", ReferenceLocations.hg38, hg38, hg002t2t)


In [None]:
# This interval is a DUP call; here we check how it behaves when treated as a deletion.
# NOTE(review): chr1 interval - requires whole-genome references; with the
# chr22-only files it presumably fails with a KeyError. Confirm.
# Earlier evaluate_deletion call, currently disabled:
#ev = evaluate_deletion(ReferenceLocations.hg38, ReferenceLocations.hg002t2t, "chr1:248407554-248446847", hg38, hg002t2t)
get_flanking_pairs("chr1:248407554-248446847", ReferenceLocations.hg38, hg38, hg002t2t)

In [None]:
# This is a Dragen DEL call that is filtered by MinQUAL.
# NOTE(review): chr1 interval - requires whole-genome references; with the
# chr22-only files it presumably fails with a KeyError. Confirm.
# Earlier evaluate_deletion call, currently disabled:
#ev = evaluate_deletion(ReferenceLocations.hg38, ReferenceLocations.hg002t2t, "chr1:3643704-3644335", hg38, hg002t2t)
get_flanking_pairs("chr1:3643704-3644335", ReferenceLocations.hg38, hg38, hg002t2t)

In [None]:
# This is a Dragen DEL call that is filtered by cnvLength.
# Observation: the joined sequence on hg002 only matches (0-50) on PATERNAL, and
# there is no (51-100) match on PATERNAL.
# Hypothesis: the Dragen caller may be underestimating the size of the event.
# NOTE(review): chr1 interval - requires whole-genome references; with the
# chr22-only files it presumably fails with a KeyError. Confirm.
# Earlier evaluate_deletion call, currently disabled:
#ev = evaluate_deletion(ReferenceLocations.hg38, ReferenceLocations.hg002t2t, "chr1:14109814-14112366", hg38, hg002t2t)
get_flanking_pairs("chr1:14109814-14112366", ReferenceLocations.hg38, hg38, hg002t2t)

In [None]:
# Flanking-pair lookup for chr1:125079843-125085826 (no event description was
# recorded for this interval).
# NOTE(review): chr1 interval - requires whole-genome references; confirm.
get_flanking_pairs("chr1:125079843-125085826", ReferenceLocations.hg38, hg38, hg002t2t)


In [None]:
# Interval strings handed to avu.PlotIntervals in the next cell.
# NOTE(review): presumably `stuff` is the query interval and `other_stuff` are
# its hits on the HG002 paternal/maternal haplotypes - confirm.
stuff = [
    'chr10:39521557-39533371',
]
other_stuff = [
    'chr10_PATERNAL:39521557-39533371',
    'chr10_MATERNAL:39475935-39487740',
    'chr10_MATERNAL:39538510-39550326',
    'chr10_MATERNAL:39486365-39498169',
    'chr10_MATERNAL:39496794-39508598',
    'chr10_MATERNAL:39507223-39519027',
    'chr10_MATERNAL:39517652-39529456',
    'chr10_MATERNAL:39528081-39539885',
]

In [None]:
# Build a PlotIntervals object from the two interval lists defined in the
# previous cell and call its plot_interval_on_chromo() method.
avu.PlotIntervals(stuff, other_stuff).plot_interval_on_chromo()

In [None]:
# Run get_flanking_pairs for every interval listed (one per line) in
# chr1_deletions.txt and collect the results, in file order, in `stuff`.
# NOTE: `stuff` is rebound here; it no longer holds the chr10 interval list.
stuff = []
with open('../resources/chr1_deletions.txt', 'r') as file:
    for raw_line in file:
        line = raw_line.strip()  # Remove leading/trailing whitespaces and newline characters
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not events.
            continue
        try:
            #print(line, check_interval(line, ReferenceLocations.hg38, hg38, hg002t2t))
            print("Event:", line, ip.interval_size(line))
            stuff.append(get_flanking_pairs(line, ReferenceLocations.hg38, hg38, hg002t2t))
            print()
        except Exception as err:
            # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate, and chain the original error so its traceback is kept.
            raise ValueError("Error on line: " + line) from err


In [None]:
# Display the get_flanking_pairs result collected for the first line of chr1_deletions.txt.
stuff[0]

In [None]:
# Display the get_flanking_pairs result collected for the second line of chr1_deletions.txt.
stuff[1]

In [None]:
# Display the 'ref_flank' entry of the third collected result.
stuff[2]['ref_flank']

In [None]:
# Display the 'truth_flank' entry of the third collected result.
stuff[2]['truth_flank']


In [None]:
# Open the mock reference used by the tests and print the chr1 sequence fetched
# with start=0 and end=101001. The `with` block closes the FASTA file afterwards.
# NOTE(review): this prints the entire fetched sequence into the cell output;
# consider printing only its length or a short prefix.
import pysam
with pysam.FastaFile("../test/mock_reference.fasta") as ref_genome:
    # Fetch the sequence
    sequence = ref_genome.fetch("chr1", 0, 101001)
    print(sequence)

In [None]:
import pysam

In [None]:
def read_vcf_pysam(file_path):
    """Read every record of a VCF file into a list of dictionaries.

    Args:
        file_path: Path of the VCF file, passed unchanged to ``pysam.VariantFile``.

    Returns:
        A list with one dictionary per record, in file order, with the keys
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER' (the result of
        ``record.filter.keys()``), 'INFO' (``record.info`` copied into a plain
        dict) and 'FORMAT' (``record.format`` as returned by pysam, not copied).
    """
    # The context manager closes the file handle once all records have been
    # read, or if reading raises part-way through.
    with pysam.VariantFile(file_path) as vcf_file:
        return [
            {
                'CHROM': record.chrom,
                'POS': record.pos,
                'ID': record.id,
                'REF': record.ref,
                'ALT': record.alts,
                'QUAL': record.qual,
                'FILTER': record.filter.keys(),
                'INFO': dict(record.info),
                'FORMAT': record.format
            }
            for record in vcf_file
        ]


In [None]:
# For every record whose only filter is PASS, print its id, SVTYPE, the first
# SVCLAIM value and the genotype of the first sample.
# The `with` block closes the VCF handle when the loop finishes or raises.
with pysam.VariantFile("../data/NA24385.cnv_sv.vcf.gz") as variants:
    for record in variants:
        if record.filter.keys() == ['PASS']:
            print(record.id, record.info["SVTYPE"], record.info["SVCLAIM"][0], record.samples[0]["GT"])

In [6]:
# Import the cnv_evaluator module and reload it so that edits made to the module
# on disk are picked up without restarting the kernel.
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# first cell so a fresh "Run All" shows every dependency up front.
import importlib
import T2T_ACE.cnv_evaluator as ce
importlib.reload(ce)

<module 'T2T_ACE.cnv_evaluator' from '/Users/fleharty/T2T-ACE/T2T_ACE/cnv_evaluator.py'>

In [7]:
# Build a CNVEvaluator from the hg38, HG002-maternal and HG002-paternal reference
# paths; the logged output shows each of the three references being loaded.
cnv_evaluator = ce.CNVEvaluator(ReferenceLocations.hg38, ReferenceLocations.hg002t2tmat, ReferenceLocations.hg002t2tpat)

INFO:root:Loading reference from: /Users/fleharty/resources/Homo_sapiens_assembly38.chr22.fasta.gz
INFO:root:Loading reference from: /Users/fleharty/resources/hg002mat_v1.0.chr22.fasta.gz
INFO:root:Loading reference from: /Users/fleharty/resources/hg002pat_v1.0.chr22.fasta.gz


In [8]:
# Align the chr22:17289461-17298218 call (type "DEL", genotype "0/1") for
# evaluation; the returned object is inspected attribute by attribute below.
alignments = cnv_evaluator.align_cnv_for_evaluation("chr22:17289461-17298218", "DEL", "0/1")

In [9]:
# Convert the first maternal left-flank hit into an interval string
# (recorded output: 'chr22_MATERNAL:19802186-19802686').
au.extract_interval_from_hit(alignments.left_flank_maternal_hits[0])

'chr22_MATERNAL:19802186-19802686'

In [10]:
# Convert the first paternal left-flank hit into an interval string
# (recorded output: 'chr22_PATERNAL:15909633-15910133').
au.extract_interval_from_hit(alignments.left_flank_paternal_hits[0])


'chr22_PATERNAL:15909633-15910133'

In [11]:
# Show the maternal right-flank hits (recorded output: a list with one mappy.Alignment).
alignments.right_flank_maternal_hits

[<mappy.Alignment at 0x139b698c0>]

In [12]:
# Show the first maternal left-flank hit (recorded output: a mappy.Alignment object).
alignments.left_flank_maternal_hits[0]

<mappy.Alignment at 0x139b69700>

In [13]:
# Show the hits against the calling reference (recorded output: a list with one mappy.Alignment).
alignments.calling_reference_hits

[<mappy.Alignment at 0x139b6a5e0>]

In [14]:
# Show the genotype stored on the alignment result (recorded output: '0/1').
alignments.genotype

'0/1'

In [15]:
# Show the interval stored on the alignment result (recorded output: 'chr22:17289461-17298218').
alignments.location

'chr22:17289461-17298218'

In [16]:
# Show the hits against the calling reference.
# NOTE(review): this repeats an earlier inspection cell with identical output;
# candidate for removal.
alignments.calling_reference_hits

[<mappy.Alignment at 0x139b6a5e0>]