In [1]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeable
Collecting biopython
  Downloading biopython-1.81-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.7 MB/s eta 0:00:01
Installing collected packages: biopython
Successfully installed biopython-1.81
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
import os
import pandas as pd
import itertools
import argparse
from Bio.Seq import Seq
from Bio import SeqIO
from alternating_string import is_alternating
from tailor_match import filter_tails

# Location of the C. elegans WS279 genomic FASTA. Set the CELEGANS_FASTA
# environment variable to point at a local copy; when it is unset, the original
# cluster path is used, so existing runs behave exactly as before.
fasta = os.environ.get(
    "CELEGANS_FASTA",
    "/fs/ess/PCON0160/ben/genomes/c_elegans/WS279/c_elegans.PRJNA13758.WS279.genomic.fa",
)
# Read every FASTA record into memory as a dict; this is the `records`
# argument passed to filter_tails in the cells below.
records = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

In [2]:
# Reference case taken from sequencing data: single-nucleotide "T" tail on the
# plus strand. Expected result: True.
filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAGTT",
    tail="T",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)

In [4]:
# Negative control: the four nucleotides preceding the 1-nt "T" tail are all the
# same character ("AAAA"), so the read is expected to be rejected (False).
homopolymer_result = filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAAAT",
    tail="T",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)
# Make the expectation executable so a Restart & Run All fails loudly if the
# filter's behavior changes, then display the value as before.
assert not homopolymer_result, "read with AAAA before the tail should be rejected"
homopolymer_result

Sequence: TAAGGAGTGTTTGCAACAAAAAAAAAAT
Tail: T
Last 4 nucleotides: AAAA
Last 4 nucleodies are the same (AAAA)....fail


False

In [5]:
# Two-nucleotide tail "TT" on the same locus; expected to pass (True).
tt_tail_result = filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAGTTT",
    tail="TT",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)
# Make the expectation executable so a Restart & Run All fails loudly if the
# filter's behavior changes, then display the value as before.
assert tt_tail_result, "TT tail should be accepted"
tt_tail_result

Sequence: TAAGGAGTGTTTGCAACAAAAAAAAGTTT
Tail: TT
Last 4 nucleotides: AAGT
TAAGGAGTGTTTGCAACAAAAAAAAGTAG
Edit distance == tail length..pass


True

In [6]:
# Two-nucleotide tail "TG" on the same locus; expected to be rejected (False).
tg_tail_result = filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAGTTG",
    tail="TG",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)
# Make the expectation executable so a Restart & Run All fails loudly if the
# filter's behavior changes, then display the value as before.
assert not tg_tail_result, "TG tail should be rejected"
tg_tail_result

Sequence: TAAGGAGTGTTTGCAACAAAAAAAAGTTG
Tail: TG
Last 4 nucleotides: AAGT
TAAGGAGTGTTTGCAACAAAAAAAAGTAG
Edit distance does not equal tail length: TAAGGAGTGTTTGCAACAAAAAAAAGTTG vs. TAAGGAGTGTTTGCAACAAAAAAAAGTAG with tail TG
check for alternating tail...
Other tail not meeting some specification...fail


False

In [7]:
# Three-nucleotide tail "TGT"; expected result: True.
# NOTE(review): this sequence ends in "TTT", not "TGT" (it is the same sequence
# used for the "TT" case) -- the next cell repeats the check with a sequence
# that does end in "TGT". Confirm whether this cell is still needed.
filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAGTTT",
    tail="TGT",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)

Sequence: TAAGGAGTGTTTGCAACAAAAAAAAGTTT
Tail: TGT
Last 4 nucleotides: AAAG
TAAGGAGTGTTTGCAACAAAAAAAAGTAGG
Edit distance does not equal tail length: TAAGGAGTGTTTGCAACAAAAAAAAGTTT vs. TAAGGAGTGTTTGCAACAAAAAAAAGTAGG with tail TGT
check for alternating tail...
Tail is alternating...pass


True

In [8]:
# Three-nucleotide tail "TGT", with the sequence actually ending in "TGT";
# expected to pass (True).
tgt_tail_result = filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAGTTGT",
    tail="TGT",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)
# Make the expectation executable so a Restart & Run All fails loudly if the
# filter's behavior changes, then display the value as before.
assert tgt_tail_result, "TGT tail should be accepted"
tgt_tail_result

Sequence: TAAGGAGTGTTTGCAACAAAAAAAAGTTGT
Tail: TGT
Last 4 nucleotides: AAGT
TAAGGAGTGTTTGCAACAAAAAAAAGTAGG
Edit distance does not equal tail length: TAAGGAGTGTTTGCAACAAAAAAAAGTTGT vs. TAAGGAGTGTTTGCAACAAAAAAAAGTAGG with tail TGT
check for alternating tail...
Tail is alternating...pass


True

In [9]:
# Four-nucleotide tail "TGTG" (alternating T/G); expected to pass (True).
tgtg_tail_result = filter_tails(
    sequence="TAAGGAGTGTTTGCAACAAAAAAAAGTTGTG",
    tail="TGTG",
    chrom="I",
    start=31522,
    end=31549,
    strand="+",
    records=records,
    N=4,
)
# Make the expectation executable so a Restart & Run All fails loudly if the
# filter's behavior changes, then display the value as before.
assert tgtg_tail_result, "TGTG tail should be accepted"
tgtg_tail_result

Sequence: TAAGGAGTGTTTGCAACAAAAAAAAGTTGTG
Tail: TGTG
Last 4 nucleotides: AAGT
TAAGGAGTGTTTGCAACAAAAAAAAGTAGGT
Tail is alternating...pass


True

In [None]:
# Minus-strand examples: the cells below run filter_tails with strand = "-".

In [1]:
import sys
import os
import pandas as pd
import itertools
import argparse
from Bio.Seq import Seq
from Bio import SeqIO
from alternating_string import is_alternating
from tailor_match import filter_tails

# Location of the C. elegans WS279 genomic FASTA. Set the CELEGANS_FASTA
# environment variable to point at a local copy; when it is unset, the original
# cluster path is used, so existing runs behave exactly as before.
fasta = os.environ.get(
    "CELEGANS_FASTA",
    "/fs/ess/PCON0160/ben/genomes/c_elegans/WS279/c_elegans.PRJNA13758.WS279.genomic.fa",
)
# Read every FASTA record into memory as a dict; this is the `records`
# argument passed to filter_tails in the cells below.
records = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

In [2]:
# Minus-strand read with a single-nucleotide "G" tail. The call arguments are
# taken from the record below (chrom, start, end, sequence:tail, ..., strand):
# I       272600  272623  TACAATGATGATGATGAGGATGTG:G      1.0     -       sense   piRNA   none    piRNA   1       21      WBGene00169222  C53D5.8 21ur-15400      0.04746753335871926
filter_tails(
    sequence="TACAATGATGATGATGAGGATGTG",
    tail="G",
    chrom="I",
    start=272600,
    end=272623,
    strand="-",
    records=records,
    N=4,
)


Sequence: TACAATGATGATGATGAGGATGTG
Tail: G
Last 4 nucleotides: ATGT
Print tail length is 1...pass


True

In [3]:
# Same minus-strand locus, now with a three-nucleotide "GGC" tail appended.
filter_tails(
    sequence="TACAATGATGATGATGAGGATGTGGC",
    tail="GGC",
    chrom="I",
    start=272600,
    end=272623,
    strand="-",
    records=records,
    N=4,
)

Sequence: TACAATGATGATGATGAGGATGTGGC
Tail: GGC
Last 4 nucleotides: ATGT
TACAATGATGATGATGAGGATGTATG
Edit distance == tail length TACAATGATGATGATGAGGATGTGGC (seq) vs. TACAATGATGATGATGAGGATGTATG (ref) with tail GGC..pass


True

In [5]:
# Same minus-strand locus with an eight-nucleotide alternating G/T tail
# ("GTGTGTGT").
filter_tails(
    sequence="TACAATGATGATGATGAGGATGTGTGTGTGT",
    tail="GTGTGTGT",
    chrom="I",
    start=272600,
    end=272623,
    strand="-",
    records=records,
    N=4,
)

Sequence: TACAATGATGATGATGAGGATGTGTGTGTGT
Tail: GTGTGTGT
Last 4 nucleotides: ATGT
TACAATGATGATGATGAGGATGTATGTGCAA
Tail is alternating...pass


True