In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy
from guide.dataset import GuideDataset
from collections import defaultdict

# Validating Bowtie Results

We want to confirm that the results we're getting from Bowtie make sense.

Since each gRNA row has a `gene_name`, we can check whether gRNAs targeting the same gene (1) appear on the same chromosome and, if so, whether they (2) are close to one another on that chromosome.

In [2]:
# Build the guide dataset from the TSV export and keep a handle on its
# datapoints; each datapoint exposes `.row` and `.bowtie_result()` (used below).
dataset = GuideDataset(
    'data/example_guide_data_with_bowtie_with_mfold.tsv'
)
points = dataset.points

In [3]:
# Group datapoints by target gene. `genes` records each gene name the first
# time it is seen, so its order is stable across kernel restarts (iterating a
# set of strings is not, because string hashes are randomized per process).
points_by_gene = defaultdict(list)
genes = []
for p in points:
    gene_name = p.row['gene_name']
    # Membership test on a defaultdict does not create the key.
    if gene_name not in points_by_gene:
        genes.append(gene_name)
    points_by_gene[gene_name].append(p)

# Sanity check: both counts are the number of distinct gene names.
print(len(points_by_gene))
print(len(genes))

17419
17419


In [4]:
# Spot-check: show the datapoints grouped under the first entry of `genes`.
list(points_by_gene[genes[0]])

[<guide.datapoint.GuideDatapoint at 0x108a1c860>,
 <guide.datapoint.GuideDatapoint at 0x108a1c908>,
 <guide.datapoint.GuideDatapoint at 0x108a1c9b0>]

In [46]:
# Result buckets filled by the loop below:
#   perfect_genes - [gene, [std, mean]] for genes whose exact matches share one chromosome
#   flawed_genes  - names of genes whose exact matches span several chromosomes
perfect_genes, flawed_genes = [], []

def bowties(points):
    """Return the Bowtie result of every point that has an exact match.

    ``bowtie_result()`` is called exactly once per point; results whose
    ``exact_match()`` is falsy are dropped. Output order follows ``points``.
    """
    results = (p.bowtie_result() for p in points)
    return [result for result in results if result.exact_match()]
def chromosomes(points):
    """Return ``chromosome()`` of each item, in input order.

    Despite the parameter name, the loop in this cell passes the Bowtie
    results produced by ``bowties`` rather than datapoints.
    """
    names = []
    for bowtie in points:
        names.append(bowtie.chromosome())
    return names

# Genes with no exact-match gRNA at all; kept so they are not silently dropped.
unmatched_genes = []

# Classify each gene by how many distinct chromosomes its exact matches hit.
for gene in genes:
    # Loop-local name on purpose: rebinding `points` here would clobber the
    # notebook-level list of all datapoints and break re-running earlier cells.
    gene_points = points_by_gene[gene]
    _bowties = bowties(gene_points)
    chromes = chromosomes(_bowties)
    distinct_chromes = set(chromes)  # built once, reused by both branches
    if len(distinct_chromes) == 1:
        # All exact matches share a chromosome: summarise how tightly the
        # match positions cluster (std) and where (mean).
        indexes = [b.exact_match().index for b in _bowties]
        sigma, mean = numpy.std(indexes), numpy.mean(indexes)
        perfect_genes.append([gene, [sigma, mean]])
    elif len(distinct_chromes) > 1:
        flawed_genes.append(gene)
    else:
        # Empty set: none of this gene's gRNAs had an exact match.
        unmatched_genes.append(gene)

In [47]:
# Number of genes whose exact-match gRNAs all share a single chromosome.
len(perfect_genes)

17385

In [48]:
# Number of genes whose exact-match gRNAs span more than one chromosome.
len(flawed_genes)

2

In [55]:
# Preview the first 30 entries; each is [gene, [std, mean]] of the exact-match `index` values.
perfect_genes[0:30]

[['CCDC126', [14847.858925335553, 23632508.0]],
 ['KCNMB2', [1301.1986034251481, 178826966.16666666]],
 ['NAV3', [54432.505662976786, 77885962.5]],
 ['C14orf39', [556.39214787973253, 60484123.333333336]],
 ['RBM12B', [799.0, 93735522.0]],
 ['S100A5', [1205.2778465980366, 153538040.75]],
 ['TIGD3', [344.5, 65356373.5]],
 ['XKRX', [4031.4593744077342, 100918857.25]],
 ['GEMIN4', [2439.7598697868243, 750402.33333333337]],
 ['POP5', [905.33043000884493, 120580437.25]],
 ['DCTN1', [2701.148126260387, 74376098.0]],
 ['DNAL1', [3013.8645768348651, 73657676.75]],
 ['FERMT1', [1680.4487495904182, 6114209.0]],
 ['GDI2', [5484.2103250047667, 5789056.333333333]],
 ['KIAA1430', [5930.3652237868118, 185170027.75]],
 ['KLK8', [2347.0813554048118, 50997810.333333336]],
 ['NRSN1', [5595.2509494659844, 24139983.75]],
 ['SCLT1', [16595.510964941695, 129072738.25]],
 ['SPP2', [776.565032692047, 234059660.5]],
 ['PARP8', [7851.5404946347244, 50786443.166666664]],
 ['RAB11FIP5', [398.56158035281493, 7308867

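For the most part, the Bowtie search results for gRNAs targeting the same gene make sense! Of the 17,419 genes, 17,385 have all of their gRNA exact matches on a single chromosome and only 2 have exact matches spread across more than one chromosome; the remaining 32 genes have no exact-match gRNAs at all, so they appear in neither list. Within a chromosome the exact matches are also fairly closely clustered (in the sample above, the standard deviation of the match positions is tiny compared with the mean position), which is what we would expect if the gRNAs are truly targeting the same gene.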