In [62]:
from pathlib import Path

import numpy as np
import pandas as pd
from Bio import SeqIO

In [63]:
# Input FASTA files (paths are relative to the notebook's working directory)
targets = './gRNA_sequences/targets.fasta'  # target sequences
input_file = './gRNA_sequences/benchling_export.fasta'  # gRNA sequences

In [64]:
# Read the target sequences with Biopython's FASTA parser
records_targets = list(SeqIO.parse(targets, "fasta"))

# Fail early with a clear message: the following cells index records_targets[0]
# and compare every target position by position, so the list must be non-empty
# and all sequences must share one length.
if not records_targets:
    raise ValueError("No sequences found in {}".format(targets))
if len({len(record.seq) for record in records_targets}) != 1:
    raise ValueError(
        "Target sequences in {} do not all have the same length".format(targets)
    )

Use a greedy method to find a sequence with a large Hamming distance to all 127 target sequences.

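For each sequence position, use the nucleotide that occurs least often among the target sequences at that position, i.e. the one with the highest average Hamming distance to the targets at that position. This is not guaranteed to be a global optimum (in particular, it does not directly maximise the minimum distance to any single target), but for a sufficiently long sequence (60 bases here) it should be good enough.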

In [65]:
bases = ['a', 'c', 'g', 't']

# Lower-case every target once: `bases` is lower-case, so an upper-case FASTA
# would otherwise match nothing and silently yield 'a' at every position.
target_seqs = [str(record.seq).lower() for record in records_targets]
seq_length = len(records_targets[0].seq)

maxHamSeq = []
for position in range(seq_length):
    # Bases observed at this position across all target sequences
    column = [target[position] for target in target_seqs]
    # Number of appearances of each candidate base, in the order of `bases`
    bases_count = [column.count(base) for base in bases]

    # Use the base that is least frequent among the targets at this position;
    # np.argmin returns the first minimum, so ties go to the earliest base in `bases`.
    maxHamSeq.append(bases[int(np.argmin(bases_count))])

    

In [66]:
# Hamming distance from the max-Hamming sequence to every target sequence.
# Targets are lower-cased because maxHamSeq is built from lower-case bases.
n_positions = len(records_targets[0].seq)
hams = np.zeros(len(records_targets))
for idx, record in enumerate(records_targets):
    target = str(record.seq).lower()
    # Count the positions where this target differs from the max-Hamming sequence
    hams[idx] = sum(target[pos] != maxHamSeq[pos] for pos in range(n_positions))

# Average the exact per-target counts instead of adding 1/N once per mismatch,
# which accumulates floating-point rounding error over thousands of additions.
averageHam = float(hams.mean())
print('Minimum hamming distance: {}'.format(np.min(hams)))
print('Average hamming distance: {}'.format(averageHam))



Minimum hamming distance: 40.0
Average hamming distance: 48.75590551181077


In [67]:
# Convert the per-position bases into a flat 1-D NumPy array and display it
maxHamSeq = np.ravel(np.array(maxHamSeq))
maxHamSeq

array(['g', 't', 'c', 'a', 'g', 't', 'g', 'g', 'g', 'g', 'a', 'a', 'c',
       'c', 'a', 'a', 'a', 'c', 'g', 'a', 'a', 'g', 'g', 'a', 'c', 'a',
       'c', 'c', 'a', 'g', 't', 'c', 'g', 'g', 'c', 'c', 'g', 'g', 'c',
       'g', 'c', 'g', 'c', 'g', 'c', 'a', 'g', 'c', 'c', 'c', 'g', 'g',
       'c', 'g', 'g', 'g', 'c', 'g', 'c', 'g'], dtype='<U1')

In [68]:
# Collapse the per-position bases into a single string
seq = ''.join(maxHamSeq)
print('Max Ham Sequence: ' + seq)

# One-row table holding the resulting sequence
df = pd.DataFrame({'Maximum Hamming Distance sequence': seq}, index=[0])

Max Ham Sequence: gtcagtggggaaccaaacgaaggacaccagtcggccggcgcgcgcagcccggcgggcgcg


In [69]:
# Create the output directory first so the export also works on a fresh
# checkout where ./Outputs does not exist yet (to_csv does not create it).
output_path = Path('./Outputs/maxHammingDistanceSeq.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)