# Modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def loadCoronaVirusDNA(filename):
    """Read a FASTA file and return its identifier and sequence.

    Lines starting with '>' are treated as header lines: the returned name is
    the first whitespace-delimited token of the header with the leading '>'
    removed. Every other line has its trailing whitespace stripped and is
    appended to the sequence.

    If the file contains several records, their sequences are concatenated
    and the name of the last header is returned.

    Parameters
    ----------
    filename : str
        Path of the FASTA file to read.

    Returns
    -------
    tuple of (str, str)
        ``(name, dna)``. ``name`` is '' when the file has no header line.
    """
    name = ''  # stays empty for header-less files instead of being unbound
    chunks = []
    # The context manager closes the handle even if reading fails part-way.
    with open(filename) as f:
        for line in f:
            if line.startswith('>'):
                name = line.split()[0][1:]
            else:
                chunks.append(line.rstrip())
    return name, ''.join(chunks)

# Virus Data

## 1. Wuhan 2019 nCov: 
Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
GenBank: MN908947.3

https://www.ncbi.nlm.nih.gov/nuccore/MN908947

In [3]:
coronavirusname, coronavirusdna = loadCoronaVirusDNA('Virus/MN908947_wuhan.fa')

## 2. SARS coronavirus GZ02, complete genome
GenBank: AY390556.1

https://www.ncbi.nlm.nih.gov/nuccore/AY390556.1

In [4]:
sarscoronavirusname, sarscoronavirusdna = loadCoronaVirusDNA('Virus/AY390556.1_2003_SARS.fa')

## 3. Human immunodeficiency virus 1, complete genome

NCBI Reference Sequence: NC_001802.1

https://www.ncbi.nlm.nih.gov/nuccore/NC_001802.1

In [5]:
hivname, hivdna = loadCoronaVirusDNA('Virus/NC_001802.1_HIV.fa')

# Verification

In [6]:
coronavirusname

'MN908947.3'

In [7]:
hivname

'NC_001802.1'

In [8]:
len(coronavirusdna)

29903

In [9]:
len(sarscoronavirusdna)

29760

In [10]:
len(hivdna)

9181

In [11]:
def reverseComplement(sequence):
    """Return the reverse complement of a DNA sequence.

    Parameters
    ----------
    sequence : str
        Sequence made of the upper-case symbols A, C, G, T and N.

    Returns
    -------
    str
        The complemented sequence read in reverse order ('' for '').

    Raises
    ------
    KeyError
        If the sequence contains any symbol other than A, C, G, T or N
        (lower-case letters included).
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    # One join over the reversed input; prepending to a string inside a loop
    # copies the partial result on every step.
    return ''.join(complement[base] for base in reversed(sequence))

In [12]:
def _exact_offsets(p, t):
    """Return every offset at which p occurs exactly in t (naive scan)."""
    width = len(p)
    return [i for i in range(len(t) - width + 1) if t[i:i + width] == p]


def naive_with_rc(p, t):
    """Find exact occurrences of a pattern and of its reverse complement.

    Parameters
    ----------
    p : str
        Pattern to search for.
    t : str
        Text (e.g. a genome) to search in.

    Returns
    -------
    tuple of (list of int, list of int)
        ``(offsets of p, offsets of reverseComplement(p))``, each in
        increasing order. The combined number of hits is the sum of the two
        list lengths.

    Notes
    -----
    * When p equals its own reverse complement the two lists are identical,
      so summing their lengths counts every occurrence twice.
    * An empty pattern matches at every offset ``0..len(t)``; it used to
      fail with UnboundLocalError because the match flag was only assigned
      inside the per-character loop.
    """
    # Scan for the pattern first, then for its reverse complement, so an
    # invalid base in p still surfaces as a KeyError from reverseComplement.
    return _exact_offsets(p, t), _exact_offsets(reverseComplement(p), t)

### Question 1
How many times does AGGT or its reverse complement (ACCT) occur in the coronavirus genome? E.g. if AGGT occurs 10 times and ACCT occurs 12 times, you should report 22.
Answer: 

In [13]:
appearances = naive_with_rc('AGGT', coronavirusdna)

In [14]:
len(appearances[0]) + len(appearances[1])

265

In [15]:
appearances

([5,
  21,
  234,
  258,
  329,
  374,
  623,
  1320,
  1557,
  1572,
  1587,
  1596,
  1740,
  1818,
  2070,
  2175,
  2379,
  2729,
  2766,
  3063,
  3132,
  3284,
  3416,
  3471,
  3636,
  4833,
  4854,
  4970,
  5106,
  5197,
  5430,
  5439,
  5607,
  5844,
  6530,
  6615,
  6984,
  7014,
  7270,
  7558,
  7584,
  7863,
  8369,
  9084,
  9174,
  9460,
  9996,
  10322,
  10587,
  10702,
  10956,
  11499,
  11532,
  12563,
  12795,
  12799,
  12904,
  12927,
  12981,
  13302,
  13395,
  13847,
  13973,
  14120,
  14274,
  14492,
  15470,
  15758,
  16179,
  16394,
  16433,
  16841,
  16871,
  17047,
  17531,
  18076,
  18146,
  18365,
  18377,
  18404,
  18788,
  18870,
  18970,
  19475,
  19822,
  20054,
  20066,
  20084,
  20354,
  20474,
  20566,
  20897,
  21149,
  21449,
  21488,
  21793,
  22254,
  22270,
  22329,
  22770,
  22892,
  23013,
  23208,
  23381,
  23475,
  23565,
  24004,
  24231,
  24274,
  24544,
  24837,
  25059,
  25124,
  25215,
  25442,
  25904,
  26270,
  26

### Question 2
As we will discuss, sometimes we would like to find **approximate matches** for P in T. That is, we want to find occurrences with one or more differences.

For Questions 5 and 6, make a new version of the naive function called naive_2mm that allows up to 2 mismatches per occurrence. Unlike for the previous questions, do not consider the reverse complement here. We're looking for approximate matches for P itself, not its reverse complement.

For example, ACTTTA occurs twice in ACTTACTTGATAAAGT, once at offset 0 with 2 mismatches, and once at offset 4 with 1 mismatch. So naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT') should return the list [0, 4].

Hint: See this notebook for a few examples you can use to test your naive_2mm function.

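How many times does TTCAAGCC occur in the coronavirus genome when allowing up to 2 mismatches? (The original exercise asked about the Lambda virus genome; the cell below searches the coronavirus genome instead.)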

In [16]:
def naive_2mm(p, t, max_mismatches=2):
    """Find approximate occurrences of p in t by naive scanning.

    The reverse complement is not considered; only p itself is matched.

    Parameters
    ----------
    p : str
        Pattern to search for.
    t : str
        Text to search in.
    max_mismatches : int, optional
        Largest number of mismatching positions allowed per occurrence.
        Defaults to 2, the behaviour the function name refers to.

    Returns
    -------
    list of int
        Offsets i, in increasing order, at which ``t[i:i + len(p)]`` differs
        from p in at most ``max_mismatches`` positions. Empty when p is
        longer than t.
    """
    occurrences = []

    for i in range(len(t) - len(p) + 1):
        mismatches = 0
        for j, expected in enumerate(p):
            if t[i + j] != expected:
                mismatches += 1
                if mismatches > max_mismatches:
                    break  # budget exceeded; the rest of the window is irrelevant
        if mismatches <= max_mismatches:
            occurrences.append(i)

    return occurrences

In [17]:
# Offsets in the coronavirus genome where TTCAAGCC matches with at most 2 mismatches.
appearances = naive_2mm('TTCAAGCC', coronavirusdna)
# Only the last expression of a cell is displayed, so the bare `appearances[0]`
# that used to sit here showed nothing; the cell reports the hit count.
len(appearances)

119

In [18]:
# Genetic code for DNA codons, grouped by amino acid.
# Each row is ((full name, three-letter code, one-letter code), codons).
# The full names carry no trailing whitespace: 'Cysteine ', 'Glycine ' and
# 'Histidine ' used to leak a stray space into the hyphen-joined names
# produced by translate().
# Stop codons keep the three-character marker '___' as their "one-letter"
# code, so a translated string containing a stop is longer than its codon count.
_CODON_GROUPS = (
    (('Isoleucine', 'Ile', 'I'), ('ATT', 'ATC', 'ATA')),
    (('Leucine', 'Leu', 'L'), ('CTT', 'CTC', 'CTA', 'CTG', 'TTA', 'TTG')),
    (('Valine', 'Val', 'V'), ('GTT', 'GTC', 'GTA', 'GTG')),
    (('Phenylalanine', 'Phe', 'F'), ('TTT', 'TTC')),
    (('Methionine', 'Met', 'M'), ('ATG',)),
    (('Cysteine', 'Cys', 'C'), ('TGT', 'TGC')),
    (('Alanine', 'Ala', 'A'), ('GCT', 'GCC', 'GCA', 'GCG')),
    (('Glycine', 'Gly', 'G'), ('GGT', 'GGC', 'GGA', 'GGG')),
    (('Proline', 'Pro', 'P'), ('CCT', 'CCC', 'CCA', 'CCG')),
    (('Threonine', 'Thr', 'T'), ('ACT', 'ACC', 'ACA', 'ACG')),
    (('Serine', 'Ser', 'S'), ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC')),
    (('Tyrosine', 'Tyr', 'Y'), ('TAT', 'TAC')),
    (('Tryptophan', 'Trp', 'W'), ('TGG',)),
    (('Glutamine', 'Gln', 'Q'), ('CAA', 'CAG')),
    (('Asparagine', 'Asn', 'N'), ('AAT', 'AAC')),
    (('Histidine', 'His', 'H'), ('CAT', 'CAC')),
    (('Glutamic acid', 'Glu', 'E'), ('GAA', 'GAG')),
    (('Aspartic acid', 'Asp', 'D'), ('GAT', 'GAC')),
    (('Lysine', 'Lys', 'K'), ('AAA', 'AAG')),
    (('Arginine', 'Arg', 'R'), ('CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG')),
    (('Stop', 'Stop', '___'), ('TAA', 'TAG', 'TGA')),
)

# codon[triplet] -> (full name, three-letter code, one-letter code)
codon = {
    triplet: names
    for names, triplets in _CODON_GROUPS
    for triplet in triplets
}

In [19]:
def translate(dna):
    """Translate a DNA string codon by codon using the global ``codon`` table.

    The input is upper-cased and read in frame 0, three bases at a time. A
    trailing remainder of one or two bases is ignored.

    Parameters
    ----------
    dna : str
        DNA sequence to translate.

    Returns
    -------
    tuple of (str, str, str)
        ``(full names joined by '-', three-letter codes joined by '-',
        one-letter codes concatenated)``. All three are '' when dna holds
        fewer than three bases.

    Raises
    ------
    KeyError
        If a complete codon is not a key of ``codon``.
    """
    upper_dna = dna.upper()
    # Start offset of every complete codon in reading frame 0.
    starts = range(0, len(dna) - 2, 3)
    entries = [codon[upper_dna[k:k + 3]] for k in starts]

    full_names = '-'.join(entry[0] for entry in entries)
    short_names = '-'.join(entry[1] for entry in entries)
    letters = ''.join(entry[2] for entry in entries)
    return full_names, short_names, letters

In [20]:
seq = 'ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC'

In [21]:
result = translate(seq)

In [22]:
result

('Isoleucine-Alanine-Serine-Phenylalanine-Arginine-Leucine-Phenylalanine-Alanine-Arginine-Threonine-Arginine-Serine-Methionine-Tryptophan-Serine-Phenylalanine-Asparagine-Proline-Glutamic acid-Threonine-Asparagine-Isoleucine-Leucine',
 'Ile-Ala-Ser-Phe-Arg-Leu-Phe-Ala-Arg-Thr-Arg-Ser-Met-Trp-Ser-Phe-Asn-Pro-Glu-Thr-Asn-Ile-Leu',
 'IASFRLFARTRSMWSFNPETNIL')

In [23]:
# 0-based slice of the genome: bases 26523..27190 in 1-based coordinates (668 nt).
# NOTE(review): the notebook does not record which gene these coordinates
# belong to — presumably the membrane (M) protein CDS; confirm against the
# GenBank record.
protein_tuple = translate(coronavirusdna[26522:27190])

In [24]:
protein_tuple[2]

'MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFLWLLWPVTLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRLFARTRSMWSFNPETNILLNVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCDIKDLPKEITVATSRTLSYYKLGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIALLVQ'

In [26]:
coronavirusdna[21562:25383]

'ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACA

# Protein

## 1. Spike Glycoprotein of 2019 nCov

gene            21563..25384
                     /gene="S"
                     
CDS             21563..25384
                     /gene="S"
                     /note="structural protein"
                     /codon_start=1
                     /product="surface glycoprotein"
                     /protein_id="QHD43416.1"

In [27]:
# The S CDS is annotated as 21563..25384 (1-based, inclusive). This 0-based
# slice covers bases 21563..25383 (3821 nt), one base short of the annotated
# end. translate() ignores the 2 bases left over after the last complete
# codon, so the final codon of the annotated CDS is not translated and the
# result holds 1273 residues.
# NOTE(review): presumably intentional, to keep the stop codon out of the
# protein string — confirm.
spike_dna = coronavirusdna[21562:25383]
spike_protein_tuple = translate(spike_dna)
# Third element: the one-letter amino-acid string.
spike_protein_tuple[2]

'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITG

In [28]:
spike_protein = spike_protein_tuple[2]

In [29]:
spike_protein

'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITG

In [30]:
len(spike_protein)

1273

In [31]:
insert1 = spike_protein[71:78]
insert1

'GTNGTKR'

In [49]:
insert2 = spike_protein[143:151]
insert2

'YYHKNNKS'

In [46]:
insert3 = spike_protein[251:257]
insert3

'GDSSSG'

In [52]:
insert4 = spike_protein[676:684]
insert4

'QTNSPRRA'

## 2. Spike Protein of 2003 SARS Coronavirus

gene            21492..25259
                     /gene="S"
                     
CDS             21492..25259
                     /gene="S"
                     /note="putative"
                     /codon_start=1
                     /product="spike glycoprotein"
                     /protein_id="AAS00003.1"

In [34]:
# The S CDS is annotated as 21492..25259 (1-based, inclusive). This 0-based
# slice covers bases 21492..25258 (3767 nt), one base short of the annotated
# end. translate() ignores the 2 bases left over after the last complete
# codon, so the final codon of the annotated CDS is not translated and the
# result holds 1255 residues.
# NOTE(review): presumably intentional, to keep the stop codon out of the
# protein string — confirm.
sars_spike_dna = sarscoronavirusdna[21491:25258]
sars_spike_protein_tuple = translate(sars_spike_dna)
# Third element: the one-letter amino-acid string.
sars_spike_protein_tuple[2]

'MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEIFRSDTLYLTQDLFLPFYSNVTGFHTINHTFDNPVIPFKDGIYFAATEKSNVVRGWVFGSTMNNKSQSVIIINNSTNVVIRACNFELCDNPFFAVSKPMGTQTHTMIFDNAFNCTFEYISDAFSLDVSEKSGNFKHLREFVFKNKDGFLYVYKGYQPIDVVRDLPSGFNTLKPIFKLPLGINITNFRAILTAFLPAQDTWGTSAAAYFVGYLKPTTFMLKYDENGTITDAVDCSQNPLAELKCSVKSFEIDKGIYQTSNFRVVPSRDVVRFPNITNLCPFGEVFNATKFPSVYAWERKRISNCVADYSVLYNSTFFSTFKCYGVSATKLNDLCFSNVYADSFVVKGDDVRQIAPGQTGVIADYNYKLPDDFMGCVLAWNTRNIDATSTGNYNYKYRYLRHGKLRPFERDISNVPFSPDGKPCTPPALNCYWPLNDYGFYTTTGIGYQPYRVVVLSFELLNAPATVCGPKLSTDLIKNQCVNFNFNGLTGTGVLTPSSKRFQPFQQFGRDVSDFTDSVRDPKTSEILDISPCSFGGVSVITPGTNASSEVAVLYQDVNCTDVSTAIHADQLTPAWRIYSTGNNVFQTQAGCLIGAEHVDTSYECDIPIGAGICASYHTVSLLRSTSQKSIVAYTMSLGADSSIAYSNNTIAIPTNFSISITTEVMPVSMAKTSVDCNMYICGDSTECANLLLQYGSFCTQLNRALSGIAAEQDRNTREVFAQVKQMYKTPTLKDFGGFNFSQILPDPLKPTKRSFIEDLLFNKVTLADAGFMKQYGECLGDINARDLICAQKFNGLTVLPPLLTDDMIAAYTAALVSGTATAGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAE

In [35]:
sars_spike_protein = sars_spike_protein_tuple[2]
len(sars_spike_protein)

1255

In [68]:
sars_spike_protein[660:670]

'HTVSLLRSTS'

In [69]:
beInserted1 = sars_spike_protein[70:78]
beInserted2 = sars_spike_protein[136:145]
beInserted3 = sars_spike_protein[236:253]
beInserted4 = sars_spike_protein[660:670]

In [70]:
print(beInserted1, beInserted2, beInserted3, beInserted4)

TINHTFDN FFAVSKPMG AFLPAQDTWGTSAAAYF HTVSLLRSTS


## 3. HIV gp120 Protein

In [36]:
# 0-based slice covering bases 5771..8340 (1-based) of the HIV-1 genome, 2570 nt;
# translate() ignores the 2 bases left over after the last complete codon.
# NOTE(review): the source of these coordinates is not recorded in the
# notebook — presumably the env CDS of NC_001802.1; confirm against the
# GenBank record.
env_dna = hivdna[5770:8340]
env_tuple = translate(env_dna)
# Third element: the one-letter amino-acid string.
env_tuple[2]

'MRVKEKYQHLWRWGWRWGTMLLGMLMICSATEKLWVTVYYGVPVWKEATTTLFCASDAKAYDTEVHNVWATHACVPTDPNPQEVVLVNVTENFNMWKNDMVEQMHEDIISLWDQSLKPCVKLTPLCVSLKCTDLKNDTNTNSSSGRMIMEKGEIKNCSFNISTSIRGKVQKEYAFFYKLDIIPIDNDTTSYKLTSCNTSVITQACPKVSFEPIPIHYCAPAGFAILKCNNKTFNGTGPCTNVSTVQCTHGIRPVVSTQLLLNGSLAEEEVVIRSVNFTDNAKTIIVQLNTSVEINCTRPNNNTRKRIRIQRGPGRAFVTIGKIGNMRQAHCNISRAKWNNTLKQIASKLREQFGNNKTIIFKQSSGGDPEIVTHSFNCGGEFFYCNSTQLFNSTWFNSTWSTEGSNNTEGSDTITLPCRIKQIINMWQKVGKAMYAPPISGQIRCSSNITGLLLTRDGGNSNNESEIFRPGGGDMRDNWRSELYKYKVVKIEPLGVAPTKAKRRVVQREKRAVGIGALFLGFLGAAGSTMGAASMTLTVQARQLLSGIVQQQNNLLRAIEAQQHLLQLTVWGIKQLQARILAVERYLKDQQLLGIWGCSGKLICTTAVPWNASWSNKSLEQIWNHTTWMEWDREINNYTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWFNITNWLWYIKLFIMIVGGLVGLRIVFAVLSIVNRVRQGYSPLSFQTHLPTPRGPDRPEGIEEEGGERDRDRSIRLVNGSLALIWDDLRSLCLFSYHRLRDLLLIVTRIVELLGRRGWEALKYWWNLLQYWSQELKNSAVSLLNATAIAVAEGTDRVIEVVQGACRAIRHIPRRIRQGLERILL'

In [37]:
gp = env_tuple[2]

In [81]:
whole = translate(hivdna)[2]

In [87]:
whole.find('HKNN')

-1

In [39]:
naive_2mm('GTNGTKR', gp)

[]

In [43]:
def naive_4mm(p, t, max_mismatches=4):
    """Find approximate occurrences of p in t by naive scanning.

    Parameters
    ----------
    p : str
        Pattern to search for.
    t : str
        Text to search in.
    max_mismatches : int, optional
        Largest number of mismatching positions allowed per occurrence.
        Defaults to 4, the behaviour the function name refers to.

    Returns
    -------
    list of int
        Offsets i, in increasing order, at which ``t[i:i + len(p)]`` differs
        from p in at most ``max_mismatches`` positions. Empty when p is
        longer than t.
    """
    occurrences = []

    for i in range(len(t) - len(p) + 1):
        mismatches = 0
        for j, expected in enumerate(p):
            if t[i + j] != expected:
                mismatches += 1
                if mismatches > max_mismatches:
                    break  # budget exceeded; the rest of the window is irrelevant
        if mismatches <= max_mismatches:
            occurrences.append(i)

    return occurrences


def naive_3mm(p, t):
    """Find occurrences of p in t with at most 3 mismatches.

    Defined here because a later cell calls ``naive_3mm``, which otherwise
    fails with NameError on a fresh kernel.
    """
    return naive_4mm(p, t, max_mismatches=3)

In [44]:
naive_3mm('GTNGTKR', gp)

[]

In [45]:
gp

'MRVKEKYQHLWRWGWRWGTMLLGMLMICSATEKLWVTVYYGVPVWKEATTTLFCASDAKAYDTEVHNVWATHACVPTDPNPQEVVLVNVTENFNMWKNDMVEQMHEDIISLWDQSLKPCVKLTPLCVSLKCTDLKNDTNTNSSSGRMIMEKGEIKNCSFNISTSIRGKVQKEYAFFYKLDIIPIDNDTTSYKLTSCNTSVITQACPKVSFEPIPIHYCAPAGFAILKCNNKTFNGTGPCTNVSTVQCTHGIRPVVSTQLLLNGSLAEEEVVIRSVNFTDNAKTIIVQLNTSVEINCTRPNNNTRKRIRIQRGPGRAFVTIGKIGNMRQAHCNISRAKWNNTLKQIASKLREQFGNNKTIIFKQSSGGDPEIVTHSFNCGGEFFYCNSTQLFNSTWFNSTWSTEGSNNTEGSDTITLPCRIKQIINMWQKVGKAMYAPPISGQIRCSSNITGLLLTRDGGNSNNESEIFRPGGGDMRDNWRSELYKYKVVKIEPLGVAPTKAKRRVVQREKRAVGIGALFLGFLGAAGSTMGAASMTLTVQARQLLSGIVQQQNNLLRAIEAQQHLLQLTVWGIKQLQARILAVERYLKDQQLLGIWGCSGKLICTTAVPWNASWSNKSLEQIWNHTTWMEWDREINNYTSLIHSLIEESQNQQEKNEQELLELDKWASLWNWFNITNWLWYIKLFIMIVGGLVGLRIVFAVLSIVNRVRQGYSPLSFQTHLPTPRGPDRPEGIEEEGGERDRDRSIRLVNGSLALIWDDLRSLCLFSYHRLRDLLLIVTRIVELLGRRGWEALKYWWNLLQYWSQELKNSAVSLLNATAIAVAEGTDRVIEVVQGACRAIRHIPRRIRQGLERILL'