In [1]:
"""
Author:   Cody Hawkins
Date:     Sep 15,2020
Desc:     Find the longest common subsequence (LCS) of the nsp2
          protein sequence with each of A0A1B3Q5V8 and Q9YMB7
"""
def LCS(X, Y):
  """Fill the dynamic-programming tables for the longest common subsequence.

  Args:
    X: first sequence (a string or a list of single elements).
    Y: second sequence (same kind as X).

  Returns:
    A pair (c, b) of (len(X)+1) x (len(Y)+1) tables.
    c[i][j] is the LCS length of X[:i] and Y[:j].
    b[i][j] records the move taken to reach that cell: "UL" when
    X[i-1] == Y[j-1], "up" when the value came from c[i-1][j], and " "
    when it came from c[i][j-1]. Row 0 and column 0 of b stay "".
  """
  rows = len(X) + 1
  cols = len(Y) + 1

  c = [[0] * cols for _ in range(rows)]
  b = [[""] * cols for _ in range(rows)]

  for i, x_item in enumerate(X, start=1):
    for j, y_item in enumerate(Y, start=1):
      if x_item == y_item:
        # match: extend the LCS of the two shorter prefixes
        c[i][j] = c[i-1][j-1] + 1
        b[i][j] = "UL"
        continue

      above = c[i-1][j]
      left = c[i][j-1]
      if left > above:
        c[i][j] = left
        b[i][j] = " "
      else:
        # ties go to the cell above
        c[i][j] = above
        b[i][j] = "up"
  return c, b

# the tables are 1-indexed while X is 0-indexed, so cell (i, j) maps to X[i-1]
def print_lcs(b, X, i, j, h):
  """Rebuild the LCS by walking the direction table back from cell (i, j).

  The walk is iterative, so its depth does not depend on the sequence
  lengths and cannot hit Python's recursion limit.

  Args:
    b: direction table as returned by LCS(). "UL" means the elements match
       and the walk moves diagonally, "up" moves to row i-1, and any other
       value moves to column j-1.
    X: the first sequence that was given to LCS() (string or list).
    i: row to start from (len(X) for the full sequences).
    j: column to start from (len(Y) for the full sequences).
    h: list that receives the LCS elements, appended in sequence order
       after whatever it already contains.

  Returns:
    The same list object `h`. It is returned unchanged when i or j is 0,
    so callers can always join or measure the result.
  """
  # The walk visits matches from last to first; collect them separately and
  # reverse once so that `h` is extended in sequence order.
  matched = []
  while i > 0 and j > 0:
    move = b[i][j]
    if move == "UL":
      matched.append(X[i-1])
      i -= 1
      j -= 1
    elif move == "up":
      i -= 1
    else:
      j -= 1
  h.extend(reversed(matched))
  return h


# sanity check: run both helpers on a short pair of strings and print
# the recovered subsequence together with its length
X, Y = "ABCBDAB", "BDCABA"
c, b = LCS(X, Y)

holder = list()
common = print_lcs(b, X, len(X), len(Y), holder)

print("".join(("X: ", X, "\nY: ", Y)))
print("LCS: " + "".join(common))
print("LCS length: " + str(len(common)))

X: ABCBDAB
Y: BDCABA
LCS: BCBA
LCS length: 4


In [2]:
# import protein sequence text file
# NOTE: google.colab is only importable inside a Colab runtime, so this cell
# cannot run in a plain Jupyter kernel.
from google.colab import files
# `data` is indexed by file name in the parsing cell and each value is decoded
# there as UTF-8, i.e. it is used as a mapping of file name -> raw bytes.
# NOTE(review): upload() presumably waits for an interactive file selection,
# which makes "Run All" block here - confirm.
data = files.upload()

Saving LCS_sequences.txt to LCS_sequences (1).txt


In [3]:
# parse text file: decode the uploaded bytes and break them into lines
filename = "LCS_sequences.txt"
# Normalise Windows (CRLF) line endings to "\n" before splitting, so the file
# is broken into lines whether it was saved with "\r\n" or with plain "\n".
# For a file that uses only CRLF endings this gives the same list as
# split("\r\n"), including the trailing empty string after a final newline.
temp = data[filename].decode("utf-8").replace("\r\n", "\n").split("\n")

In [4]:
# remove the lower case letters from the protein sequences
# Lines 0, 1 and 2 of the file hold nsp2, A0A1B3Q5V8 and Q9YMB7 in that order.
# Only those three lines are read, so a trailing empty line is harmless and
# the third sequence is read even when the file has no final newline.
_upper_residues = [[ch for ch in line if ch.isupper()] for line in temp[:3]]
# pad with empty lists so the unpacking below also works for a short file
_upper_residues += [[] for _ in range(3 - len(_upper_residues))]
nsp2, A0A1B3Q5V8, Q9YMB7 = _upper_residues

# check to ensure lowercase letters removed
print(nsp2)
print("")
print(A0A1B3Q5V8)
print("")
print(Q9YMB7)


['A', 'Y', 'T', 'R', 'Y', 'V', 'D', 'N', 'N', 'F', 'C', 'G', 'P', 'D', 'G', 'Y', 'P', 'L', 'E', 'C', 'I', 'K', 'D', 'L', 'L', 'A', 'R', 'A', 'G', 'K', 'A', 'S', 'C', 'T', 'L', 'S', 'E', 'Q', 'L', 'D', 'F', 'I', 'D', 'T', 'K', 'R', 'G', 'V', 'Y', 'C', 'C', 'R', 'E', 'H', 'E', 'H', 'E', 'I', 'A', 'W', 'Y', 'T', 'E', 'R', 'S', 'E', 'K', 'S', 'Y', 'E', 'L', 'Q', 'T', 'P', 'F', 'E', 'I', 'K', 'L', 'A', 'K', 'K', 'F', 'D', 'T', 'F', 'N', 'G', 'E', 'C', 'P', 'N', 'F', 'V', 'F', 'P', 'L', 'N', 'S', 'I', 'I', 'K', 'T', 'I', 'Q', 'P', 'R', 'V', 'E', 'K', 'K', 'K', 'L', 'D', 'G', 'F', 'M', 'G', 'R', 'I', 'R', 'S', 'V', 'Y', 'P', 'V', 'A', 'S', 'P', 'N', 'E', 'C', 'N', 'Q', 'M', 'C', 'L', 'S', 'T', 'L', 'M', 'K', 'C', 'D', 'H', 'C', 'G', 'E', 'T', 'S', 'W', 'Q', 'T', 'G', 'D', 'F', 'V', 'K', 'A', 'T', 'C', 'E', 'F', 'C', 'G', 'T', 'E', 'N', 'L', 'T', 'K', 'E', 'G', 'A', 'T', 'T', 'C', 'G', 'Y', 'L', 'P', 'Q', 'N', 'A', 'V', 'V', 'K', 'I', 'Y', 'C', 'P', 'A', 'C', 'H', 'N', 'S', 'E', 'V', 'G', 'P',

In [8]:
# LCS of nsp2 and A0A1B3Q5V8
m1, n1 = len(nsp2), len(A0A1B3Q5V8)
c, b = LCS(nsp2, A0A1B3Q5V8)

# walk the direction table back from its bottom-right cell
hold = list()
lcs1 = print_lcs(b, nsp2, m1, n1, hold)

print("".join(("nsp2: ", "".join(nsp2), "\nA0A1B3Q5V8: ", "".join(A0A1B3Q5V8))))
print("LCS: " + "".join(lcs1))
print("LCS length: " + str(len(lcs1)))

nsp2: AYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKRGVYCCREHEHEIAWYTERSEKSYELQTPFEIKLAKKFDTFNGECPNFVFPLNSIIKTIQPRVEKKKLDGFMGRIRSVYPVASPNECNQMCLSTLMKCDHCGETSWQTGDFVKATCEFCGTENLTKEGATTCGYLPQNAVVKIYCPACHNSEVGPEHSLAEYHNESGLKTILRKGGRTIAFGGCVFSYVGCHNKCAYWVPRASANIGCNHTGVVGEGSEGLNDNLLEILQKEKVNINIVGDFKLNEEIAIILASFSASTSAFVETVKGLDYKAFKQIVESCGNFKVTKGKAKKGAWNIGEQKSILSPLYAFASEAARVVRSIFSRTLETAQNSVRVLQKAAITILDGISQYSLRLIDAMMFTSDLATNNLVVMAYITGGVVQLTSQWLTNIFGTVYEKLKPVLDWLEEKFKEGVEFLRDGWEIVKFISTCACEIVGGQIVTCAKEIKESVQTFFKLVNKFLALCADSIIIGGAKLKALNLGETFVTHSKGLYRKCVKSREETGLLMPLKAPKEIIFLEGETLPTEVLTEEVVLKTGDLQPLEQPTSEAVEAPLVGTPVCINGLMLLEIKDTEKYCALAPNMMVTNNTFTLKGG
A0A1B3Q5V8: FLTDQYGFDADGVLAAPIKEVLGDKGAGMSRTADGYELPSGIVKVAVKVVRKNLPVSKQSIFTVLGVTERVVDGFYYPYSTNSVVSYTKPRAGATVGNTVQSVMLSVYGTEAYNPVTPVVRLRCSSCDFYGWVPVKDLGCVTCSCAAVHQSCIDAESAGLIKQGAVMLVDRSPSMRVVPGNRYVAFGGAIWSPIGKVNGVQVWVPRAYSCVAGDHSGAVGSGDVTINKEIMSLIIDGVRIDDEVLEQPSCGVLIANLEDPSAAPRVHTVDSLRQLCVDNNDTKDEFHPGLSYHFYRACWYGVLTAKSFGAFKELLQSEEVRLSHFCANIRRCLDRALNWARTT

In [10]:
# LCS of nsp2 and Q9YMB7
m2, n2 = len(nsp2), len(Q9YMB7)
d, e = LCS(nsp2, Q9YMB7)

# walk the direction table back from its bottom-right cell
new_hold = list()
lcs2 = print_lcs(e, nsp2, m2, n2, new_hold)

print("".join(("nsp2: ", "".join(nsp2), "\nQ9YMB7: ", "".join(Q9YMB7))))
print("LCS: " + "".join(lcs2))
print("LCS length: " + str(len(lcs2)))

nsp2: AYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKRGVYCCREHEHEIAWYTERSEKSYELQTPFEIKLAKKFDTFNGECPNFVFPLNSIIKTIQPRVEKKKLDGFMGRIRSVYPVASPNECNQMCLSTLMKCDHCGETSWQTGDFVKATCEFCGTENLTKEGATTCGYLPQNAVVKIYCPACHNSEVGPEHSLAEYHNESGLKTILRKGGRTIAFGGCVFSYVGCHNKCAYWVPRASANIGCNHTGVVGEGSEGLNDNLLEILQKEKVNINIVGDFKLNEEIAIILASFSASTSAFVETVKGLDYKAFKQIVESCGNFKVTKGKAKKGAWNIGEQKSILSPLYAFASEAARVVRSIFSRTLETAQNSVRVLQKAAITILDGISQYSLRLIDAMMFTSDLATNNLVVMAYITGGVVQLTSQWLTNIFGTVYEKLKPVLDWLEEKFKEGVEFLRDGWEIVKFISTCACEIVGGQIVTCAKEIKESVQTFFKLVNKFLALCADSIIIGGAKLKALNLGETFVTHSKGLYRKCVKSREETGLLMPLKAPKEIIFLEGETLPTEVLTEEVVLKTGDLQPLEQPTSEAVEAPLVGTPVCINGLMLLEIKDTEKYCALAPNMMVTNNTFTLKGG
Q9YMB7: YVDQYMCGADGKPVGDFKDYFGDEDIIEFEGEEYHCAWTTVRDEKPLNQQTLFTIQEIQYNLDIPHKLPNCATRHVAPPVKKNSKIVLYKKLYDIFGSPFMGNGDCLSKCFDTLHFIAATRCPCGSESSGVGDWTGFKTACCSGKVKGVTLGDIKPGDAVMSAGKVKFFANCVLQYAGDVEGVSIWKVIKTFTVDETVCTPGFEGELNDFIKPESKSLVACSVKRAFITGDIDDAVHDCIITGKLDLSTNLFGNVGLFKKTPWFQKCGALFVDAWKVVEELCGSLTLTYKQIYEVVASLCTSAFTIVNYTFVVPDNRVKDLVDKCVKVLVKAFDVFTQIITIAGIEA