## Tutorial: Protein Sequence Analysis of COVID-19 using BioPython 
(https://www.youtube.com/watch?v=dxVKG2gNSos + my additions)

In [20]:
from Bio import SeqIO
from Bio.SeqUtils import GC

### Read/write file

In [2]:
# Parse the FASTA file once and keep the records in memory, instead of
# re-reading the same file for each pass over it.
fasta_path = "../data/covid-sequence.fasta"
records = list(SeqIO.parse(fasta_path, "fasta"))

# First pass: identifiers only, each followed by blank lines.
for record in records:
    print(record.id)
    print("\n")

# Second pass: the full SeqRecord summary.
# Keep the loop variable named `record`: it stays bound after the loop, so
# later cells that reference `record` keep working.
for record in records:
    print(record)

MN908947.3


ID: MN908947.3
Name: MN908947.3
Description: MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome
Number of features: 0
Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')


In [25]:
# Write a record back out as FASTA. `record` is not defined in this cell: it is
# the loop variable left bound by the parsing loops above, so this cell only
# works after that cell has run. The value displayed below the cell is
# SeqIO.write's return value (the number of records written).
SeqIO.write(record, "../data/covid-sequence-output.fasta", "fasta")

1

### Analysis

In [3]:
# SeqIO.read expects the file to hold exactly one record and returns it
# directly (unlike SeqIO.parse, which yields records one at a time).
ncov_record = SeqIO.read("../data/covid-sequence.fasta", "fasta")
ncov_record

SeqRecord(seq=Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA'), id='MN908947.3', name='MN908947.3', description='MN908947.3 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', dbxrefs=[])

In [4]:
# Pull the nucleotide sequence (the `.seq` attribute) out of the record.
ncov_dna = ncov_record.seq
ncov_dna

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [5]:
# Genome length in nucleotides.
len(ncov_dna)

29903

In [6]:
# Protein synthesis: DNA -> mRNA -> protein.

# Transcription: the sequence is treated as the coding strand (T becomes U).
ncov_mrna = ncov_dna.transcribe()

# Translation: read the transcript codon by codon from position 0; stop codons
# appear as "*" in the result.
# The genome length is not a multiple of three, and translate() warns about
# the trailing partial codon. Trim to whole codons explicitly; the resulting
# protein is the same, only the warning goes away. `ncov_mrna` itself is left
# untrimmed.
whole_codons_len = len(ncov_mrna) - len(ncov_mrna) % 3
ncov_prot = ncov_mrna[:whole_codons_len].translate()



In [7]:
# Display the translated sequence; "*" marks a stop codon.
ncov_prot

Seq('IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QD...KKK')

In [8]:
# Length of the translation (amino-acid letters plus "*" stop symbols).
len(ncov_prot)

9967

In [9]:
# Sanity check: nucleotides / 3. The result is not a whole number, i.e. the
# genome length is not a multiple of three; compare with len(ncov_prot).
len(ncov_dna)/3

9967.666666666666

In [10]:
# Same display as in an earlier cell, repeated before splitting on stop codons.
ncov_prot

Seq('IKGLYLPR*QTNQLSISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QD...KKK')

In [11]:
# Split the translation at every stop codon ("*"), giving the fragments that
# lie between consecutive stops.
ncov_aa = ncov_prot.split("*")

In [12]:
# Convert every fragment to a plain Python string.
ncov_clean = list(map(str, ncov_aa))

In [13]:
# Show every fragment. Empty strings come from two stop codons in a row.
# NOTE(review): this prints the whole list; consider slicing for a shorter output.
ncov_clean

['IKGLYLPR',
 'QTNQLSISCRSVL',
 'TNFKICVAVTRLHA',
 'CTHAV',
 'LITNYCR',
 'QDTSNSSIFCRLLTVSSVLQPIISTSRFRPGVTER',
 'DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS',
 'RWHLWLSRS',
 'KRRFAST',
 'TALCVHQTFGCSNCTSWSCYG',
 'AGSRTRRHSVRS',
 'W',
 'DTWCPCPSCGRNTSGLPQGSSS',
 'ER',
 '',
 'RSWWP',
 'LRRRSKVI',
 'LRRRAWH',
 'SL',
 'RFSRKLEH',
 'T',
 'QWCYP',
 'THA',
 'A',
 'RRGIHSLCR',
 'QLLWP',
 'WLPS',
 'VH',
 'RPSSTCW',
 'SFMHFVRTTGLY',
 'H',
 'EGCILLP',
 'T',
 'A',
 'NCLVHGTF',
 'KEL',
 'IADTF',
 'N',
 'IGKEI',
 'HLQWGMSKFCISLKFHNQDYSTKG',
 'KEKA',
 'WLYG',
 'NSICLSSCVTK',
 'MQPNVPFNSHEV',
 'SLW',
 'NFMADGRFC',
 'SHLRILWH',
 'EFD',
 'RRCHYLWLLTPKCCC',
 'NLLSSMSQFRSRT',
 'A',
 'SCRIP',
 '',
 'IWLENHSS',
 'GWSHYCLWRLCVLLCWLP',
 'QVCLLGSTC',
 'R',
 'HRL',
 'PYRCCWRRFRRS',
 '',
 'QPS',
 'NTPKRESQHQYCW',
 'L',
 'T',
 '',
 'RDRHYFGIFFCFHKCFCGNCERFGL',
 'SIQTNC',
 'ILW',
 'F',
 'SYKRKS',
 'KRCLEYW',
 'TEINTESSLCICIRGCSCCTINFLPHS',
 'NCSKFCACFTEGRYNNTRWNFTVFTETH',
 'CYDVHI',
 'FGY',
 'QSSCNGLHYRWCCSVDF

In [14]:
from collections import Counter

In [15]:
# Ten most frequent symbols in the translated sequence.
# Note that the stop symbol "*" is counted along with the amino acids.
Counter(ncov_prot).most_common(10)

[('L', 886),
 ('S', 810),
 ('*', 774),
 ('T', 679),
 ('C', 635),
 ('F', 593),
 ('R', 558),
 ('V', 548),
 ('Y', 505),
 ('N', 472)]

In [23]:
# GC content in percent, computed twice as a cross-check:
# first by hand, then with Biopython's helper.
# NOTE(review): `Bio.SeqUtils.GC` is reportedly deprecated in recent Biopython
# releases in favour of `gc_fraction` — confirm against the installed version.
gc_count = float(ncov_dna.count("G") + ncov_dna.count("C"))
print(100 * gc_count / len(ncov_dna))
print(GC(ncov_dna))

37.97277865097148
37.97277865097148


## 3D structure of COVID

#### File format:
- pdb: PDBParser
- cif: MMCIFParser

#### links:
- https://www.ncbi.nlm.nih.gov/Structure/pdb/6LU7 (The crystal structure of COVID-19 main protease in complex with an inhibitor N3)
- Protein Data Bank: https://www.rcsb.org/structure/6LU7

#### what to install? 
- nglview
- py3Dmol
- pytraj
- jupyter-nbextension enable nglview --py --sys-prefix
- nglview enable
- jupyter-labextension install @jupyter-widgets/jupyterlab-manager
- jupyter-labextension install nglview-js-widgets

In [16]:
from Bio.PDB import PDBParser

In [17]:
# Parse the PDB file into a structure object; the first argument is the
# identifier given to the resulting structure, the second is the file path.
parser = PDBParser()
structure = parser.get_structure("6lu7" , "../data/6lu7.pdb")



In [19]:
len(structure)
# NOTE(review): the original note here read "check structure, something wrong".
# len() of a Bio.PDB Structure presumably counts its models, in which case 1
# would be expected for a single-model file — confirm before treating it as an
# error.

1