<a href="https://colab.research.google.com/github/alexchen1999/covid-19-sample-strain-classification/blob/main/random_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install Bio
!pip install bcbio-gff
!pip install ncbi-acc-download

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Bio
  Downloading bio-1.3.9-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 10.2 MB/s 
[?25hCollecting biopython>=1.79
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 44.7 MB/s 
Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting biothings-client>=0.2.6
  Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)
Installing collected packages: biothings-client, mygene, biopython, Bio
Successfully installed Bio-1.3.9 biopython-1.79 biothings-client-0.2.6 mygene-3.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bcbio-gff
  Downloading bcbio-gff-0.6.9.tar.gz (44 kB)
[K     |████████████████████████████████| 44 kB 455 kB/s 
Building wheels for collected pac

In [2]:
import pandas as pd
import subprocess
from Bio import Align, SeqIO
from sklearn.model_selection import train_test_split

In [3]:
covid_strains = ['B.1.1.7_sequences.csv', 'P.1_sequences.csv', 'B.1.617.2_sequences.csv', 'BA.1.1_sequences.csv']

In [22]:
alpha = pd.read_csv(covid_strains[0])
gamma = pd.read_csv(covid_strains[1])
delta = pd.read_csv(covid_strains[2])
omicron = pd.read_csv(covid_strains[3])

In [23]:
shortest = min([alpha.shape[0], gamma.shape[0], delta.shape[0], omicron.shape[0]])
print(shortest)

3735


In [24]:
alpha_sample = alpha.sample(n=shortest)
print(alpha_sample.shape[0])
gamma_sample = gamma.sample(n=shortest)
print(gamma_sample.shape[0])
delta_sample = delta.sample(n=shortest)
print(delta_sample.shape[0])
omicron_sample = omicron.sample(n=shortest)
print(omicron_sample.shape[0])

3735
3735
3735
3735


In [26]:
print(alpha_sample['Country'].unique())
print(gamma_sample['Country'].unique())
print(delta_sample['Country'].unique())
print(omicron_sample['Country'].unique())

['USA' 'India' 'Japan' 'Spain' 'Nigeria' 'Egypt' 'Pakistan' 'Iraq'
 'Austria' 'Bangladesh' 'West Bank' 'Djibouti']
['USA' 'Paraguay' 'Mexico' 'Italy' 'Brazil' 'Chile' 'Taiwan'
 'Dominican Republic' 'Peru']
['USA' 'Bangladesh' 'Mongolia' 'Egypt' 'India' 'China' 'Gabon' 'Bahrain'
 'Chile' 'Myanmar' 'Pakistan' 'Uzbekistan' 'Russia' 'Denmark' 'West Bank'
 'Jamaica' 'Japan']
['USA' 'Japan' 'France' 'Bahrain']


In [27]:
# From SARS-Cov-2 GFF file, find genomic location of Spike Glycoprotein
from BCBio import GFF
in_file = "GCF_009858895.2_ASM985889v3_genomic.gff"
in_handle = open(in_file)
features = []
for rec in GFF.parse(in_handle):
    features = rec.features
in_handle.close()

start = -1
end = -1

# spike glycoprotein locus tag GU280_gp02
for i in range(len(features)):
    if "GU280_gp02" in features[i].id:
      print(features[i])
      print(features[i].location)
      print(int(features[i].location._start))
      print(int(features[i].location._end))
      start = int(features[i].location._start)
      end = int(features[i].location._end)

type: gene
location: [21562:25384](+)
id: gene-GU280_gp02
qualifiers:
    Key: Dbxref, Value: ['GeneID:43740568']
    Key: ID, Value: ['gene-GU280_gp02']
    Key: Name, Value: ['S']
    Key: gbkey, Value: ['Gene']
    Key: gene, Value: ['S']
    Key: gene_biotype, Value: ['protein_coding']
    Key: gene_synonym, Value: ['spike glycoprotein']
    Key: locus_tag, Value: ['GU280_gp02']
    Key: source, Value: ['RefSeq']

[21562:25384](+)
21562
25384


In [28]:
# Find spike glycoprotein sequence from reference genome
record = SeqIO.read("NC_045512.2.fa", "fasta")
print(record.__dict__)
print(record._seq)
print(type(record._seq))
print(str(record._seq))

# Take from 21562 to 25384
spike_seq = str(record._seq)[start:end]
print(spike_seq)
print(len(spike_seq))

{'_seq': Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA'), 'id': 'NC_045512.2', 'name': 'NC_045512.2', 'description': 'NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome', 'dbxrefs': [], 'annotations': {}, '_per_letter_annotations': {}, 'features': []}
ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATC

In [29]:
alpha_sample_accessions = []
for i in range(shortest):
  alpha_sample_accessions.append(alpha_sample.iloc[i]['Accession'])

In [30]:
!ncbi-acc-download --help

usage: ncbi-acc-download [-h] [-m {nucleotide,protein}] [--api-key API_KEY]
                         [-e {none,loads,all,correct}]
                         [-F {fasta,genbank,featuretable,gff3}] [-o OUT]
                         [-p PREFIX] [-g RANGE] [-r] [--url] [-v]
                         NCBI-accession [NCBI-accession ...]

positional arguments:
  NCBI-accession

optional arguments:
  -h, --help            show this help message and exit
  -m {nucleotide,protein}, --molecule {nucleotide,protein}
                        Molecule type to download. Default: nucleotide
  --api-key API_KEY     Specify USER NCBI API key. More info at
                        https://www.ncbi.nlm.nih.gov/books/NBK25497/
  -e {none,loads,all,correct}, --extended-validation {none,loads,all,correct}
                        Perform extended validation. Possible options are
                        'none' to skip validation, 'loads' to check if the
                        sequence file loads in Biopython, or '

In [31]:
# Test NCBI accession download
test = alpha_sample_accessions[0]
print(test)

MZ039190.1


In [32]:
!ncbi-acc-download --format fasta $test

In [33]:
for i in range(3):
  subprocess.run(["ncbi-acc-download", "--format", "fasta", alpha_sample_accessions[i]])

In [38]:
# SeqIO
record = SeqIO.read("MZ039190.1.fa", "fasta")
print(record)  # first record
print(len(record.seq))
seq = str(record._seq)[start:end]
print(seq)
print(len(seq))

ID: MZ039190.1
Name: MZ039190.1
Description: MZ039190.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/SD-CDC-LC0043523/2021, complete genome
Number of features: 0
Seq('AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCAT...GAA')
29694
ACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGT

In [39]:
# Pairwise alignment
aligner = Align.PairwiseAligner()
aligner.mode = 'global'
alignments = aligner.align(seq, spike_seq)
print(alignments[0])
print(type(alignments[0]))
print(alignments[0].aligned)

A-----------CT----------C-A--A-T-T--A--C-----------C---C--CC-----T------------GCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTAT-C-T--CT--GGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTA---CCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAA

In [47]:
# Credit to https://2-bitbio.com/2018/06/one-hot-encode-dna-sequence-using.html

from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
alpha_spike_seq_one_hot = pd.DataFrame(columns=["Accession", "Seq"])


In [48]:
# Alpha sample accessions
# Take from 21562 to 25384 from samples, should be a good enough approximation
alpha_spike_seq_df = pd.DataFrame(columns = ['Accession', 'Seq'])
for i in range(3):
  accession = alpha_sample_accessions[i]
  print(accession)
  subprocess.run(["ncbi-acc-download", "--format", "fasta", accession])
  record = SeqIO.read(accession + ".fa", "fasta")
  spike_seq = str(record._seq)[start:end]

  # One-hot encoding
  seq_array = array(list(spike_seq))
  label_encoder = LabelEncoder()
  integer_encoded_seq = label_encoder.fit_transform(seq_array)
    
  #one hot the sequence
  onehot_encoder = OneHotEncoder(sparse=False)
  #reshape because that's what OneHotEncoder likes
  integer_encoded_seq = integer_encoded_seq.reshape(len(integer_encoded_seq), 1)
  onehot_encoded_seq = onehot_encoder.fit_transform(integer_encoded_seq)

  print(onehot_encoded_seq)

  alpha_spike_seq_df.loc[len(alpha_spike_seq_df.index)] = [accession, onehot_encoded_seq]
  subprocess.run(['rm', accession + ".fa"])

MZ039190.1
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
MZ056250.1
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]
MW912322.1
[[0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]
