<a href="https://colab.research.google.com/github/alexchen1999/covid-19-sample-strain-classification/blob/main/random_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install Bio
!pip install bcbio-gff
!pip install ncbi-acc-download

Collecting bcbio-gff
  Downloading bcbio-gff-0.6.9.tar.gz (44 kB)
[K     |████████████████████████████████| 44 kB 2.6 MB/s 
Building wheels for collected packages: bcbio-gff
  Building wheel for bcbio-gff (setup.py) ... [?25l[?25hdone
  Created wheel for bcbio-gff: filename=bcbio_gff-0.6.9-py3-none-any.whl size=15954 sha256=3f3089a1f738a82b7dfb598ce3a81a2dd53a56f1e89f52ff46806ba44a8ad72d
  Stored in directory: /root/.cache/pip/wheels/fb/fc/e0/2e7658046d32b794fdfd1e0ec266dc8bae21c1811b18951b6d
Successfully built bcbio-gff
Installing collected packages: bcbio-gff
Successfully installed bcbio-gff-0.6.9


In [2]:
import pandas as pd
import subprocess
from Bio import SeqIO
from sklearn.model_selection import train_test_split

In [3]:
# Per-lineage CSV files, one per variant. The order matters: later cells
# index this list positionally (0..3).
covid_strains = [
    'B.1.1.7_sequences.csv',
    'P.1_sequences.csv',
    'B.1.617.2_sequences.csv',
    'BA.1.1_sequences.csv',
]

In [10]:
# Read one table per variant, in the order the files are listed in covid_strains:
# index 0 -> alpha, 1 -> gamma, 2 -> delta, 3 -> omicron.
alpha, gamma, delta, omicron = [
    pd.read_csv(covid_strains[position]) for position in range(4)
]

In [11]:
# Row count of the smallest of the four variant tables.
shortest = min(len(frame) for frame in (alpha, gamma, delta, omicron))
print(shortest)

3735


In [14]:
# Draw `shortest` rows from every variant table so all four samples have the
# same number of rows. A fixed random_state makes the draw reproducible after
# a kernel restart; without it each run selects a different subset.
SAMPLE_SEED = 42

alpha_sample = alpha.sample(n=shortest, random_state=SAMPLE_SEED)
print(alpha_sample.shape[0])
gamma_sample = gamma.sample(n=shortest, random_state=SAMPLE_SEED)
print(gamma_sample.shape[0])
delta_sample = delta.sample(n=shortest, random_state=SAMPLE_SEED)
print(delta_sample.shape[0])
omicron_sample = omicron.sample(n=shortest, random_state=SAMPLE_SEED)
print(omicron_sample.shape[0])

3735
3735
3735
3735


In [15]:
# Show the distinct 'Country' values present in each variant sample,
# in the order alpha, gamma, delta, omicron.
for variant_sample in (alpha_sample, gamma_sample, delta_sample, omicron_sample):
    print(variant_sample['Country'].unique())

['USA' 'Togo' 'West Bank' 'Spain' 'Nigeria' 'Japan' 'Iraq' 'Bangladesh'
 'Djibouti' 'Pakistan' 'Egypt' 'Saudi Arabia' 'Austria' 'Taiwan' 'Mexico']
['USA' 'Paraguay' 'Chile' 'Mexico' 'Dominican Republic' 'Brazil' 'Taiwan'
 'Peru' 'Italy']
['USA' 'Egypt' 'Bangladesh' 'Mongolia' 'Bahrain' 'Myanmar' 'West Bank'
 'Jamaica' 'India' 'Gabon' 'Uzbekistan' 'Pakistan' 'Russia' 'China'
 'Denmark' 'Chile' 'Japan']
['USA' 'Japan' 'Bahrain' 'France']


In [66]:
from BCBio import GFF

in_file = "GCF_009858895.2_ASM985889v3_genomic.gff"

# Collect the top-level features of every record in the annotation file.
# The context manager closes the handle even if parsing raises, and extend()
# keeps the features of all records instead of only the last one parsed.
features = []
with open(in_file) as in_handle:
    for rec in GFF.parse(in_handle):
        features.extend(rec.features)

# spike glycoprotein locus tag GU280_gp02
# Print every feature whose id contains the locus tag, followed by its location
# and integer start/end coordinates (read through the public start/end
# properties rather than the private _start/_end attributes).
for feature in features:
    if "GU280_gp02" in feature.id:
        print(feature)
        print(feature.location)
        print(int(feature.location.start))
        print(int(feature.location.end))

type: gene
location: [21562:25384](+)
id: gene-GU280_gp02
qualifiers:
    Key: Dbxref, Value: ['GeneID:43740568']
    Key: ID, Value: ['gene-GU280_gp02']
    Key: Name, Value: ['S']
    Key: gbkey, Value: ['Gene']
    Key: gene, Value: ['S']
    Key: gene_biotype, Value: ['protein_coding']
    Key: gene_synonym, Value: ['spike glycoprotein']
    Key: locus_tag, Value: ['GU280_gp02']
    Key: source, Value: ['RefSeq']

[21562:25384](+)
21562
25384


In [31]:
# Accession identifiers of the sampled Alpha rows, in sample order.
# Pulling the column out in one call replaces the per-row `.iloc[i][...]`
# append loop; alpha_sample was drawn with n=shortest, so the whole column is
# exactly the rows the loop used to visit.
alpha_sample_accessions = alpha_sample['Accession'].tolist()

In [33]:
# Print the ncbi-acc-download command-line usage to see the available options.
!ncbi-acc-download --help

usage: ncbi-acc-download [-h] [-m {nucleotide,protein}] [--api-key API_KEY]
                         [-F {fasta,genbank,featuretable,gff3}] [-o OUT]
                         [-p PREFIX] [-g RANGE] [--url] [-v]
                         NCBI-accession [NCBI-accession ...]

positional arguments:
  NCBI-accession

optional arguments:
  -h, --help            show this help message and exit
  -m {nucleotide,protein}, --molecule {nucleotide,protein}
                        Molecule type to download. Default: nucleotide
  --api-key API_KEY     Specify USER NCBI API key. More info at
                        https://www.ncbi.nlm.nih.gov/books/NBK25497/
  -F {fasta,genbank,featuretable,gff3}, --format {fasta,genbank,featuretable,gff3}
                        File format to download nucleotide sequences in.
                        Default: genbank
  -o OUT, --out OUT     Single filename to use for the combined output.
  -p PREFIX, --prefix PREFIX
                        Filename prefix to use for 

In [40]:
# Test NCBI accession download
# Take the first sampled Alpha accession as the trial input; `test` is
# interpolated into the shell download command in the following cell.
test = alpha_sample_accessions[0]
print(test)

MZ375649.1


In [39]:
!ncbi-acc-download --format fasta $test

In [49]:
# Download the first few sampled Alpha accessions as FASTA files.
# Slicing (instead of indexing 0..2) tolerates a list shorter than the limit,
# and the exit status is checked so a failed download is reported here rather
# than surfacing later as a missing file.
N_TEST_DOWNLOADS = 3

for accession in alpha_sample_accessions[:N_TEST_DOWNLOADS]:
    completed = subprocess.run(["ncbi-acc-download", "--format", "fasta", accession])
    if completed.returncode != 0:
        print(f"ncbi-acc-download failed for {accession} (exit code {completed.returncode})")

In [64]:
# SeqIO: parse the downloaded FASTA file (SeqIO.read expects exactly one
# record), then show the record summary and the sequence length.
fasta_path = "MZ375649.1.fa"
record = SeqIO.read(fasta_path, "fasta")
print(record)  # first record
sequence_length = len(record.seq)
print(sequence_length)

ID: MZ375649.1
Name: MZ375649.1
Description: MZ375649.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/CO-CDC-MMB08714138/2021, complete genome
Number of features: 0
Seq('TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...TGT')
29781


In [65]:
# NOTE(review): this cell reads the same file with the same calls as the
# previous cell; it looks like a leftover copy and is a candidate for removal.
fasta_path = "MZ375649.1.fa"
record = SeqIO.read(fasta_path, "fasta")
print(record)  # first record
sequence_length = len(record.seq)
print(sequence_length)

ID: MZ375649.1
Name: MZ375649.1
Description: MZ375649.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/CO-CDC-MMB08714138/2021, complete genome
Number of features: 0
Seq('TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...TGT')
29781


In [66]:
# Parse a second FASTA file and print its record summary and sequence length.
# Presumably this file came from the download loop above; verify it exists
# before running.
fasta_path = "MZ002964.1.fa"
record = SeqIO.read(fasta_path, "fasta")
print(record)  # first record
sequence_length = len(record.seq)
print(sequence_length)

ID: MZ002964.1
Name: MZ002964.1
Description: MZ002964.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/NJ-CDC-LC0038715/2021, complete genome
Number of features: 0
Seq('TAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCA...TGA')
29695
