In [None]:
# Inputs: the strain's NCBI page and FASTA file must be uploaded manually.
# Currently, this notebook has to be run once for each strain.
strain = 'NC_045512.2'
feature_types = ('gene', 'mat_peptide')
gene_length_max = 5000  # temporary upper bound on gene sequence length

# File locations, all derived from the strain identifier above.
ncbi_page_path = f'./drive/MyDrive/Colab Notebooks/ncbi_pages/{strain}.txt'
fasta_path = f'./drive/MyDrive/Colab Notebooks/fasta_files/{strain}.fasta'
pickles_path = './drive/MyDrive/Colab Notebooks/pickles/'
pickles_file = f'{strain}.pickle'

In [None]:
try:
    from Bio import SeqIO
except ImportError as e:
    !pip install biopython
    from Bio import SeqIO
import pickle
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
# Collect the raw text of every feature whose key is listed in `feature_types`.
# Each entry of `features` is a list of strings: the location line
# (e.g. 'gene            266..21555') followed by one string per '/qualifier'.
features = []
with open(ncbi_page_path) as file:
  feature_lines = []  # lines of the feature being collected; empty when outside one
  for line in file:
    line = line.strip()
    if feature_lines:
      last_line = feature_lines[-1]
      # A quoted qualifier value may wrap over several lines. An odd number of
      # double quotes in the last collected line means its value is still
      # open, so this line is a continuation rather than a new qualifier.
      if last_line.count('"') % 2 == 1:
        # Wrapped text is re-joined with a space; a wrapped /translation is a
        # single residue string, so it is joined without one.
        joiner = '' if last_line.startswith('/translation=') else ' '
        feature_lines[-1] = last_line + joiner + line
        continue
      if line.startswith('/'):
        feature_lines.append(line)
        continue
      # Any other line closes the current feature. Fall through so the same
      # line can still open the next feature.
      features.append(feature_lines)
      feature_lines = []
    # Compare the whole first token with the wanted keys, so a line that
    # merely begins with the same letters (e.g. 'genes ...') is not captured.
    tokens = line.split(None, 1)
    if tokens and tokens[0] in feature_types:
      feature_lines.append(line)
  # Keep a feature that was still being collected when the file ended.
  if feature_lines:
    features.append(feature_lines)
features[:4]

[['gene            266..21555',
  '/gene="ORF1ab"',
  '/locus_tag="GU280_gp01"',
  '/db_xref="GeneID:43740578"'],
 ['mat_peptide     266..805',
  '/gene="ORF1ab"',
  '/locus_tag="GU280_gp01"',
  '/product="leader protein"',
  '/note="nsp1; produced by both pp1a and pp1ab"',
  '/protein_id="YP_009725297.1"'],
 ['mat_peptide     806..2719',
  '/gene="ORF1ab"',
  '/locus_tag="GU280_gp01"',
  '/product="nsp2"',
  '/note="produced by both pp1a and pp1ab"',
  '/protein_id="YP_009725298.1"'],
 ['mat_peptide     2720..8554',
  '/gene="ORF1ab"',
  '/locus_tag="GU280_gp01"',
  '/product="nsp3"',
  '/note="former nsp1; conserved domains are: N-terminal']]

In [None]:
# Turn each raw feature (location line + '/key="value"' qualifier lines) into
# a flat dict holding 'type', 'start', 'end' and one entry per qualifier.
# If a qualifier key occurs more than once, the last value wins.
dna_features = []
for feature_lines in features:
  location_parts = feature_lines[0].split()
  if len(location_parts) != 2:
    continue  # not a '<type> <location>' line; nothing usable to parse
  feature_type, coords_text = location_parts
  # TODO: handle join cases: mat_peptide join(13442..13468,13468..16236)
  # Only plain 'start..end' locations are supported. Anything else (join(...),
  # complement(...), partial '<1..200', single positions) fails the unpacking
  # or the int() conversion with ValueError and is skipped instead of
  # aborting the whole cell.
  try:
    start, end = (int(coord) for coord in coords_text.split('..'))
  except ValueError:
    continue
  feature = {'type': feature_type, 'start': start, 'end': end}
  for line in feature_lines[1:]:
    # Split on the first '=' only, so values that contain '=' stay intact;
    # a qualifier without any '=' is stored with an empty string value.
    key, _sep, val = line.partition('=')
    feature[key[1:]] = val.strip('",')  # key[1:] drops the leading '/'
  dna_features.append(feature)
dna_features[:4]

[{'type': 'gene',
  'start': 266,
  'end': 21555,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'db_xref': 'GeneID:43740578'},
 {'type': 'mat_peptide',
  'start': 266,
  'end': 805,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'product': 'leader protein',
  'note': 'nsp1; produced by both pp1a and pp1ab',
  'protein_id': 'YP_009725297.1'},
 {'type': 'mat_peptide',
  'start': 806,
  'end': 2719,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'product': 'nsp2',
  'note': 'produced by both pp1a and pp1ab',
  'protein_id': 'YP_009725298.1'},
 {'type': 'mat_peptide',
  'start': 2720,
  'end': 8554,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'product': 'nsp3',
  'note': 'former nsp1; conserved domains are: N-terminal'}]

In [None]:
# Load the genome: the sequence of the first record in the FASTA file.
# Iterate over the object returned by SeqIO.parse() itself, which is the
# documented interface, instead of reaching into its `.records` attribute.
fasta = SeqIO.parse(fasta_path, "fasta")
records = list(fasta)
if not records:
  # Fail with a clear message rather than an IndexError on records[0].
  raise ValueError('no FASTA records found in ' + fasta_path)
dna = records[0].seq
dna

Seq('ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [None]:
# Attach the nucleotide slice and its translation to every feature that is
# shorter than `gene_length_max`; longer features are left unchanged.
for feature in dna_features:
  # 'start'/'end' are used as 1-based inclusive positions, so shifting the
  # start by one gives the equivalent 0-based half-open slice bounds.
  start, end = feature['start'] - 1, feature['end']
  if not (end - start < gene_length_max):
    continue
  coding_region = dna[start:end]
  feature['nucleotides'] = coding_region
  feature['translation'] = coding_region.translate()
dna_features[:4]

[{'type': 'gene',
  'start': 266,
  'end': 21555,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'db_xref': 'GeneID:43740578'},
 {'type': 'mat_peptide',
  'start': 266,
  'end': 805,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'product': 'leader protein',
  'note': 'nsp1; produced by both pp1a and pp1ab',
  'protein_id': 'YP_009725297.1',
  'nucleotides': Seq('ATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTG...GGG'),
  'amino_acids': Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...NGG'),
  'translation': Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...NGG')},
 {'type': 'mat_peptide',
  'start': 806,
  'end': 2719,
  'gene': 'ORF1ab',
  'locus_tag': 'GU280_gp01',
  'product': 'nsp2',
  'note': 'produced by both pp1a and pp1ab',
  'protein_id': 'YP_009725298.1',
  'nucleotides': Seq('GCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTT...GGT'),
  'translation': Seq('AYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKRGVYCCREH...KGG')},
 {'type': '

In [None]:
# Persist the annotated features for this strain, then list the output
# directory as a quick check that the file was written.
# On a first run the pickles directory may not exist yet, so create it.
os.makedirs(pickles_path, exist_ok=True)
# os.path.join gives the same path whether or not pickles_path ends with '/'.
with open(os.path.join(pickles_path, pickles_file), 'wb') as file:
  pickle.dump(dna_features, file)
os.listdir(pickles_path)

['.ipynb_checkpoints', 'NC_045512.2.pickle']