In [None]:
# import packages
try:
  from Bio import SeqIO
except ImportError as e:
  !pip install biopython
  from Bio import SeqIO
import pickle
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
# configs
gene_length_max = 5000 # temporary max limit on gene sequence length
feature_types = ('CDS', 'mat_peptide')
downloaded_strains = [
  'NC_045512.2',
  'OP733821.1',
  'OK341237.1',
  'OM251163.1',
  'OQ050563.1',
  'MW474188.1',
  'MW243586.1',
  'OL947440.1',
  'OQ253610.1'
]

In [1]:
# the ncbi page text for each strain was downloaded along with their fasta file
# use the files to gather data for each strain and store it to a .pickle file
for strain in downloaded_strains:

  # variables that change for each strain
  ncbi_page_path = './drive/MyDrive/Colab Notebooks/ncbi_pages/' + strain + '.txt'
  fasta_path = './drive/MyDrive/Colab Notebooks/fasta_files/' + strain + '.fasta'
  pickles_path = './drive/MyDrive/Colab Notebooks/pickles/'
  pickles_file = strain + '.pickle'

  # find lines of text with info about a feature
  # put list of lines for each feature into a list
  features = []
  with open(ncbi_page_path) as file:
    found = False
    feature_lines = []
    for line in file:
      line = line.strip()
      if found == True and line.startswith('/'):
        if line.startswith('/translation'):
          continue
        feature_lines.append(line)
        continue
      elif found == True:
        features.append(feature_lines)
        feature_lines = []
        found = False    
      for ftype in feature_types:
        if line.startswith(ftype):
          found = True    
          feature_lines.append(line)

  # go through gathered text lines about features
  # put useful data into a dict structure and put those in a list
  dna_features = []
  feature = {}
  for feature_lines in features:
    top_line = " ".join(feature_lines[0].split())
    line_parts = top_line.split(' ')
    if len(line_parts) != 2:
      continue
    feature_type, cords_text = line_parts
    # TODO: handle join cases: mat_peptide join(13442..13468,13468..16236)
    coords = cords_text.split('..')
    if len(coords) != 2:
      continue
    start, end = coords
    feature['type'] = feature_type
    # TODO: handle greater than cases: '>27123'
    if not start.isdigit() or not end.isdigit():
      continue
    feature['start'] = int(start)
    feature['end'] = int(end)
    for line in feature_lines[1:]:
      key, val = line.split('=')
      key = key[1:]
      val = val.strip('",')
      feature[key] = val
    dna_features.append(feature)
    feature = {}
  
  # use start and end porperties found in text lines
  # to find protein coding sequences from fasta file
  fasta = SeqIO.parse(fasta_path,"fasta")
  records = []
  for record in fasta.records:
    records.append(record)
  dna = records[0].seq
  for feature in dna_features:
    start = feature['start'] - 1
    end = feature['end']
    if end - start < gene_length_max:
      coding_region = dna[start:end]
      feature['nucleotides'] = coding_region
      feature['translation'] = coding_region.translate()

  # store the list of found features in a .pickle file
  with open(pickles_path + pickles_file, 'wb') as file:
    pickle.dump(dna_features, file)

NameError: ignored

In [None]:
os.listdir(pickles_path)

['OP733821.1.pickle',
 'OL947440.1.pickle',
 'OK341237.1.pickle',
 'MW474188.1.pickle',
 'MW243586.1.pickle',
 'OQ050563.1.pickle',
 'OQ253610.1.pickle',
 '.ipynb_checkpoints',
 'NC_045512.2.pickle',
 'OM251163.1.pickle']