In [1]:
%matplotlib inline

In [2]:
from Bio import SeqIO
from Bio import SeqFeature
from Bio.SeqRecord import SeqRecord

import matplotlib.pyplot as plt

import Levenshtein
import numpy as np

In [None]:
# Parse every record in the GenBank file, keep the first one, and display it.
genbank_records = SeqIO.parse('../Data/Genomes/fa1090.gb', 'genbank')
genome_old = list(genbank_records)[0]
genome_old

In [None]:
# Start a new record from the sequence of the original genome alone; only the
# sequence is passed to the constructor, so features are added separately.
genome_new = SeqRecord(seq=genome_old.seq)
genome_new

In [None]:
# Record the (start, end) span of every CDS once, so each candidate feature can
# be checked with a set lookup instead of re-scanning the full feature list.
cds_spans = {
    (feature.location.start, feature.location.end)
    for feature in genome_old.features
    if feature.type == 'CDS'
}

# Keep tRNA / rRNA / gene / source features only when no CDS has exactly the
# same start and end. Strand is not part of the comparison.
new_features = [
    feature
    for feature in genome_old.features
    if feature.type in ('tRNA', 'rRNA', 'gene', 'source')
    and (feature.location.start, feature.location.end) not in cds_spans
]
print(len(new_features), len(genome_old.features))

In [None]:
# Give the record its own copy of the list. Features appended to
# genome_new.features later then leave new_features untouched, and re-running
# this cell resets the record to the filtered feature set.
genome_new.features = list(new_features)

In [None]:
# Read the candidate protein records from FASTA, then print how many were
# loaded, a separator line, and the first record.
protein_records = SeqIO.parse('../Data/fa1090_new_annotations.fasta', "fasta")
potential_proteins = list(protein_records)
print(len(potential_proteins), '####', sep='\n')
print(potential_proteins[0])

In [None]:
# Six-frame translation of the genome, keyed by frame name.
# Plus frames skip 0, 1 or 2 leading nucleotides. Minus frames drop 0, 1 or 2
# trailing nucleotides before reverse-complementing, which is the same as
# skipping that many leading nucleotides of the reverse strand.
frame_dictionary = {
    'frame_1_plus': genome_new.seq.translate(),
    'frame_2_plus': genome_new.seq[1:].translate(),
    'frame_3_plus': genome_new.seq[2:].translate(),
    'frame_1_minus': genome_new.seq.reverse_complement().translate(),
    'frame_2_minus': genome_new.seq[:-1].reverse_complement().translate(),
    'frame_3_minus': genome_new.seq[:-2].reverse_complement().translate(),
}

In [None]:
def frame_hit_to_coordinates(frame, position_of_hit, length_of_search_seq, genome_seq_length):
    """Convert a hit in a translated frame into forward-strand genome coordinates.

    Parameters:
        frame: key of ``frame_dictionary`` (``'frame_1_plus'`` ... ``'frame_3_minus'``).
        position_of_hit: 0-based index of the first residue of the hit within
            the translated frame.
        length_of_search_seq: number of residues in the query protein.
        genome_seq_length: genome length in nucleotides.

    Returns:
        ``(start, end, strand)`` as plain ints. The span is three nucleotides
        longer than the protein, so it also covers the codon after the match.

    Raises:
        ValueError: if ``frame`` is not one of the six known frame names.
    """
    # frame name -> (nucleotides skipped before the first codon, strand)
    frame_offsets = {
        'frame_1_plus': (0, 1),
        'frame_2_plus': (1, 1),
        'frame_3_plus': (2, 1),
        'frame_1_minus': (0, -1),
        'frame_2_minus': (1, -1),
        'frame_3_minus': (2, -1),
    }
    if frame not in frame_offsets:
        raise ValueError('Unknown reading frame: {!r}'.format(frame))
    offset, strand = frame_offsets[frame]
    span = int(length_of_search_seq) * 3 + 3
    first_base = int(position_of_hit) * 3 + offset
    if strand == 1:
        return first_base, first_base + span, strand
    # Minus-strand positions are counted from the end of the forward sequence.
    end = int(genome_seq_length) - first_base
    return end - span, end, strand


problem_children = []
genome_seq_length = len(genome_new.seq)
acceptable_mismatches = 2

# Convert each translated frame to a plain string once; doing this inside the
# window loop would rebuild the whole frame for every position.
frame_strings = {frame: str(translation) for frame, translation in frame_dictionary.items()}

for numb, protein in enumerate(potential_proteins):
    print(numb, protein.id)
    protein_seq = str(protein.seq)
    length_of_search_seq = len(protein_seq)

    # For every frame, record (frame, position of best window, mismatches).
    found_stats = []
    for frame, frame_seq in frame_strings.items():
        # The range stops one window short of the end of the frame, so the three
        # extra nucleotides added to each feature still fall inside the genome.
        tempy_list = [
            Levenshtein.hamming(frame_seq[position:position + length_of_search_seq], protein_seq)
            for position in range(len(frame_seq) - length_of_search_seq)
        ]
        if not tempy_list:
            # The protein does not fit in this frame; nothing to score.
            continue
        best_position = int(np.argmin(tempy_list))
        found_stats.append((frame, best_position, tempy_list[best_position]))

    if not found_stats or min(hit for _, _, hit in found_stats) > acceptable_mismatches:
        problem_children.append(protein)
        continue

    for frame, position_of_hit, hit in found_stats:
        if hit > acceptable_mismatches:
            continue
        my_start_pos, my_end_pos, my_feature_strand = frame_hit_to_coordinates(
            frame, position_of_hit, length_of_search_seq, genome_seq_length)

        # Add a 'gene' and a 'CDS' feature for the hit. Each gets its own
        # location and qualifiers so editing one later does not alter the other.
        for feature_type in ('gene', 'CDS'):
            my_feature_location = SeqFeature.FeatureLocation(
                my_start_pos, my_end_pos, strand=my_feature_strand)
            my_feature_qualifiers = {'codon_start': ['1'], 'locus_tag': [protein.id],
                                     'note': ['Exact hit or close to it'],
                                     'product': [protein.description],
                                     'transl_table': ['11']}
            genome_new.features.append(
                SeqFeature.SeqFeature(my_feature_location, type=feature_type,
                                      qualifiers=my_feature_qualifiers))

In [None]:
# Counts: proteins without an acceptable match, features on the new record,
# features on the original record.
feature_summary = (len(problem_children), len(genome_new.features), len(genome_old.features))
print(*feature_summary)

In [None]:
# Write the record in GenBank format. The context manager closes the file even
# if SeqIO.write raises part-way through.
with open("temp_new_fa1090.gb", "w") as output_handle:
    SeqIO.write(genome_new, output_handle, "genbank")

In [None]:
def frame_hit_to_coordinates(frame, position_of_hit, length_of_search_seq, genome_seq_length):
    """Convert a hit in a translated frame into forward-strand genome coordinates.

    Parameters:
        frame: key of ``frame_dictionary`` (``'frame_1_plus'`` ... ``'frame_3_minus'``).
        position_of_hit: 0-based index of the first residue of the hit within
            the translated frame.
        length_of_search_seq: number of residues in the query protein.
        genome_seq_length: genome length in nucleotides.

    Returns:
        ``(start, end, strand)`` as plain ints. The span is three nucleotides
        longer than the protein, so it also covers the codon after the match.

    Raises:
        ValueError: if ``frame`` is not one of the six known frame names.
    """
    # frame name -> (nucleotides skipped before the first codon, strand)
    frame_offsets = {
        'frame_1_plus': (0, 1),
        'frame_2_plus': (1, 1),
        'frame_3_plus': (2, 1),
        'frame_1_minus': (0, -1),
        'frame_2_minus': (1, -1),
        'frame_3_minus': (2, -1),
    }
    if frame not in frame_offsets:
        raise ValueError('Unknown reading frame: {!r}'.format(frame))
    offset, strand = frame_offsets[frame]
    span = int(length_of_search_seq) * 3 + 3
    first_base = int(position_of_hit) * 3 + offset
    if strand == 1:
        return first_base, first_base + span, strand
    # Minus-strand positions are counted from the end of the forward sequence.
    end = int(genome_seq_length) - first_base
    return end - span, end, strand


genome_seq_length = len(genome_new.seq)
acceptable_mismatches = 2

still_fucked = []

# Convert each translated frame to a plain string once; doing this inside the
# window loop would rebuild the whole frame for every position.
frame_strings = {frame: str(translation) for frame, translation in frame_dictionary.items()}

for numb, protein in enumerate(problem_children):
    print(numb, protein.id)
    protein_seq = str(protein.seq)
    length_of_search_seq = len(protein_seq)
    half_length = length_of_search_seq // 2
    first_half_query = protein_seq[:half_length]
    second_half_query = protein_seq[half_length:]

    # Per frame: (frame, start of the best full-length window, mismatches),
    # scored separately on the first and on the second half of the protein.
    found_stats_first_half = []
    found_stats_second_half = []
    for frame, frame_seq in frame_strings.items():
        tempy_list_first_half = []
        tempy_list_second_half = []
        # The range stops one window short of the end of the frame, so the three
        # extra nucleotides added to each feature still fall inside the genome.
        for position in range(len(frame_seq) - length_of_search_seq):
            split = position + half_length
            window_end = position + length_of_search_seq
            tempy_list_first_half.append(
                Levenshtein.hamming(frame_seq[position:split], first_half_query))
            tempy_list_second_half.append(
                Levenshtein.hamming(frame_seq[split:window_end], second_half_query))
        if not tempy_list_first_half:
            # The protein does not fit in this frame; nothing to score.
            continue
        best_first = int(np.argmin(tempy_list_first_half))
        best_second = int(np.argmin(tempy_list_second_half))
        found_stats_first_half.append((frame, best_first, tempy_list_first_half[best_first]))
        found_stats_second_half.append((frame, best_second, tempy_list_second_half[best_second]))

    # Both lists are filled together, so checking one for emptiness is enough.
    if not found_stats_first_half:
        still_fucked.append(protein)
        continue
    best_first_half = min(hit for _, _, hit in found_stats_first_half)
    best_second_half = min(hit for _, _, hit in found_stats_second_half)
    if min(best_first_half, best_second_half) > acceptable_mismatches:
        still_fucked.append(protein)
        continue

    # N-terminal hits are annotated first, then C-terminal hits. In both cases
    # position_of_hit is the start of the full-length window, so the feature
    # spans the whole protein length even though only one half matched.
    for found_stats, my_feature_notes in (
        (found_stats_first_half, 'Apparent frame shift, decent match only at N-terminus'),
        (found_stats_second_half, 'Apparent frame shift, decent match only at C-terminus'),
    ):
        for frame, position_of_hit, hit in found_stats:
            if hit > acceptable_mismatches:
                continue
            my_start_pos, my_end_pos, my_feature_strand = frame_hit_to_coordinates(
                frame, position_of_hit, length_of_search_seq, genome_seq_length)

            # Add a 'gene' and a 'CDS' feature for the hit. Each gets its own
            # location and qualifiers so editing one later does not alter the other.
            for feature_type in ('gene', 'CDS'):
                my_feature_location = SeqFeature.FeatureLocation(
                    my_start_pos, my_end_pos, strand=my_feature_strand)
                my_feature_qualifiers = {'codon_start': ['1'], 'locus_tag': [protein.id],
                                         'note': [my_feature_notes],
                                         'product': [protein.description],
                                         'transl_table': ['11']}
                genome_new.features.append(
                    SeqFeature.SeqFeature(my_feature_location, type=feature_type,
                                          qualifiers=my_feature_qualifiers))

In [None]:
# Counts: proteins still without an acceptable match, features on the new
# record, features on the original record.
remaining_summary = (len(still_fucked), len(genome_new.features), len(genome_old.features))
print(*remaining_summary)

In [None]:
# Overwrite the GenBank file with the updated record. The context manager
# closes the file even if SeqIO.write raises part-way through.
with open("temp_new_fa1090.gb", "w") as output_handle:
    SeqIO.write(genome_new, output_handle, "genbank")

# Annotations for sRNAs!

In [3]:
# Re-read the GenBank file, keep its first record, and display it.
srna_source_records = SeqIO.parse('../Data/Genomes/fa1090.gb', 'genbank')
genome_old = list(srna_source_records)[0]
genome_old

SeqRecord(seq=Seq('ATAAATTTTTGCACGGGTTGTGGATAAAATATCGGCGAGTCGGTATAATCGGTT...TGG', IUPACAmbiguousDNA()), id='NC_002946.2', name='NC_002946', description='Neisseria gonorrhoeae FA 1090 chromosome, complete genome.', dbxrefs=['BioProject:PRJNA57611'])

In [4]:
# Rebuild the working record from the sequence alone; id, name and description
# are left at the SeqRecord defaults.
genome_new = SeqRecord(seq=genome_old.seq)
genome_new

SeqRecord(seq=Seq('ATAAATTTTTGCACGGGTTGTGGATAAAATATCGGCGAGTCGGTATAATCGGTT...TGG', IUPACAmbiguousDNA()), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

# Scratch

In [46]:
import pandas as pd
import random

In [55]:
# Read a header-less spreadsheet and take its first column as the gene list.
# NOTE(review): this is an absolute path on one user's machine, so the cell
# will not run elsewhere until the file is moved next to the project data.
downregulated_genes_path = '/Users/adamhockenberry/Downloads/DownregulatedGenesforMeme.xlsx'
gene_df = pd.read_excel(downregulated_genes_path, header=None)
gene_list = [entry for entry in gene_df[0]]
len(gene_list)

10

In [56]:
genome_old = list(SeqIO.parse('../Data/Genomes/fa1090.gb', 'genbank'))[0]

# Alternative gene_list kept for reference: a random sample of locus tags.
# all_names = [feature.qualifiers['locus_tag'][0] for feature in genome_old.features if feature.type=='gene']
# gene_list = random.sample(all_names, 90)
# all_list = [i for i in all_names if i not in gene_list]

# Number of nucleotides taken immediately upstream of each gene.
upstream = 200

# Write the upstream region of every gene whose locus tag is NOT in gene_list,
# one FASTA record per gene, oriented 5'->3' on the gene's own strand.
with open('./5utrs_all_sans_down.fasta', 'w') as outfile:
    for feature in genome_old.features:
        if feature.type != 'gene':
            continue
        locus_tag = feature.qualifiers['locus_tag'][0]
        if locus_tag in gene_list:
            continue

        strand = feature.location.strand
        if strand == 1:
            if feature.location.start < upstream:
                # Not enough sequence before the gene. A negative slice start
                # would index from the far end of the genome instead.
                continue
            temp_seq = genome_old.seq[feature.location.start-upstream:feature.location.start]
        elif strand == -1:
            temp_seq = genome_old.seq[feature.location.end:feature.location.end+upstream].reverse_complement()
        else:
            # No single definite strand. Skip the gene rather than write the
            # sequence left over from the previous iteration under its name.
            continue

        # Regions cut short by the end of the genome are dropped.
        if len(temp_seq) == upstream:
            outfile.write('>{}({})\n'.format(locus_tag, strand))
            outfile.write('{}\n'.format(str(temp_seq)))

