In [3]:
import argparse
from Bio import SeqIO
import pandas as pd
# Running model
from sklearn.linear_model import LogisticRegression
# Loading model
import pickle
# Required for listing files
from os import listdir
from os.path import isfile, join
import os
# Loading/running model:
import tensorflow as tf

import scipy
from scipy import stats

import numpy as np

# Making the input robust to various 'boolean' inputs:
def str2bool(v):
    """Interpret a command-line style value as a boolean.

    Real booleans are returned unchanged. Any other value is lower-cased
    and compared against the accepted spellings listed below.

    Args:
        v: a bool, or a string such as 'yes', 'True', 'n' or '0'.

    Returns:
        bool: the interpreted value.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# Converting the ab1 to the fasta:
def abi_to_seq(input_ab1_file):
    """Return the base calls stored in an ABI trace file.

    Args:
        input_ab1_file: path or handle accepted by ``SeqIO.read(..., 'abi')``.

    Returns:
        The value of the 'PBAS1' entry of the record's raw ABIF annotations.
    """
    # Parse the trace and pull the base-call field out of the raw annotations.
    record = SeqIO.read(input_ab1_file, 'abi')
    raw_fields = record.annotations['abif_raw']
    return raw_fields['PBAS1']
# TODO: consider combining the following two functions (listing_ab1_files and listing_temp_files).
# Listing all ab1 files in directory
def listing_ab1_files(input_dir):
    """List the ab1 files found directly inside ``input_dir``.

    Args:
        input_dir: directory to scan (with or without a trailing separator).

    Returns:
        tuple: ``(paths, names)`` where ``names`` are the bare file names
        containing '.ab1' and ``paths`` are the same names joined onto
        ``input_dir``.
    """
    # NOTE: this is a substring match, so names such as 'x.temp.ab1.conv.fa'
    # are also picked up; kept as-is for backward compatibility.
    onlyfiles = [f for f in listdir(input_dir)
                 if '.ab1' in f and isfile(join(input_dir, f))]
    # join() inserts the path separator only when it is missing, so this is
    # identical to plain concatenation when input_dir already ends with one.
    outputlfiles = [join(input_dir, each_file) for each_file in onlyfiles]
    return outputlfiles, onlyfiles

# Listing all temp conversion files ('temp.ab1.conv.*') in directory
def listing_temp_files(input_dir):
    """List the temporary conversion files found directly inside ``input_dir``.

    Args:
        input_dir: directory to scan.

    Returns:
        list: bare file names (no directory prefix) that contain
        'temp.ab1.conv.'.
    """
    temp_names = []
    for entry in listdir(input_dir):
        # Only regular files whose name carries the temp-conversion marker.
        if isfile(join(input_dir, entry)) and 'temp.ab1.conv.' in entry:
            temp_names.append(entry)
    return temp_names

# Converting sequence to fasta file:
def seq_to_fa(input_name, input_seq, sequence_name = None):
    """Write a sequence to a FASTA file named after ``input_name``.

    The output path is ``input_name`` with its last extension replaced by
    '.fa'.

    Args:
        input_name: source file name; also the default FASTA record name.
        input_seq: the sequence, as ``str`` or ASCII ``bytes``.
        sequence_name: optional record name for the header line.

    Returns:
        The (already closed) file object that was written; its ``.name``
        attribute holds the output path.
    """
    # Getting the sequence name for the fasta:
    if sequence_name is None:
        sequence_name = input_name

    # Base calls may arrive as bytes; decode them so the string
    # concatenation below does not raise TypeError.
    if isinstance(input_seq, (bytes, bytearray)):
        input_seq = input_seq.decode('ascii')

    # Generating the fasta file; the context manager guarantees the handle
    # is closed even if a write fails.
    final_filename = input_name.rsplit('.', 1)[0] + '.fa'
    with open(final_filename, 'w') as final_file:
        final_file.write('> %s\n' % sequence_name)
        final_file.write(input_seq + '\n')
    return final_file

# Converting ab1 file to prediction input:
def abi_to_df(input_seqio_record):
    """Build a per-base table of called letters and channel intensities.

    Args:
        input_seqio_record: despite the name, this is passed straight to
            ``SeqIO.read(..., 'abi')``, i.e. it is the ab1 file to read.

    Returns:
        pandas.DataFrame: one row per base location that has waveform data,
        with columns 'Letters', 'g_let', 'a_let', 't_let', 'c_let' and
        'index_plus_one'.
    """
    # Parse the trace once and keep a handle on the raw ABIF fields.
    record = SeqIO.read(input_seqio_record, 'abi')
    raw = record.annotations['abif_raw']

    # Called bases and the positions they were called at:
    letter_loc_df = pd.DataFrame()
    letter_loc_df['Locations'] = list(raw['PLOC1'])
    letter_loc_df['Letters'] = list(raw['PBAS1'])

    # Waveform data, one column per channel:
    peak_df = pd.DataFrame()
    for column, field in (('g_let', 'DATA9'), ('a_let', 'DATA10'),
                          ('t_let', 'DATA11'), ('c_let', 'DATA12')):
        peak_df[column] = list(raw[field])

    # Index the waveform by (position + 1) and the base calls by their
    # location so the two tables can be joined on index.
    # NOTE(review): the +1 offset pairs location p with waveform sample
    # p - 1 -- confirm this is the intended alignment.
    peak_df['index_plus_one'] = peak_df.index + 1
    peak_df.index = peak_df['index_plus_one']
    letter_loc_df.index = letter_loc_df['Locations']
    letter_loc_df = letter_loc_df.drop('Locations', axis=1)

    # Keep only the positions present in both tables:
    return letter_loc_df.join(peak_df, how='inner')

# Adding the previous and the following base to the df:
def surrounding_bases(input_df):
    """Attach the previous and next row's channel values to every row.

    Args:
        input_df: frame with 'a_let', 'c_let', 't_let' and 'g_let' columns.

    Returns:
        pandas.DataFrame: ``input_df`` plus 'prev_*' and 'next_*' columns.
        Rows lacking a complete previous or next row (the first and last
        ones) are dropped by the inner join.
    """
    prev_names = {'a_let':'prev_a','c_let':'prev_c','t_let':'prev_t','g_let':'prev_g'}
    next_names = {'a_let':'next_a','c_let':'next_c','t_let':'next_t','g_let':'next_g'}

    # Shifting down by one puts the previous row's values on each row;
    # shifting up by one does the same for the following row.
    before = input_df.shift(1).dropna().rename(prev_names, axis=1)
    after = input_df.shift(-1).dropna().rename(next_names, axis=1)

    return pd.concat([input_df, before, after], axis=1, join='inner')

def ab1_to_predicted_sequence(input_ab1_file, model, actual_ab1=True):
    """Predict a base sequence from channel intensities.

    Args:
        input_ab1_file: an ab1 file to parse with ``abi_to_df`` or, when
            ``actual_ab1`` is not True, an already-built frame holding the
            'a_let', 'c_let', 't_let' and 'g_let' columns.
        model: object exposing ``predict(X=...)`` that returns one letter
            per row.
        actual_ab1: set to True when ``input_ab1_file`` must be parsed first.

    Returns:
        str: the predicted letters joined in row order.
    """
    # Either parse the trace or take the frame that was handed in.
    test_df = abi_to_df(input_ab1_file) if actual_ab1 == True else input_ab1_file

    # Features are the four channels of each row plus its neighbours'.
    channel_order = ['a_let', 'c_let', 't_let', 'g_let']
    features = surrounding_bases(test_df[channel_order])

    # One predicted letter per feature row.
    predictions = pd.DataFrame(model.predict(X=features),
                               columns=['Prediction'])
    return ''.join(list(predictions['Prediction']))

# This combines a peak df with a full record
def peak_calling_df(input_df, input_seqio_record):
    """Label every waveform sample around the called bases as peak / no peak.

    The caller's ``input_df`` is left untouched, so calling this function
    (or re-running a notebook cell that calls it) repeatedly gives the same
    result.

    Args:
        input_df: frame whose integer index holds the called-base positions.
        input_seqio_record: parsed record exposing
            ``annotations['abif_raw']`` with the 'DATA9'..'DATA12' channels.

    Returns:
        pandas.DataFrame: indexed by sample position, covering 5 samples
        before the first peak to 5 samples after the last one, with columns
        'peak_no_peak' (1.0 at peaks, 0.0 elsewhere), 'g_let', 'a_let',
        't_let' and 'c_let'.

    Raises:
        IndexError: if ``input_df`` has no rows.
    """
    # Peak positions, shifted by one as in the original implementation.
    # NOTE(review): the original author was unsure whether this +1 offset
    # should stay -- confirm against how input_df's index was built.
    peak_positions = input_df.index + 1
    first_val = peak_positions[0] - 5
    last_val = peak_positions[-1] + 5
    peaks = pd.DataFrame({'peak_no_peak': [1] * len(peak_positions)},
                         index=peak_positions)

    # Different df with all the waveform data:
    raw = input_seqio_record.annotations['abif_raw']
    peak_val = pd.DataFrame()
    for column, field in (('g_let', 'DATA9'), ('a_let', 'DATA10'),
                          ('t_let', 'DATA11'), ('c_let', 'DATA12')):
        peak_val[column] = list(raw[field])

    # Label-based slice: both ends of the window are inclusive.
    peak_val = peak_val.loc[first_val:last_val]
    fin_df = peaks.merge(peak_val, how='outer', left_index=True, right_index=True)

    # Samples without a called base come out of the outer merge as NaN;
    # turn everything that is not a peak into 0 without assigning on a slice.
    fin_df['peak_no_peak'] = (fin_df['peak_no_peak'] == 1).astype(float)
    return fin_df.sort_index()

def slope(inp_df):
    """Append first-difference ("slope") features for the four channels.

    Args:
        inp_df: frame with 'g_let', 'a_let', 't_let', 'c_let' and
            'peak_no_peak' columns.

    Returns:
        pandas.DataFrame: the four channels, then 'slope_*_after'
        (row minus previous row), then 'slope_*_before' (row minus next
        row), then 'peak_no_peak' as the last column.
    """
    # NOTE(review): the '_after' columns hold the backward difference and
    # the '_before' columns the forward one. The names are kept exactly as
    # they are because downstream consumers rely on this column layout.
    signal = inp_df[['g_let', 'a_let', 't_let', 'c_let']]

    backward = signal.diff(1, axis=0)
    backward.columns = ['slope_g_after', 'slope_a_after', 'slope_t_after', 'slope_c_after']

    forward = signal.diff(-1, axis=0)
    forward.columns = ['slope_g_before', 'slope_a_before', 'slope_t_before', 'slope_c_before']

    return signal.join(backward).join(forward).join(inp_df[['peak_no_peak']])

def normalizing(inp_df):
    """Scale a frame by the trimmed mean of its peak heights.

    The scale is the 10%-trimmed mean of the tallest channel at every row
    flagged as a peak. Every column is divided by that scale except
    'peak_no_peak', which is carried over unchanged as integers.

    Args:
        inp_df: numeric frame with 'g_let', 'a_let', 't_let', 'c_let' and
            'peak_no_peak' columns.

    Returns:
        pandas.DataFrame: a new, normalised frame; ``inp_df`` is not modified.

    Raises:
        ValueError: if the trimmed mean is zero or NaN (for example when no
            row is flagged as a peak), since no meaningful scale exists.
    """
    all_peak_places = inp_df[inp_df['peak_no_peak'] == 1]
    all_max_peaks = list(
        all_peak_places[['g_let', 'a_let', 't_let', 'c_let']].max(axis=1))
    trimmed_mean = scipy.stats.trim_mean(all_max_peaks, proportiontocut=0.1)
    if np.isnan(trimmed_mean) or trimmed_mean == 0:
        raise ValueError('Cannot normalise: trimmed mean of peak heights is %r'
                         % trimmed_mean)

    # Take the labels before scaling. Restoring them with (x / t) * t is not
    # exact in floating point (1 / 49 * 49 < 1) and int truncation would then
    # silently turn a peak label of 1 into 0.
    labels = inp_df['peak_no_peak'].astype(int)
    normalized = inp_df / trimmed_mean
    normalized['peak_no_peak'] = labels
    return normalized

def reshaping_the_df(inp_df, first_dim, second_dim, third_dim):
    """Split a feature frame into a 3-D feature array and a label vector.

    Args:
        inp_df: frame with a 'peak_no_peak' column; its LAST column is
            dropped to form the features.
        first_dim, second_dim, third_dim: target shape of the feature array.

    Returns:
        tuple: ``(x, y)`` where ``x`` has shape
        ``(first_dim, second_dim, third_dim)`` and ``y`` is the
        'peak_no_peak' column as an array.
    """
    labels = np.array(inp_df['peak_no_peak'])
    # Everything except the final column counts as a feature.
    feature_frame = inp_df.iloc[:, :-1]
    target_shape = (first_dim, second_dim, third_dim)
    features = feature_frame.values.reshape(target_shape)
    return features, labels

In [2]:
# Load the Keras model stored in 'model.h5'.
# NOTE(review): the same load is repeated in the cell labelled In [4].
peak_model = tf.keras.models.load_model('model.h5')

W1006 00:28:11.386250 4580853184 deprecation.py:506] From /Users/alexandredaly/anaconda/envs/python3/lib/python3.5/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1006 00:28:11.387537 4580853184 deprecation.py:506] From /Users/alexandredaly/anaconda/envs/python3/lib/python3.5/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1006 00:28:11.390460 4580853184 deprecation.py:506] From /Users/alexandredaly/anaconda/envs/python3/lib/python3.5/site-packages/tensorflow/python/ops/init_

In [4]:
# Model that flags which waveform samples are peaks.
peak_model = tf.keras.models.load_model('model.h5')
# Model that assigns a letter to each peak.
# NOTE: unpickling executes arbitrary code, so only load a .sav file that
# comes from a trusted source. The context manager closes the file handle
# as soon as the model is loaded.
with open('log_reg_default_million.sav', 'rb') as model_file:
    nucleotide_model = pickle.load(model_file)


In [6]:
# Parse the ABI trace once; the parsed record is later handed to
# peak_calling_df for its raw channel data.
current_record = SeqIO.read('peak2_clone2_topo_maxi.SP6.ab1', 'abi')


In [8]:
# Per-base table (called letter plus the four channel values) built from
# the same trace file.
current_training_df = abi_to_df('peak2_clone2_topo_maxi.SP6.ab1')

In [10]:
# Keep a copy of the called letters under a second column name.
current_training_df['saving_og'] = current_training_df['Letters']

In [11]:
# Feature pipeline: label peak positions, normalise the intensities, then
# add the slope features.
fin_training = slope(
    normalizing(
        peak_calling_df(current_training_df, current_record)
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return np.mean(atmp[sl], axis=axis)


In [12]:
# Inspect the feature table produced by the pipeline above.
fin_training

Unnamed: 0,g_let,a_let,t_let,c_let,slope_g_after,slope_a_after,slope_t_after,slope_c_after,slope_g_before,slope_a_before,slope_t_before,slope_c_before,peak_no_peak
60,0.125532,0.282946,0.077710,0.231139,,,,,-0.007970,-0.037859,-0.003985,-0.027896,0
61,0.133503,0.320805,0.081696,0.259035,0.007970,0.037859,0.003985,0.027896,-0.007970,-0.031881,-0.005978,-0.027896,0
62,0.141473,0.352686,0.087673,0.286931,0.007970,0.031881,0.005978,0.027896,-0.009963,-0.029889,-0.005978,-0.029889,0
63,0.151436,0.382575,0.093651,0.316820,0.009963,0.029889,0.005978,0.029889,-0.009963,-0.025903,-0.007970,-0.027896,0
64,0.161399,0.408478,0.101621,0.344716,0.009963,0.025903,0.007970,0.027896,-0.011955,-0.025903,-0.005978,-0.027896,0
65,0.173354,0.434381,0.107599,0.372612,0.011955,0.025903,0.005978,0.027896,-0.009963,-0.021918,-0.003985,-0.027896,1
66,0.183317,0.456300,0.111584,0.400508,0.009963,0.021918,0.003985,0.027896,-0.011955,-0.023911,-0.003985,-0.029889,0
67,0.195272,0.480211,0.115569,0.430396,0.011955,0.023911,0.003985,0.029889,-0.009963,-0.019926,-0.005978,-0.033874,0
68,0.205235,0.500136,0.121547,0.464270,0.009963,0.019926,0.005978,0.033874,-0.013948,-0.025903,-0.007970,-0.043837,0
69,0.219183,0.526040,0.129517,0.508107,0.013948,0.025903,0.007970,0.043837,-0.017933,-0.025903,-0.009963,-0.057785,0


In [13]:
# One sample per row, each shaped (1, 12): the 4 channel values plus the
# 8 slope columns. reshaping_the_df drops the LAST column, so this cell
# must run before the 'predicted_peaks' column is appended to fin_training.
x_pred_peaks, y_pred_peaks = reshaping_the_df(
    fin_training, len(fin_training), 1, 12)

In [14]:
# Predicting
final_predicted_y = peak_model.predict(x_pred_peaks)
# Getting the predicted values: a row is called a peak (1) when the second
# entry of its prediction exceeds 0.5, otherwise it is 0.
# NOTE(review): this assumes index 1 holds the "peak" score -- confirm
# against how the model was trained.
predicted_val = [1 if item[1] > 0.5 else 0 for item in final_predicted_y]
# Appending them to the dataframe:
fin_training['predicted_peaks'] = predicted_val

In [16]:
# Show only the rows the peak model labelled as peaks.
fin_training[fin_training['predicted_peaks']==1]

Unnamed: 0,g_let,a_let,t_let,c_let,slope_g_after,slope_a_after,slope_t_after,slope_c_after,slope_g_before,slope_a_before,slope_t_before,slope_c_before,peak_no_peak,predicted_peaks
180,0.063762,0.181324,0.958429,0.003985,0.033874,-0.003985,0.001993,0.000000,-0.055792,0.007970,0.007970,-0.001993,0,1
192,1.452588,0.067748,0.442352,0.000000,0.009963,-0.007970,-0.087673,0.000000,0.035866,0.005978,0.079703,0.000000,1,1
204,0.645594,0.157413,0.097636,0.000000,0.003985,0.005978,0.007970,0.000000,0.009963,-0.005978,-0.007970,0.000000,0,1
219,0.000000,0.300879,0.599765,0.029889,0.000000,0.049814,0.033874,0.027896,0.000000,-0.061770,0.005978,-0.047822,1,1
234,0.000000,0.047822,0.157413,1.372885,0.000000,-0.029889,0.000000,0.005978,0.000000,0.021918,0.007970,0.011955,0,1
247,0.637624,0.037859,0.023911,0.123540,0.017933,0.009963,0.000000,-0.041844,0.011955,-0.013948,0.005978,0.021918,0,1
261,0.276968,0.946473,0.000000,0.115569,0.091658,0.019926,0.000000,-0.009963,-0.097636,0.025903,0.000000,0.000000,1,1
280,0.000000,0.000000,0.000000,1.030162,0.000000,0.000000,0.000000,0.039852,0.000000,0.000000,0.000000,0.015941,0,1
309,0.930533,0.000000,0.000000,0.087673,0.033874,0.000000,0.000000,-0.099629,0.001993,0.000000,0.000000,0.061770,0,1
322,0.795038,0.000000,0.000000,0.000000,0.021918,0.000000,0.000000,0.000000,0.001993,0.000000,0.000000,0.000000,0,1


In [33]:
# Keep only the predicted-peak rows; they are the input to the nucleotide
# model in the next step.
just_peaks = fin_training[fin_training['predicted_peaks']==1]


In [34]:
# NOTE(review): ab1_to_predicted_sequence_fixed is defined in the cell
# labelled In [32], which appears further down in this file; that cell has
# to be run first for this call to work on a fresh kernel.
sequence = ab1_to_predicted_sequence_fixed(just_peaks, nucleotide_model, actual_ab1=False)

In [35]:
# Display the predicted base sequence.
sequence

'GGTCGACGGATCCCTAGTAACGGCCGCCAGTGTGCTGGAATTCGCCCTTGGTGCTTGCTCTATGGATTTTAATAATTGTTAGATTTATGAAACATGATGTCTATAAAATAATTTTACCAAAGAATATATTTAAGTCTATTTATAGAATGATTAATTATATGGTAATTTTCTATATTTATACAGTTATGGATATGCCTGAATAGTTAGGTTAATGTTTTACAATATTAGCTATTTAAGTGCTTTTATAATACTGATCTGTTGAAATTAATTGACGTTTTTTATTTCTTTCAGGAACTCTGATTCATAACTGAAATGCTGTTAATGCATATTTTTCTTTGCATTTTCTGTTTGTAATCCACAGCTTGATCTGTCACTAATGTTCTTAATGTACTCTACAGTTTGAATTCGGTTCCTGACTATGTAAAATATCTCTAACCTTTGTACAAGCAATTTGCCTTTGACATCATTAAATTATATTATATGAAGTAGAAGATGTTTATTATCAGCTAAGTAATAAAGTAGATATTTGTAATAGGACATGCATTAGACCAATCCAAAAGAATCTAAAATGTTTCACTTTATTGAACCTTTTGATGCAATAGCCTATCGCTGTTTCTTAGTGCTTCCTAAAACAGTTTTTTATCGCTGAAAGATGGTCAGATAGAAGCCTGATACTTCAATCTTATTAATTTCAAAGTGGATCTTAACATAGCTGTTCCTGTTAACCTACTCATTTTTTCGAAAACTGAATAAGAGTGTACAGTCCTCATAATAATGAATCGATTCTCACTATCTCAAATTATTTTTTTAACGCATCTTAACTTTGTTCTGCTTGCCACCTCCAGCCTTTAGCCAAAACT'

In [32]:
def ab1_to_predicted_sequence_fixed(input_ab1_file, model, actual_ab1=True,
                                    scale=1000):
    """Predict a base sequence from channel intensities, rescaling them first.

    Same as ``ab1_to_predicted_sequence`` except that the four channel
    columns are multiplied by ``scale`` before the features are built.

    Args:
        input_ab1_file: an ab1 file to parse with ``abi_to_df`` or, when
            ``actual_ab1`` is false, an already-built frame holding the
            'a_let', 'c_let', 't_let' and 'g_let' columns.
        model: object exposing ``predict(X=...)`` that returns one letter
            per row.
        actual_ab1: whether ``input_ab1_file`` must be parsed first.
        scale: factor applied to the channel values. Defaults to 1000, the
            value that used to be hard-coded here.
            NOTE(review): presumably this brings normalised intensities
            back toward the raw range the model expects -- confirm.

    Returns:
        str: the predicted letters joined in row order.
    """
    # Loading in and parsing input df:
    if actual_ab1:
        test_df = abi_to_df(input_ab1_file)
    else:
        test_df = input_ab1_file
    test_letter_value_df = test_df[['a_let', 'c_let', 't_let', 'g_let']] * scale
    test_full_info_df = surrounding_bases(test_letter_value_df)

    # Using model to predict sequence:
    predicted_probs_df = pd.DataFrame(model.predict(X=test_full_info_df),
                                      columns=['Prediction'])

    # Acquiring and returning sequence:
    return ''.join(list(predicted_probs_df['Prediction']))

In [30]:
# Show the un-normalised per-base table for the same trace file.
abi_to_df('peak2_clone2_topo_maxi.SP6.ab1')

Unnamed: 0,Letters,g_let,a_let,t_let,c_let,index_plus_one
64,N,76,192,47,159,64
77,N,252,401,147,571,77
89,C,450,826,372,993,89
98,A,332,964,549,660,98
110,G,644,1018,920,285,110
125,A,129,1287,241,448,125
137,C,143,830,462,436,137
150,A,317,1045,336,24,150
164,C,121,214,54,410,164
177,T,0,99,416,15,177


In [31]:
# Show the predicted-peak rows that were fed to the nucleotide model.
just_peaks

Unnamed: 0,g_let,a_let,t_let,c_let,slope_g_after,slope_a_after,slope_t_after,slope_c_after,slope_g_before,slope_a_before,slope_t_before,slope_c_before,peak_no_peak,predicted_peaks
180,0.063762,0.181324,0.958429,0.003985,0.033874,-0.003985,0.001993,0.000000,-0.055792,0.007970,0.007970,-0.001993,0,1
192,1.452588,0.067748,0.442352,0.000000,0.009963,-0.007970,-0.087673,0.000000,0.035866,0.005978,0.079703,0.000000,1,1
204,0.645594,0.157413,0.097636,0.000000,0.003985,0.005978,0.007970,0.000000,0.009963,-0.005978,-0.007970,0.000000,0,1
219,0.000000,0.300879,0.599765,0.029889,0.000000,0.049814,0.033874,0.027896,0.000000,-0.061770,0.005978,-0.047822,1,1
234,0.000000,0.047822,0.157413,1.372885,0.000000,-0.029889,0.000000,0.005978,0.000000,0.021918,0.007970,0.011955,0,1
247,0.637624,0.037859,0.023911,0.123540,0.017933,0.009963,0.000000,-0.041844,0.011955,-0.013948,0.005978,0.021918,0,1
261,0.276968,0.946473,0.000000,0.115569,0.091658,0.019926,0.000000,-0.009963,-0.097636,0.025903,0.000000,0.000000,1,1
280,0.000000,0.000000,0.000000,1.030162,0.000000,0.000000,0.000000,0.039852,0.000000,0.000000,0.000000,0.015941,0,1
309,0.930533,0.000000,0.000000,0.087673,0.033874,0.000000,0.000000,-0.099629,0.001993,0.000000,0.000000,0.061770,0,1
322,0.795038,0.000000,0.000000,0.000000,0.021918,0.000000,0.000000,0.000000,0.001993,0.000000,0.000000,0.000000,0,1
