In [1]:
"""
Using best performing model from 'JTE607_CNN-DLinstance-ratio_25nt_mixedsort-5percent.ipynb' on transcriptome list from
Yongsheng. The list contains 26 nt from the transcriptome, but model handles 25 nt, so predictions are done on both the first 25
nt and last 25 nt.

"""

import numpy as np
import pickle
import OSU

import tensorflow as tf
np.random.seed(1337)

from tensorflow.keras.models import Sequential, load_model


def one_hot_encode(sequences, max_seq_len=164, mask_val=-1, padding='left'):
    # Dictionary returning one-hot encoding of nucleotides. 
    nuc_d = {'a':[1,0,0,0],
             'c':[0,1,0,0],
             'g':[0,0,1,0],
             't':[0,0,0,1],
             'n':[0,0,0,0],
            'm':[mask_val,mask_val,mask_val,mask_val]}
    
    # Creat empty matrix
    one_hot_seqs = np.ones([len(sequences), max_seq_len, 4])*mask_val
    
    # Iterate through sequences and one-hot encode
    for i, seq in enumerate(sequences):
        # Truncate
        seq = seq[:max_seq_len].lower()
        # Convert to array
        one_hot_seq = np.array([nuc_d[x] for x in seq])
        # Append to matrix
        if padding=='left':
            one_hot_seqs[i, :len(seq), :] = one_hot_seq
        elif padding=='right':
            one_hot_seqs[i, -len(seq):, :] = one_hot_seq
        else:
            ValueError(f'padding {padding} not recognized')
            
    return one_hot_seqs


In [2]:
"""
Set paths
"""
transcriptome_26nt_file = "/JTE-607/Analysis/CNN_predictions_native/sequences/2022_10_28/Gencode_only_26nt_shifted.fa"
predictions_output_dir = OSU.create_directory("/JTE-607/Analysis/CNN_predictions_native/CNN_25nt_logratio_mixedsort_4120_batchnorm-07_25_2022-6epoch_trial6/2022_10_28/")

models_dir = "/JTE-607/Analysis/parsed_L3_input_RNA_clusterPASRandom_bbmerge_xloose/parsed_L3_cleaved_RNA_multimapping_mincov1_preload_bbmerge_xloose_H1shortN4indel/collapsed/models_25nt_logratio_mixedsort_4120_batchnorm-07_25_2022-6epoch_x10/"


In [3]:
# Load best performing model of all data

all_doses_trial_num = 6
model_to_use = load_model("%s/JTE607_CNN_25nt_6epoch_4col_model_%s.hdf5" % (models_dir, all_doses_trial_num))


[2022-10-28 22:32:46.447 ip-172-31-11-19.us-west-1.compute.internal:27352 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-10-28 22:32:46.469 ip-172-31-11-19.us-west-1.compute.internal:27352 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


In [4]:
"""
Load transcriptome 26 nt sequences and split into lists with first or last 25 nt out of the 26 nt.
"""

first_25nt_loc_seq_pred_list = []
last_25nt_loc_seq_pred_list = []

curr = 0

with open(transcriptome_26nt_file, "r") as f:
    for line in f:
        curr += 1
        if curr % 5000 == 0:
            print(curr)
        line = line.strip()
        if line[0] == ">":
            curr_location = line[1:]
        else:  # sequence found
            # record only 12.5 uM / DMSO prediction
            first_25nt_pred = model_to_use.predict(one_hot_encode([line[:25]], max_seq_len=25))
            first_25nt_loc_seq_pred_list.append((curr_location, line[:25], first_25nt_pred[0][2]))
            last_25nt_pred = model_to_use.predict(one_hot_encode([line[-25:]], max_seq_len=25))
            last_25nt_loc_seq_pred_list.append((curr_location, line[-25:], last_25nt_pred[0][2]))
        if curr == 1:
            print("passed first")
            
print("len(first_25nt_loc_seq_pred_list) = ", len(first_25nt_loc_seq_pred_list))
print("len(last_25nt_loc_seq_pred_list) = ", len(last_25nt_loc_seq_pred_list))
print(first_25nt_loc_seq_pred_list[:5])
print(last_25nt_loc_seq_pred_list[:5])

passed first
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
len(first_25nt_loc_seq_pred_list) =  95199
len(last_25nt_loc_seq_pred_list) =  95199
[('chr1:14395-14421(-)', 'TAAAGAACTGAGCAGAAACCAACAG', -0.23042855), ('chr1:14399-14425(+)', 'TTGGTTTCTGCTCAGTTCTTTATTG', -0.69821537), ('chr1:20490-20516(-)', 'CCCTCTGGCCTTTAAAAATTGTCTG', 0.20700222), ('chr1:20545-20571(-)', 'AATGTTGGGCTTCACATTTGTTCCT', 0.4437983), ('chr1:24203-24229(-)', 'ACCGTGCACAACCATTGATTTGAGT', -0.29061952)]
[('chr1:14395-14421(-)', 'AAAGAACTGAGCAGAAACCAACAGT', -0.17156094), ('chr1:14399-14425(+)', 'TGGTTTCTGCTCAGTTCTTTATTGA', 0.28315383), ('chr1:20490-20516(-)', 'CCTCTGGCCTTTAAAAATTGTCTGA', 0.06910547), ('chr1:20545-20571(-)', 'ATGTTGGGCTTCACATTTGTTCCTT', 0.18640745), ('chr1:24203-24229(-)', 'CCGTGCACAACCATTGATTTGAGTG', 0.10

In [5]:
"""
Output the predictions
"""

with open(predictions_output_dir + "first_25nt_CNN_predictions.txt", "w") as f:
    f.write("Location\tSequence\tPredicted_12p5uM_DMSO_ratio\n")
    for var in first_25nt_loc_seq_pred_list:
        f.write("\t".join([str(i) for i in var]) + "\n")

with open(predictions_output_dir + "last_25nt_CNN_predictions.txt", "w") as f:
    f.write("Location\tSequence\tPredicted_12p5uM_DMSO_ratio\n")
    for var in last_25nt_loc_seq_pred_list:
        f.write("\t".join([str(i) for i in var]) + "\n")

In [6]:
pickle.dump(first_25nt_loc_seq_pred_list, open(predictions_output_dir + "first_25nt_loc_seq_pred_list.pickle", "wb"))
pickle.dump(last_25nt_loc_seq_pred_list, open(predictions_output_dir + "last_25nt_loc_seq_pred_list.pickle", "wb"))
