In [3]:
# Get the predicted top 5 expressing genes from each species, including yeast 
## 5 genes * 53 species = 265 genes
# Create promoter/terminator combinations, where 
# "promoter" = promoter + 5'UTR; "terminator" = 3'UTR + terminator
## i.e. 265 * 265 = 70,225 combinations

import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Per-species prediction arrays (.npy) ---
fpath_predictions = "./predictions/"
fpath_predictions_yeast = "./predictions_YEAST/Saccharomyces_cerevisiae_R64-1-1.Sacce1_MODIFIED.npy"
output_directory = "./figures/"

# --- Per-species FEATURE ARRAYS (.npz) ---
fpath_features = "./new-results/inputs/npz_files/"
yeast_npz = "./Saccharomyces_cerevisiae_R64-1-1.Sacce1_MODIFIED.npz"

# Prediction files in sorted order, with the yeast file placed last
fpath_list = sorted(glob.glob(fpath_predictions + "*")) + [fpath_predictions_yeast]

# Feature files built the same way (sorted, yeast last)
fpath_features_list = [*sorted(glob.glob(fpath_features + "*")), yeast_npz]

In [4]:

# Derive each species' abbreviated name from its prediction filepath
species_names = []
for file in fpath_list:
    working_name = file
    # Strip the directory prefixes and the "_MODIFIED" tag, in this order
    for fragment in ("./predictions/", "./predictions_YEAST/", "_MODIFIED"):
        working_name = working_name.replace(fragment, "")
    # The abbreviated name is the second period-separated field
    species_names.append(working_name.split(".")[1])
del working_name, fragment

# One Series per prediction file, built from element 0 of every entry
series_list = [
    pd.Series([entry[0] for entry in np.load(path, allow_pickle=True)])
    for path in fpath_list
]

# Dataframe of predictions with one column per species (named by abbreviation)
dframe = pd.concat(series_list, axis=1)
dframe.columns = species_names

In [5]:
# Five largest predictions per species, kept as {species: Series}
top_five = {species: dframe[species].nlargest(n=5) for species in dframe.columns}

# new_dframe: one column per species holding the positions (within dframe's
# index) of that species' top 5 genes, in the order nlargest returned them
new_dframe = pd.DataFrame(
    {
        species: [dframe.index.get_loc(label) for label in top_rows.index]
        for species, top_rows in top_five.items()
    }
)
print(new_dframe)

   Aaoar1  Amnli1  Ampqui1  Ascim1  Ascni1  Aspalli1  Aspalbe1  Aspara19utr  \
0    2164     479     2717    1554    3076      4223      6080          304   
1    3847    4585     3618    2476    3766      5709       776         6063   
2    4965    2321     4237    4190    4283      2298       744          372   
3    3584    3201     2749    3807    2837      5650      2866          938   
4    3437    4790     6118    3595    3222      4864      3787         5140   

   Aspave1  Aspber1  ...  Spofi1  Ternu1  Terbo2  Tribi1  Tryvi1  Wesor1  \
0     2417     5285  ...     409    2605    6021     179    2401    5212   
1     4690     5508  ...     371    5766    4152    5340    4575    1819   
2     6205      148  ...    2797     655    1083    1098    4697    4518   
3      284      324  ...    3683    2429     866    3003     118    1007   
4     6166     6849  ...    1864    2948     787    4657    1767    4859   

   YarliYB567  YarliYB566  YarliYB420  Sacce1  
0        1647       

In [6]:

# Build lists of replacement terminator onehot sequences, and variables.
# Every selected gene of every species contributes one "terminator"
# (3'UTR + terminator), so both lists get one entry per gene in new_dframe.
TERMINATOR_LENGTH = 850  # last 850 NTs (i.e. 350 UTR3 and 500 term)
UTR3_LENGTH_IDX = 2  # position of length_UTR3 in the feature array
UTR3_GC_IDX = 4  # position of GC_UTR3 in the feature array

replace_term_hot = []
replace_term_var = []

n_species = len(new_dframe.columns)

for f, name in enumerate(new_dframe.columns):
    working_fpath = fpath_features_list[f]  # feature file, paired with the column by position
    working_genes = new_dframe[name].tolist()  # gene indices

    # The yeast file stores its arrays under "og_*" keys; every other file uses "x_*".
    if working_fpath == yeast_npz:
        hot_key, var_key = "og_hot", "og_var"
    else:
        hot_key, var_key = "x_hot", "x_var"

    # Each NpzFile[key] access re-reads the whole array from disk, so load both
    # arrays once per species (not once per gene) and close the file afterwards.
    # NOTE: allow_pickle=True can execute code stored in the file; only load trusted files.
    with np.load(working_fpath, allow_pickle=True) as working_npz:
        species_hot = working_npz[hot_key]
        species_var = working_npz[var_key]

    for i in working_genes:
        working_hot = species_hot[i]
        working_var = species_var[i]

        # Get the trailing terminator region. Copy it: a plain slice may be a view
        # that keeps the entire per-species array alive for as long as the list exists.
        term_hot = np.array(working_hot[-TERMINATOR_LENGTH:])
        # Get length_UTR3, GC_UTR3
        term_var = [working_var[UTR3_LENGTH_IDX], working_var[UTR3_GC_IDX]]

        replace_term_hot.append(term_hot)
        replace_term_var.append(term_var)

    print("Done: " + str(f + 1) + " of " + str(n_species) + " species")

print(len(replace_term_hot))

Done: 1 of 53 species
Done: 2 of 53 species
Done: 3 of 53 species
Done: 4 of 53 species
Done: 5 of 53 species
Done: 6 of 53 species
Done: 7 of 53 species
Done: 8 of 53 species
Done: 9 of 53 species
Done: 10 of 53 species
Done: 11 of 53 species
Done: 12 of 53 species
Done: 13 of 53 species
Done: 14 of 53 species
Done: 15 of 53 species
Done: 16 of 53 species
Done: 17 of 53 species
Done: 18 of 53 species
Done: 19 of 53 species
Done: 20 of 53 species
Done: 21 of 53 species
Done: 22 of 53 species
Done: 23 of 53 species
Done: 24 of 53 species
Done: 25 of 53 species
Done: 26 of 53 species
Done: 27 of 53 species
Done: 28 of 53 species
Done: 29 of 53 species
Done: 30 of 53 species
Done: 31 of 53 species
Done: 32 of 53 species
Done: 33 of 53 species
Done: 34 of 53 species
Done: 35 of 53 species
Done: 36 of 53 species
Done: 37 of 53 species
Done: 38 of 53 species
Done: 39 of 53 species
Done: 40 of 53 species
Done: 41 of 53 species
Done: 42 of 53 species
Done: 43 of 53 species
Done: 44 of 53 speci

In [7]:
# NOW SWAP!!!
# Pair the "promoter" (promoter + 5'UTR) of every selected gene with every
# replacement "terminator" collected in replace_term_hot / replace_term_var.
# swapped_hot and swapped_var are filled in the same order (gene-major,
# terminator-minor), so entry k of one corresponds to entry k of the other.
PROMOTER_LENGTH = 1300  # leading NTs kept from each gene as the promoter
UTR3_LENGTH_IDX = 2  # position of length_UTR3 in the feature array
UTR3_GC_IDX = 4  # position of GC_UTR3 in the feature array

swapped_hot = []
swapped_var = []

n_species = len(new_dframe.columns)

for f, name in enumerate(new_dframe.columns):
    working_fpath = fpath_features_list[f]  # feature file, paired with the column by position
    working_genes = new_dframe[name].tolist()  # gene indices

    # The yeast file stores its arrays under "og_*" keys; every other file uses "x_*".
    if working_fpath == yeast_npz:
        hot_key, var_key = "og_hot", "og_var"
    else:
        hot_key, var_key = "x_hot", "x_var"

    # Load both arrays once per species (each NpzFile[key] access re-reads the
    # whole array) and close the file handle afterwards.
    # NOTE: allow_pickle=True can execute code stored in the file; only load trusted files.
    with np.load(working_fpath, allow_pickle=True) as working_npz:
        species_hot = working_npz[hot_key]
        species_var = working_npz[var_key]

    for i in working_genes:
        working_hot = species_hot[i]
        working_var = species_var[i]

        # Current promoter. This must be taken for EVERY species: previously it
        # was only assigned in the non-yeast branch, so yeast genes silently
        # reused the promoter of the previous species' last gene.
        prom = working_hot[:PROMOTER_LENGTH]

        # Current promoter + replacement terminator
        for term in replace_term_hot:
            swapped_hot.append(np.concatenate((prom, term)))

        # Change terminator variables (length of 3UTR, GC of 3UTR).
        # Each combination gets its own copy: appending the same object and
        # mutating it in place made every row for this gene end up with the
        # values of the LAST terminator.
        for term in replace_term_var:
            combo_var = np.array(working_var)
            combo_var[UTR3_LENGTH_IDX] = term[0]
            combo_var[UTR3_GC_IDX] = term[1]
            swapped_var.append(combo_var)

    print("Done: " + str(f + 1) + " of " + str(n_species) + " species")

print(len(swapped_hot))

Done: 1 of 53 species
Done: 2 of 53 species
Done: 3 of 53 species
Done: 4 of 53 species
Done: 5 of 53 species
Done: 6 of 53 species
Done: 7 of 53 species
Done: 8 of 53 species
Done: 9 of 53 species
Done: 10 of 53 species
Done: 11 of 53 species
Done: 12 of 53 species
Done: 13 of 53 species
Done: 14 of 53 species
Done: 15 of 53 species
Done: 16 of 53 species
Done: 17 of 53 species
Done: 18 of 53 species
Done: 19 of 53 species
Done: 20 of 53 species
Done: 21 of 53 species
Done: 22 of 53 species
Done: 23 of 53 species
Done: 24 of 53 species
Done: 25 of 53 species
Done: 26 of 53 species
Done: 27 of 53 species
Done: 28 of 53 species
Done: 29 of 53 species
Done: 30 of 53 species
Done: 31 of 53 species
Done: 32 of 53 species
Done: 33 of 53 species
Done: 34 of 53 species
Done: 35 of 53 species
Done: 36 of 53 species
Done: 37 of 53 species
Done: 38 of 53 species
Done: 39 of 53 species
Done: 40 of 53 species
Done: 41 of 53 species
Done: 42 of 53 species
Done: 43 of 53 species
Done: 44 of 53 speci

In [8]:
# Stack the swapped combinations into np arrays
swapped_hot_arr = np.asarray(swapped_hot, dtype=np.int8)
# NOTE(review): the int16 cast drops any fractional part; if a variable such as
# GC_UTR3 is stored as a fraction rather than an integer, confirm this is intended.
swapped_var_arr = np.asarray(swapped_var, dtype=np.int16)

# Every onehot sequence must have exactly one matching row of variables
assert len(swapped_hot_arr) == len(swapped_var_arr), "onehot / variable counts differ"

# Save into npz file (positional arrays are stored under the keys arr_0, arr_1)
## arr_0 = onehot sequences
## arr_1 = 72 variables
outfile = "./swap/swapped_70225.npz"
# np.savez does not create missing directories; make sure the target exists so
# the result of the long swap loop is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
np.savez(outfile, swapped_hot_arr, swapped_var_arr)
print("Done!")

Done!
