In [20]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
import h5py

In [3]:
def one_hot_encode(df, col='utr', seq_len=50):
    # Dictionary returning one-hot encoding of nucleotides. 
    nuc_d = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}
    
    # Creat empty matrix.
    vectors=np.empty([len(df),seq_len,4])
    
    # Iterate through UTRs and one-hot encode
    for i,seq in enumerate(df[col].str[:seq_len]): 
        seq = seq.lower()
        a = np.array([nuc_d[x] for x in seq])
        vectors[i] = a
    return vectors

## UTR MRL

In [6]:
#df = pd.read_pickle('../data/egfp_unmod_1.pkl')
df = pd.read_csv('../data/UTR/raw_csv/GSM3130435_egfp_unmod_1.csv')
df.sort_values('total_reads', inplace=True, ascending=False)
df.reset_index(inplace=True, drop=True)
df = df.iloc[:280000]

# The training set has 260k UTRs and the test set has 20k UTRs.
e_test = df.iloc[:20000]
e_train = df.iloc[20000:]

# One-hot encode both training and test UTRs
seq_e_train = one_hot_encode(e_train,seq_len=50)
seq_e_test = one_hot_encode(e_test, seq_len=50)

# Scale the training mean ribosome load values
e_train.loc[:,'scaled_rl'] = preprocessing.StandardScaler().fit_transform(e_train.loc[:,'rl'].values.reshape(-1,1))
e_test.loc[:,'scaled_rl'] = preprocessing.StandardScaler().fit_transform(e_test.loc[:,'rl'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_train.loc[:,'scaled_rl'] = preprocessing.StandardScaler().fit_transform(e_train.loc[:,'rl'].values.reshape(-1,1))


In [21]:
file_name = '../data/UTR/utr_mrl.h5'
h5f = h5py.File(file_name, 'w')
h5f.create_dataset('x_train',data = seq_e_train)
h5f.create_dataset('y_train',data= e_train.loc[:,'scaled_rl'].values)
h5f.create_dataset('x_test',data = seq_e_test)
h5f.create_dataset('y_test',data = e_test.loc[:,'scaled_rl'].values)
h5f.close()

## UTR poly

In [23]:
df = pd.read_csv('../data/UTR/raw_csv/GSM3130435_egfp_unmod_1.csv')
df.sort_values('total_reads', inplace=True, ascending=False)
df.reset_index(inplace=True, drop=True)
df = df.iloc[:280000]

# The training set has 260k UTRs and the test set has 20k UTRs.
e_test = df[:20000]
e_train = df[20000:]

seq_e_train = one_hot_encode(e_train,seq_len=50)
seq_e_test = one_hot_encode(e_test, seq_len=50)

rfractions = ['r' + str(x) for x in range(14)]
# Convert e_train into a matrix with relative polysome abundance for each UTR.
e_train = e_train[rfractions].values
e_test = e_test[rfractions].values

In [26]:
seq_e_test.shape

(20000, 50, 4)

In [27]:
e_test.shape

(20000, 14)

In [28]:
file_name = '../data/UTR/utr_poly.h5'
h5f = h5py.File(file_name, 'w')
h5f.create_dataset('x_train',data = seq_e_train)
h5f.create_dataset('y_train',data= e_train)
h5f.create_dataset('x_test',data = seq_e_test)
h5f.create_dataset('y_test',data = e_test)
h5f.close()