In [1]:
import hugging_face.tiger as hf
print(hf.CONTEXT_3P)
print(hf.CONTEXT_5P)
print(hf.TARGET_LEN)

2024-06-16 17:17:17.990360: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-16 17:17:19.229529: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-06-16 17:17:19.229623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


0
3
26


2024-06-16 17:17:21.405789: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2024-06-16 17:17:21.405850: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [2]:
import pandas as pd

In [3]:
# project-wide nucleotide tokens
NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T'], [0, 1, 2, 3]))

# relevant columns
INDEX_COLS = ['gene', 'guide_id']
LFC_COLS = ['lfc_r1', 'lfc_r2', 'lfc_r3']
SEQUENCE_FEATS = ['guide_seq', 'target_seq', '5p_context', '3p_context']
FEATURE_GROUPS = {
    'target location': [
        'loc_utr_5p',
        'loc_cds',
        'loc_utr_3p',
        'log_gene_len',
        # 'strand'
    ],
    'junction overlap': [
        'junction_olap_5p',
        'junction_olap_3p',
    ],
    'junction proximity': [
        'junction_dist_5p',
        'junction_dist_3p',
    ],
    'guide secondary structure': [
        'direct_repeat',
        'g_quad',
    ],
    'guide MFE': [
        'mfe',
    ],
    'hybridization MFE': [
        'hybrid_mfe_1_23',
        'hybrid_mfe_15_9',
        'hybrid_mfe_3_12',
    ],
    'target accessibility': [
        'log_unpaired',
        'log_unpaired_11',
        'log_unpaired_19',
        'log_unpaired_25',
    ],
    # 'localization': [
    #     'perc_gene_nuc',
    #     'perc_junc_nuc',
    # ],
    # 'usage': [
    #     'perc_junc_use',
    # ],
}
SCALAR_FEATS = [feature for group_features in FEATURE_GROUPS.values() for feature in group_features]
SCALAR_FEATS_INDELS = set(SCALAR_FEATS) - {'hybrid_mfe_1_23', 'hybrid_mfe_15_9', 'hybrid_mfe_3_12'}
UNIT_SCALED_FEATS = ['loc_utr_5p', 'loc_cds', 'loc_utr_3p',
                     'direct_repeat', 'g_quad',
                     'perc_gene_nuc', 'perc_junc_nuc', 'perc_junc_use']


In [4]:
def load_data(dataset, pm_only=False, indels=False, holdout='targets', scale_non_seq_feats=False):
    """
    Loads specified dataset and its corresponding non-targeting data (if it exists)
    :param dataset: which dataset to use
    :param pm_only: only include perfectly matched guides
    :param indels: whether to include guides with insertions or deletions
    :param holdout: what should be held out in each fold {genes, guides, targets}
    :param scale_non_seq_feats: whether to scale non-sequence features to unit interval
    :return: two DataFrames containing targeting and non-targeting data, respectively
    """
    data_file = os.path.join('data-processed', dataset + '.bz2')
    assert os.path.exists(data_file), 'if assertion fails, run: python data.py'

    # load data
    data = pd.read_pickle(data_file)

    # filtered data as requested
    if pm_only:
        data = data[data.guide_type == 'PM']
    elif not indels:
        data = data[data.guide_type.isin(['PM', 'SM', 'DM', 'RDM', 'TM', 'RTM'])]

    # set the folds
    if holdout == 'genes':
        data['fold'] = data.get('gene')
    elif holdout == 'guides':
        data['fold'] = data.get('guide_fold')
    elif holdout == 'targets':
        data['fold'] = data.get('target_fold')
    else:
        raise NotImplementedError

    # load non-targeting data if it exists
    nt_file = os.path.join('data-processed', dataset + '-nt.bz2')
    data_nt = pd.read_pickle(nt_file) if os.path.exists(nt_file) else None

    # scalar feature transformation and scaling
    for feature in SCALAR_FEATS:
        if feature in data.columns:
            if 'junction_dist' in feature:
                data[feature] = data[feature].apply(lambda x: np.log10(1 + np.abs(x)))
            if scale_non_seq_feats and feature not in UNIT_SCALED_FEATS:
                data[feature] -= data[feature].min()
                scale = data[feature].max()
                if scale > 0:
                    data[feature] /= scale
                print(feature, data[feature].min(), data[feature].max())

    return data, data_nt


def label_and_filter_data(data, data_nt, nt_quantile=0.01, method='MinActiveRatio', min_active_ratio=0.1, quiet=True):
    """
    Labels guide activity as active (LFC < specified non-targeting quantile) and filters our non-essential genes
    :param data: screen data as a DataFrame
    :param data_nt: non-targeting data as a DataFrame
    :param nt_quantile: non-targeting quantile that defines the threshold under which guides are considered active
    :param method: essential gene filter method
    :param min_active_ratio: used by MinActiveRatio--genes with an active guide ratio less than this value get removed
    :param quiet: silences Lilliefors non-targeting distribution tests
    :return: filtered data with observed labels (active vs inactive)
    """
    # non-targeting data is available
    if data_nt is not None and len(data_nt) > 0:

        # compute mean of replicates
        data_nt['lfc'] = data_nt[LFC_COLS].mean(axis=1)

        # set active threshold based on quantile of non-targeting distribution (assumed to be normal)
        threshold = norm.ppf(q=nt_quantile, loc=data_nt['lfc'].mean(), scale=data_nt['lfc'].std())
        if not quiet:
            _, p_val = lilliefors(data_nt['lfc'].values)
            print('Lilliefors p-value of NT replicate medians: {:.4e}'.format(p_val))
            print('A {:.4f} quantile yields an LFC threshold of {:.4f}'.format(nt_quantile, threshold))

    # non-targeting data is unavailable, so use default threshold
    else:
        threshold = -0.5

    # provided target values
    if set(LFC_COLS).issubset(data.columns):

        # take mean of replicates as target value
        data['observed_lfc'] = data[LFC_COLS].mean(axis=1)
        data = data[~data['observed_lfc'].isna()]
        assert sum(np.isnan(data['observed_lfc'].values)) == 0

    # label guides as active/inactive
    if 'observed_lfc' in data.columns:
        data['observed_label'] = data['observed_lfc'] < threshold
    elif 'q' in data.columns:
        data['observed_label'] = (data['q'] < 0.1) & (data['z'] < 0.)  # significant at FDR 0.1 in the correct direction

    # keep only single-unique and common junctions
    if 'junction_category' in data.columns:
        data = data.loc[data['junction_category'].isin(['single_unique', 'common'])]

    # apply filter
    if method == 'NoFilter':
        return data
    elif method == 'MinActiveRatio' and 'observed_label' in data.columns:
        df = pd.DataFrame(data[data.guide_type == 'PM'].groupby('gene')['observed_label'].mean().rename('active ratio'))
        return data[data['gene'].isin(df[df['active ratio'] >= min_active_ratio].index.values)]
    else:
        raise NotImplementedError

In [5]:
import os
import numpy as np
# import utils
import pandas as pd
# # import tensorflow as tf
# from data import load_data, label_and_filter_data, model_inputs, SCALAR_FEATS
# from models import build_model, train_model, test_model
# from normalization import get_normalization_object

# load, label, and filter data
data = load_data('off-target')#, pm_only=args.pm_only, indels=args.indels, holdout=args.holdout)
# data = label_and_filter_data(*data, args.nt_quantile, args.filter_method, args.min_active_ratio)

In [14]:
from scipy.stats import norm
data = label_and_filter_data(*data)

In [15]:
data

Unnamed: 0,gene,guide_id,guide_type,lfc_r1,lfc_r2,lfc_r3,guide_seq,target_seq,5p_context,3p_context,...,hybrid_mfe_3_12,log_unpaired,log_unpaired_11,log_unpaired_19,log_unpaired_25,guide_fold,target_fold,fold,observed_lfc,observed_label
0,SNRNP200,crRNA0349:349-371,PM,-3.819780,-3.044753,-1.709841,AGCCTCCCGTAACTACTCTACCA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,3,9,9,-2.858125,True
1,SNRNP200,crRNA0349:349-371_SM_P1-A:G,SM,-3.445606,-3.799614,-3.011046,AGCCTCCCGTAACTACTCTACCG,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,5,9,9,-3.418755,True
2,SNRNP200,crRNA0349:349-371_SM_P2-C:T,SM,-2.723421,-2.168753,-3.163318,AGCCTCCCGTAACTACTCTACTA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,2,9,9,-2.685164,True
3,SNRNP200,crRNA0349:349-371_SM_P3-C:A,SM,-1.698941,-2.151166,-0.916911,AGCCTCCCGTAACTACTCTAACA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-18.7,-4.052203,-0.760030,-0.557244,-0.797968,3,9,9,-1.589006,True
4,SNRNP200,crRNA0349:349-371_SM_P4-A:T,SM,-2.435223,-3.483880,-2.727288,AGCCTCCCGTAACTACTCTTCCA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-17.4,-4.052203,-0.760030,-0.557244,-0.797968,10,9,9,-2.882130,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118262,CCT7,crRNA1067:1086-1108,PM,-0.037639,0.239060,-0.230432,ACTCCTCCTAGACTTCTCCTGTT,TGAGGAGGATCTGAAGAGGACAA,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,TGATGGCCTGTGGAGGCTCAATCCAGACCAGTGTGAATGCTCTGTC...,...,-24.4,-2.975745,-0.574501,-0.210624,-1.639831,3,6,6,-0.009671,False
118263,CCT7,crRNA1576:1632-1654,PM,-1.877323,-2.398506,-2.439100,CCACGCCTAGTTACGCGACTGTC,GGTGCGGATCAATGCGCTGACAG,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,CAGCCTCTGAGGCTGCGTGCCTGATCGTGTCTGTAGATGAAACCAT...,...,-24.2,-6.231938,-0.663498,-1.973189,-0.726269,6,6,6,-2.238310,True
118264,CCT7,crRNA0299:299-321,PM,-0.839628,-1.834822,-0.205472,CTCCGTTTCGTTGTTAAAGATTA,GAGGCAAAGCAACAATTTCTAAT,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,GATGGGGCCACAATTCTGAAACTTCTTGATGTTGTCCATCCTGCAG...,...,-16.9,-1.870503,-0.402233,-1.353552,-1.252402,2,4,4,-0.959974,True
118265,CCT7,crRNA1573:1629-1651,PM,-2.477016,-4.210653,-1.269312,ATACCACGCCTAGTTACGCGACT,TATGGTGCGGATCAATGCGCTGA,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,CAGCAGCCTCTGAGGCTGCGTGCCTGATCGTGTCTGTAGATGAAAC...,...,-23.4,-5.529942,-0.156943,-0.841205,-0.854693,6,7,7,-2.652327,True


In [16]:
normalizer = get_normalization_object('No')(data)
data = normalizer.normalize_targets(data)


In [17]:
def get_normalization_object(method: str):
    if method == 'No':
        return NoNormalization
    elif method == 'FrequentistQuantile':
        return FrequentistQuantileNormalization
    elif method == 'UnitInterval':
        return UnitIntervalNormalization
    elif method == 'UnitVariance':
        return UnitVarianceNormalization
    elif method == 'UnitMeanOfSquares':
        return UnitMeanOfSquaresNormalization
    elif method == 'ZeroMeanUnitVariance':
        return ZeroMeanUnitVarianceNormalization
    elif method == 'DepletionRatio':
        return DepletionRatioNormalization
    elif method == 'QuantileMatching':
        return QuantileMatchingNormalization
    else:
        raise NotImplementedError


class Normalization(object):
    def __init__(self, df: pd.DataFrame):
        self.original_lfc = df[['gene', 'guide_seq', 'observed_lfc']].copy().set_index(['gene', 'guide_seq'])
        assert not self.original_lfc.index.has_duplicates

    def normalize_targets(self, df: pd.DataFrame):
        raise NotImplementedError

    def denormalize_observations(self, df: pd.DataFrame):
        # restore observed values
        if 'observed_lfc' in df.columns:
            df = pd.merge(
                left=df,
                right=self.original_lfc,
                left_on=['gene', 'guide_seq'],
                right_index=True,
                suffixes=('_normalized', '')
            )
        if 'observed_pm_lfc' in df.columns:
            df_pm = df.loc[df.guide_type == 'PM', ['gene', 'target_seq', 'observed_lfc']]
            df_pm = df_pm.rename(columns={'observed_lfc': 'observed_pm_lfc'}).set_index(['gene', 'target_seq'])
            df = pd.merge(
                left=df,
                right=df_pm,
                how='left',
                left_on=['gene', 'target_seq'],
                right_index=True,
                suffixes=('_normalized', '')
            )
        return df

    def denormalize_predictions(self, df: pd.DataFrame):
        df['predicted_lfc_normalized'] = df['predicted_lfc']
        if 'predicted_pm_lfc' in df.columns:
            df['predicted_pm_lfc_normalized'] = df['predicted_pm_lfc']
        return df

    def denormalize_targets_and_predictions(self, df: pd.DataFrame):
        return self.denormalize_predictions(self.denormalize_observations(df.copy()))


class NoNormalization(Normalization):
    def __init__(self, df: pd.DataFrame):
        super().__init__(df)
        self.output_fn = 'linear'

    def normalize_targets(self, df: pd.DataFrame):
        return df

    def denormalize_observations(self, df: pd.DataFrame):
        return df

    def denormalize_predictions(self, df: pd.DataFrame):
        return df

In [8]:
"""
Fix for server
# export CUDNN_PATH="/home/amirali/dtsui/anaconda3/envs/tiger/lib/python3.10/site-packages/nvidia/cudnn"
# export LD_LIBRARY_PATH="$CUDNN_PATH/lib":"/usr/local/cuda/lib64"
# # ...
# export PATH="$PATH":"/usr/local/cuda/bin"
# export TF_CPP_MIN_LOG_LEVEL="2"
"""
import sys
# print(sys.path)
path_to_remove = '/nethome/dtsui31/.local/lib/python3.10/site-packages'
if path_to_remove in sys.path:
    sys.path.remove(path_to_remove)
import tensorflow as tf

In [34]:
target_seq = tf.stack([tf.constant(list(seq)) for seq in data['target_seq']], axis=0)
left_context = tf.ragged.stack([tf.constant(list(seq), tf.string) for seq in data['5p_context']], axis=0)
right_context = tf.ragged.stack([tf.constant(list(seq), tf.string) for seq in data['3p_context']], axis=0)
guide_seq = tf.ragged.stack([tf.constant(list(seq)) for seq in data['guide_seq']], axis=0)

nucleotide_table = tf.lookup.StaticVocabularyTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()) + ['N'], dtype=tf.string),
        values=tf.constant(list(NUCLEOTIDE_TOKENS.values()) + [255], dtype=tf.int64)),
    num_oov_buckets=1)
target_tokens = nucleotide_table.lookup(target_seq)
tokens_5p = nucleotide_table.lookup(left_context)

2024-06-04 16:23:12.449634: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 4435340568 exceeds 10% of free system memory.
2024-06-04 16:23:26.143513: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 6425274192 exceeds 10% of free system memory.


In [37]:
context = 5
if isinstance(context, (list, tuple)):
    context_5p, context_3p = tuple(context)
else:
    context_5p = context_3p = context

# add target context
max_len_5p = max(tokens_5p.bounding_shape()[1], context_5p)

In [44]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [45]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def model_inputs(data, context, *, scalar_feats=(), target_feats=(), include_replicates=False, max_context=100):
    """
    Prepares a dictionary of model inputs and target values from the provided DataFrame
    :param data: panda's DataFrame containing model inputs and target values
    :param context: amount of target context
    :param scalar_feats: scalar features to be provided to the model
    :param target_feats: additional sequence features
    :param include_replicates: whether to include raw replicates
    :param max_context: maximum amount of up- and down-stream context (reduce RAM usage for guides with 1kb available)
    :return: dictionary containing model inputs and target values
    """
    # shuffle rows (tensorflow shuffling is approximate, doing a full shuffle here makes it more exact)
    data = data.sample(frac=1)

    # keep only data without NaN values for the scalar features
    data = data[(~data[list(scalar_feats)].isna()).product(1) == 1]

    # trim context to some reasonable amount
    data['5p_context'] = data['5p_context'].apply(lambda s: s[-min(len(s), max_context):]).astype(str)
    data['3p_context'] = data['3p_context'].apply(lambda s: s[:min(len(s), max_context)]).astype(str)

    # load target and guide sequences
    target_seq = tf.stack([tf.constant(list(seq)) for seq in data['target_seq']], axis=0)
    left_context = tf.ragged.stack([tf.constant(list(seq), tf.string) for seq in data['5p_context']], axis=0)
    right_context = tf.ragged.stack([tf.constant(list(seq), tf.string) for seq in data['3p_context']], axis=0)
    guide_seq = tf.ragged.stack([tf.constant(list(seq)) for seq in data['guide_seq']], axis=0)

    # convert nucleotides to integer codes
    nucleotide_table = tf.lookup.StaticVocabularyTable(
        initializer=tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(list(NUCLEOTIDE_TOKENS.keys()) + ['N'], dtype=tf.string),
            values=tf.constant(list(NUCLEOTIDE_TOKENS.values()) + [255], dtype=tf.int64)),
        num_oov_buckets=1)
    target_tokens = nucleotide_table.lookup(target_seq)
    tokens_5p = nucleotide_table.lookup(left_context)
    tokens_3p = nucleotide_table.lookup(right_context)
    guide_tokens = tf.RaggedTensor.from_row_splits(values=nucleotide_table.lookup(guide_seq.values),
                                                   row_splits=guide_seq.row_splits).to_tensor(255)
    target_tokens = tf.cast(target_tokens, tf.uint8)
    guide_tokens = tf.cast(guide_tokens, tf.uint8)

    # these operations are only necessary if using additional target context sequence
    if isinstance(context, (list, tuple)):
        context_5p, context_3p = tuple(context)
    else:
        context_5p = context_3p = context

    # add target context
    max_len_5p = max(tokens_5p.bounding_shape()[1], context_5p)
    tokens_5p = pad_sequences(tokens_5p.to_list(), maxlen=max_len_5p, dtype='uint8', padding='pre', value=255)
    tokens_5p = tf.constant(tokens_5p[:, tokens_5p.shape[1]-context_5p:tokens_5p.shape[1]])
    max_len_3p = max(tokens_3p.bounding_shape()[1], context_3p)
    tokens_3p = pad_sequences(tokens_3p.to_list(), maxlen=max_len_3p, dtype='uint8', padding='post', value=255)
    tokens_3p = tf.constant(tokens_3p[:, :context_3p])

    # assemble dictionary of core model inputs
    inputs = {
        # data identifiers for downstream analysis
        'gene': tf.constant(data['gene'], tf.string),
        'target_seq': tf.constant(data['target_seq'], tf.string),
        'guide_id': tf.constant(data['guide_id'], tf.string),
        'guide_seq': tf.constant(data['guide_seq'], tf.string),
        'guide_type': tf.constant(data['guide_type'], tf.string),
        # sequence features
        'target_tokens': target_tokens,
        'guide_tokens': guide_tokens,
        '5p_tokens': tokens_5p,
        '3p_tokens': tokens_3p,
    }

    # target values
    if 'observed_lfc' in data.columns:
        inputs.update({'observed_lfc': tf.constant(data['observed_lfc'], tf.float32)})
    if 'observed_label' in data.columns:
        inputs.update({'observed_label': tf.constant(data['observed_label'], tf.uint8)})
    if 'sample_weights' in data.columns:
        inputs.update({'sample_weights': tf.constant(data['sample_weights'], tf.float32)})
    if include_replicates:
        inputs.update({'replicate_lfc': tf.constant(data[LFC_COLS], tf.float32)})

    # add sequence features
    features = tf.zeros(target_tokens.shape[:1] + [context_5p + target_tokens.shape[1] + context_3p] + [0])
    for feature in target_feats:
        if feature in data.columns:
            feature = tf.constant(np.stack(data[feature]), tf.float32)
            features = tf.concat([features, tf.expand_dims(feature, axis=-1)], axis=-1)
        else:
            raise Exception('Missing target feature: ' + feature)
    inputs.update({'target_features': features})

    # add optional features
    for feature in scalar_feats:
        if feature in data.columns:
            inputs.update({feature: tf.constant(data[feature], tf.float32)})
        else:
            raise Exception('Missing scalar feature: ' + feature)

    return inputs

In [81]:
train_data = model_inputs(data, context=(2,0))

In [56]:
train_data['target_tokens']

<tf.Tensor: shape=(93145, 23), dtype=uint8, numpy=
array([[3, 2, 0, ..., 0, 0, 2],
       [3, 1, 2, ..., 0, 3, 1],
       [1, 0, 2, ..., 3, 1, 3],
       ...,
       [2, 0, 3, ..., 1, 0, 3],
       [2, 2, 2, ..., 2, 3, 3],
       [2, 0, 1, ..., 3, 0, 1]], dtype=uint8)>

In [52]:
from typing import Union

In [63]:
class SequenceModelWithNonSequenceFeatures(object):
    def __init__(self):
        self.non_sequence_features = None

    def concatenate_non_sequence_features(self, data, x, scalar_feats):
        non_sequence_features = []
        for feature in (scalar_feats if self.non_sequence_features is None else self.non_sequence_features):
            if feature in data.keys():
                non_sequence_features.append(feature)
                x = tf.concat([x, tf.cast(data[feature][:, None], tf.float32)], axis=1)

        if self.non_sequence_features is None:
            self.non_sequence_features = non_sequence_features
        else:
            assert set(self.non_sequence_features) == set(non_sequence_features)

        return x


class TargetSequenceWithRBP(SequenceModelWithNonSequenceFeatures):
    def __init__(self, guide_len: int, context_5p: int, context_3p: int, *, rbp_list: list, **kwargs):
        super().__init__()
        self.input_parser = layers.TargetSequenceAndPositionalFeatures(guide_len, context_5p, context_3p, len(rbp_list))
        self.rbp_list = rbp_list

        # model declaration
        self.model = tf.keras.Sequential(name='TargetSequenceWithRBP', layers=[
            layers.SequenceSequentialWithNonSequenceBypass(
                input_parser=self.input_parser,
                sequence_layers=[
                    layers.ReduceAndConcatTargetRBP(self.input_parser.feature_channels),
                    tf.keras.layers.Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'),
                    tf.keras.layers.Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'),
                    tf.keras.layers.MaxPool1D(pool_size=2, padding='same'),
                    tf.keras.layers.Flatten(),
                    tf.keras.layers.Dropout(0.25),
                ]),
            tf.keras.layers.Dense(units=128, activation='sigmoid'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(units=32, activation='sigmoid'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(1, activation='linear')
        ])

    def pack_inputs(self, data: dict, scalar_feats: Union[list, tuple] = tuple(SCALAR_FEATS)):

        # one-hot encode, flatten, and concatenate target (and context) sequence tokens
        x = tf.concat([data['5p_tokens'], data['target_tokens'], data['3p_tokens']], axis=1)
        x = tf.reshape(tf.one_hot(x, depth=4), [len(data['target_tokens']), -1])

        # if we are using additional non-sequence but positional target features
        if self.input_parser.feature_channels > 0:
            x = tf.concat([x, tf.reshape(data['target_features'], [len(data['target_tokens']), -1])], axis=1)

        # concatenate and log available non-sequence features
        x = self.concatenate_non_sequence_features(data, x, scalar_feats)

        # target values
        y = data['observed_lfc'] if 'observed_lfc' in data.keys() else None

        return x, y

class OneHotSequenceModel(SequenceModelWithNonSequenceFeatures):
    def __init__(self, target_len: int, context_5p: int, context_3p: int, use_guide_seq: bool, pad_guide_seq: bool):
        super().__init__()
        self.input_parser = layers.OneHotInputParser(target_len, context_5p, context_3p, use_guide_seq, pad_guide_seq)

    def pack_inputs(self, data: dict, scalar_feats: Union[list, tuple] = tuple(SCALAR_FEATS)):

        # one-hot encode, flatten, and concatenate target (and context) sequence tokens
        x = tf.concat([data['5p_tokens'], data['target_tokens'], data['3p_tokens']], axis=1)
        x = tf.reshape(tf.one_hot(x, depth=4), [len(data['target_tokens']), -1])

        # if we are using guide sequence, do the same to those tokens (but pad them first to match target + context)
        if self.input_parser.guide_len > 0:

            # pre- and post-pad guides with zero to match combined target and context sequence
            data_type = data['guide_tokens'].dtype
            pad_5p = 255 * tf.ones([data['guide_tokens'].shape[0], data['5p_tokens'].shape[1]], dtype=data_type)
            pad_3p = 255 * tf.ones([data['guide_tokens'].shape[0], data['3p_tokens'].shape[1]], dtype=data_type)
            guide_tokens = tf.concat([pad_5p, data['guide_tokens'], pad_3p], axis=1)
            x = tf.concat([x, tf.reshape(tf.one_hot(guide_tokens, depth=4), [len(data['guide_tokens']), -1])], axis=1)

        # concatenate and log available non-sequence features
        x = self.concatenate_non_sequence_features(data, x, scalar_feats)

        # target values
        y = data['observed_lfc'] if 'observed_lfc' in data.keys() else None
        w = data['sample_weights'] if 'sample_weights' in data.keys() else None

        return x, y, w

    def parse_input_scores(self, scores):

        # unpack scores
        target_scores, guide_scores, non_sequence_scores = self.input_parser.call(scores)

        # load scores into DataFrame
        score_dict = dict()
        for nt, token in NUCLEOTIDE_TOKENS.items():
            score_dict.update({'target:' + nt: target_scores[..., token].numpy().tolist()})
            score_dict.update({'guide:' + nt: guide_scores[..., token].numpy().tolist()})
        for i, feature in enumerate(self.non_sequence_features):
            score_dict.update({feature: non_sequence_scores[:, i]})
        df = pd.DataFrame(score_dict)

        return df

import layers

class Tiger1D(OneHotSequenceModel):
    def __init__(self, target_len: int, context_5p: int, context_3p: int, use_guide_seq: bool, **kwargs):
        OneHotSequenceModel.__init__(self, target_len, context_5p, context_3p, use_guide_seq, pad_guide_seq=True)

        self.model = tf.keras.Sequential(name='Tiger1D', layers=[
            layers.SequenceSequentialWithNonSequenceBypass(
                input_parser=self.input_parser,
                sequence_layers=[
                    layers.AlignOneHotEncoding1D(use_guide_seq),
                    tf.keras.layers.Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'),
                    tf.keras.layers.Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'),
                    tf.keras.layers.MaxPool1D(pool_size=2, padding='same'),
                    tf.keras.layers.Flatten(),
                    tf.keras.layers.Dropout(0.25),
                ]),
            tf.keras.layers.Dense(units=128, activation='sigmoid'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(units=32, activation='sigmoid'),
            tf.keras.layers.Dropout(0.1),
            tf.keras.layers.Dense(1, activation=kwargs.get('output_fn') or 'linear')
        ])

In [None]:
def common_parser_arguments():

    # common arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=2048, help='tensorflow batch size')
    parser.add_argument('--context', type=str, default=None, help='amount of target sequence context')
    parser.add_argument('--dataset', type=str, default='off-target', help='which dataset to use')
    parser.add_argument('--debug', action='store_true', default=False, help='debug mode will run models eagerly')
    parser.add_argument('--fig_ext', type=str, default='.pdf', help='which file extension to use when saving plots')
    parser.add_argument('--filter_method', type=str, default=None, help='gene filtering method')
    parser.add_argument('--holdout', type=str, default='targets', help='how to assemble cross-validation folds')
    parser.add_argument('--indels', action='store_true', default=False, help='include targets with indels')
    parser.add_argument('--kwargs', type=str, default=None, help='model hyper-parameters')
    parser.add_argument('--loss', type=str, default='log_cosh', help='training loss function')
    parser.add_argument('--min_active_ratio', type=float, default=None, help='ratio of active guides to keep a gene')
    parser.add_argument('--model', type=str, default=None, help='model name')
    parser.add_argument('--normalization', type=str, default=None, help='normalization method')
    parser.add_argument('--normalization_kwargs', type=str, default=None, help='normalization parameters')
    parser.add_argument('--nt_quantile', type=float, default=None, help='active guide non-targeting quantile threshold')
    parser.add_argument('--pm_only', action='store_true', default=False, help='use only perfect match guides')
    parser.add_argument('--use_guide_seq', action='store_true', default=False, help='use guide sequence for PM only')
    parser.add_argument('--seed', type=int, default=None, help='random number seed')
    parser.add_argument('--seq_only', action='store_true', default=False, help='sequence only model')

    return parser


In [57]:
from models import build_model, train_model, test_model
import utils

# script arguments
parser = utils.common_parser_arguments()
# parser.add_argument('--mm_only', action='store_true', default=False, help='use only mismatched guides')
# args = utils.parse_common_arguments(parser)
# assert not (args.mm_only and args.pm_only)

# train mode

def build_model(name, target_len, context_5p, context_3p, use_guide_seq, loss_fn, debug=False, **kwargs):
    if name == 'Tiger1D':
        model = Tiger1D(target_len, context_5p, context_3p, use_guide_seq, **kwargs)
        optimizer = tf.optimizers.Adam(1e-3)
    elif name == 'Tiger2D':
        model = Tiger2D(target_len, context_5p, context_3p, use_guide_seq, **kwargs)
        optimizer = tf.optimizers.Adam(1e-3)
    elif name == 'TargetSequenceWithRBP':
        model = TargetSequenceWithRBP(target_len, context_5p, context_3p, **kwargs)
        optimizer = tf.optimizers.Adam(1e-3)
    # elif name == 'TranscriptTransformer':
    #     model = TranscriptTransformer(target_len, guide_len, use_guide_seq, **kwargs)
    #     optimizer = tf.optimizers.Adam(5e-4)
    else:
        raise NotImplementedError
    model.model.compile(optimizer=optimizer, loss=loss_fn, weighted_metrics=[], run_eagerly=debug)

    return model

model = build_model(name='test',
                    target_len=train_data['target_tokens'].shape[1],
                    context_5p=train_data['5p_tokens'].shape[1],
                    context_3p=train_data['3p_tokens'].shape[1],
                    use_guide_seq=False,
                    loss_fn='log_cosh',
                    # debug=args.debug,
                    # output_fn=normalizer.output_fn,
)
# model = train_model(model, train_data, train_data, args.batch_size)

# # accumulate targets and predictions for held-out fold
# predictions = pd.concat([predictions, test_model(model, valid_data)])

TypeError: build_model() missing 1 required positional argument: 'loss_fn'

In [82]:
model = build_model(name='Tiger1D',
    target_len=train_data['target_tokens'].shape[1],
    context_5p=train_data['5p_tokens'].shape[1],
    context_3p=train_data['3p_tokens'].shape[1],
    use_guide_seq=False,
    loss_fn='log_cosh',
    debug=False,
    output_fn=normalizer.output_fn)

In [83]:
predictions = test_model(model, train_data)

In [67]:
predictions

Unnamed: 0,target_seq,gene,guide_id,guide_seq,guide_type,observed_lfc,observed_label,predicted_lfc,observed_pm_lfc,predicted_pm_lfc
0,AAAACAAACCTTTATCAGGATGA,PSMD1,crRNA1573:1654-1676,TTTTGTTTGGAAATAGTCCTACT,PM,0.070935,0,-0.255048,0.070935,-0.255048
1,AAAACAACGGGAAGATTGAACTC,EIF3B,crRNA1748:1840-1862,TTTTGTTGCCCTTCTAACTTGAG,PM,0.134618,0,-0.277627,0.134618,-0.277627
2,AAAACAAGCTCCGGCTGCATTAC,NUP93,crRNA1224:1224-1246,TTTTGTTCGAGGCCGACGTAATG,PM,0.330254,0,-0.271856,0.330254,-0.271856
3,AAAACAATTTTACATGTGTAGCA,WDR75,crRNA0605:668-690,TTTTGTTAAAATGTACACATCGT,PM,0.077236,0,-0.274825,0.077236,-0.274825
4,AAAACAGAGATTCTTCCTCCCTT,SF3B1,crRNA2346:2511-2533,TTTTGTCTCTAAGAAGGAGGGAA,PM,-0.230626,0,-0.253106,-0.230626,-0.253106
...,...,...,...,...,...,...,...,...,...,...
93140,TTTTTGATGAAAATGCTGATGAG,BUB1B,crRNA0978:1016-1038,AAAAACTACTTTTACGACTACTC,PM,0.375851,0,-0.266358,0.375851,-0.266358
93141,TTTTTGATGATGACCTTGAAGAT,POLA1,crRNA0301:342-364,AAAAACTACTACTGGAACTTCTA,PM,0.021497,0,-0.261367,0.021497,-0.261367
93142,TTTTTGCAGCTGTTATCAATATC,BUB1B,crRNA2602:2658-2680,AAAAACGTCGACAATAGTTATAG,PM,0.030771,0,-0.262369,0.030771,-0.262369
93143,TTTTTGGAATTTTATCTCCTAGT,NUP133,crRNA0877:877-899,AAAAACCTTAAAATAGAGGATCA,PM,0.184384,0,-0.260188,0.184384,-0.260188


In [84]:
train_data['5p_tokens']

<tf.Tensor: shape=(93145, 2), dtype=uint8, numpy=
array([[2, 0],
       [1, 0],
       [0, 1],
       ...,
       [1, 1],
       [2, 0],
       [1, 1]], dtype=uint8)>

In [70]:
x, _, _ = model.pack_inputs(train_data)

In [78]:
train_data

{'gene': <tf.Tensor: shape=(93145,), dtype=string, numpy=
 array([b'SF3B1', b'AQR', b'NUP133', ..., b'EIF3B', b'NUP133', b'SF3B1'],
       dtype=object)>,
 'target_seq': <tf.Tensor: shape=(93145,), dtype=string, numpy=
 array([b'TGAACCGCTATTGATTGATGAAG', b'TCGTGCAGATGTTACCATAAATC',
        b'CAGGGACAAATATAACTTTTTCT', ..., b'GATGACCCTCAGGCCATAATCAT',
        b'GGGAGGTGATAAGACTTACAGTT', b'GACGATGACTATTCATCATCTAC'],
       dtype=object)>,
 'guide_id': <tf.Tensor: shape=(93145,), dtype=string, numpy=
 array([b'crRNA1682:1826-1848_RDM_P14-T:A|P20-T:C',
        b'crRNA1700:1831-1853_SM_P9-G:T',
        b'crRNA3473:3549-3571_RDM_P8-G:T|P19-C:A', ...,
        b'crRNA1178:1251-1273_TM_P1-A:T|P2-T:A|P3-G:A',
        b'crRNA0701:701-723_RTM_P2-A:C|P12-T:C|P16-C:T',
        b'crRNA0291:291-313_RDM_P14-A:T|P21-G:T'], dtype=object)>,
 'guide_seq': <tf.Tensor: shape=(93145,), dtype=string, numpy=
 array([b'ACTCGGCGAAAACTAACTACTTC', b'AGCACGTCTACAATTGTATTTAG',
        b'GTCCATGTTTATATTTAAAAAGA', ..., 

In [77]:
data

Unnamed: 0,gene,guide_id,guide_type,lfc_r1,lfc_r2,lfc_r3,guide_seq,target_seq,5p_context,3p_context,...,hybrid_mfe_3_12,log_unpaired,log_unpaired_11,log_unpaired_19,log_unpaired_25,guide_fold,target_fold,fold,observed_lfc,observed_label
0,SNRNP200,crRNA0349:349-371,PM,-3.819780,-3.044753,-1.709841,AGCCTCCCGTAACTACTCTACCA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,3,9,9,-2.858125,True
1,SNRNP200,crRNA0349:349-371_SM_P1-A:G,SM,-3.445606,-3.799614,-3.011046,AGCCTCCCGTAACTACTCTACCG,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,5,9,9,-3.418755,True
2,SNRNP200,crRNA0349:349-371_SM_P2-C:T,SM,-2.723421,-2.168753,-3.163318,AGCCTCCCGTAACTACTCTACTA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,2,9,9,-2.685164,True
3,SNRNP200,crRNA0349:349-371_SM_P3-C:A,SM,-1.698941,-2.151166,-0.916911,AGCCTCCCGTAACTACTCTAACA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-18.7,-4.052203,-0.760030,-0.557244,-0.797968,3,9,9,-1.589006,True
4,SNRNP200,crRNA0349:349-371_SM_P4-A:T,SM,-2.435223,-3.483880,-2.727288,AGCCTCCCGTAACTACTCTTCCA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-17.4,-4.052203,-0.760030,-0.557244,-0.797968,10,9,9,-2.882130,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118262,CCT7,crRNA1067:1086-1108,PM,-0.037639,0.239060,-0.230432,ACTCCTCCTAGACTTCTCCTGTT,TGAGGAGGATCTGAAGAGGACAA,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,TGATGGCCTGTGGAGGCTCAATCCAGACCAGTGTGAATGCTCTGTC...,...,-24.4,-2.975745,-0.574501,-0.210624,-1.639831,3,6,6,-0.009671,False
118263,CCT7,crRNA1576:1632-1654,PM,-1.877323,-2.398506,-2.439100,CCACGCCTAGTTACGCGACTGTC,GGTGCGGATCAATGCGCTGACAG,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,CAGCCTCTGAGGCTGCGTGCCTGATCGTGTCTGTAGATGAAACCAT...,...,-24.2,-6.231938,-0.663498,-1.973189,-0.726269,6,6,6,-2.238310,True
118264,CCT7,crRNA0299:299-321,PM,-0.839628,-1.834822,-0.205472,CTCCGTTTCGTTGTTAAAGATTA,GAGGCAAAGCAACAATTTCTAAT,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,GATGGGGCCACAATTCTGAAACTTCTTGATGTTGTCCATCCTGCAG...,...,-16.9,-1.870503,-0.402233,-1.353552,-1.252402,2,4,4,-0.959974,True
118265,CCT7,crRNA1573:1629-1651,PM,-2.477016,-4.210653,-1.269312,ATACCACGCCTAGTTACGCGACT,TATGGTGCGGATCAATGCGCTGA,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,CAGCAGCCTCTGAGGCTGCGTGCCTGATCGTGTCTGTAGATGAAAC...,...,-23.4,-5.529942,-0.156943,-0.841205,-0.854693,6,7,7,-2.652327,True


In [76]:
np.shape(data)

(93145, 31)

In [96]:
transcript_seq = 'AAATCGGAGGGCATTGATGAGATGGTA'
TARGET_LEN = 23
CONTEXT_5P = 3
CONTEXT_3P = 0
# nucleotide tokens
NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T', 'N'], [0, 1, 2, 3, 255]))
NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))

def sequence_complement(sequence: list):
    return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]

def process_data(transcript_seq: str):

    # convert to upper case
    transcript_seq = transcript_seq.upper()

    # get all target sites
    target_seq = [transcript_seq[i: i + TARGET_LEN] for i in range(len(transcript_seq) - TARGET_LEN + 1)]

    # prepare guide sequences
    guide_seq = sequence_complement([seq[CONTEXT_5P:len(seq) - CONTEXT_3P] for seq in target_seq])

    # # model inputs
    # model_inputs = tf.concat([
    #     tf.reshape(one_hot_encode_sequence(target_seq, add_context_padding=False), [len(target_seq), -1]),
    #     tf.reshape(one_hot_encode_sequence(guide_seq, add_context_padding=True), [len(guide_seq), -1]),
    #     ], axis=-1)
    return target_seq, guide_seq#, model_inputs

target_seq, guide_seq = process_data(transcript_seq)

In [97]:
target_seq

['AAATCGGAGGGCATTGATGAGAT',
 'AATCGGAGGGCATTGATGAGATG',
 'ATCGGAGGGCATTGATGAGATGG',
 'TCGGAGGGCATTGATGAGATGGT',
 'CGGAGGGCATTGATGAGATGGTA']

In [88]:
guide_seq

['CACCGAGCGAAAACATTGGC',
 'ACCGAGCGAAAACATTGGCG',
 'CCGAGCGAAAACATTGGCGA',
 'CGAGCGAAAACATTGGCGAT',
 'GAGCGAAAACATTGGCGATC']

In [90]:
data

Unnamed: 0,gene,guide_id,guide_type,lfc_r1,lfc_r2,lfc_r3,guide_seq,target_seq,5p_context,3p_context,...,hybrid_mfe_3_12,log_unpaired,log_unpaired_11,log_unpaired_19,log_unpaired_25,guide_fold,target_fold,fold,observed_lfc,observed_label
0,SNRNP200,crRNA0349:349-371,PM,-3.819780,-3.044753,-1.709841,AGCCTCCCGTAACTACTCTACCA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,3,9,9,-2.858125,True
1,SNRNP200,crRNA0349:349-371_SM_P1-A:G,SM,-3.445606,-3.799614,-3.011046,AGCCTCCCGTAACTACTCTACCG,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,5,9,9,-3.418755,True
2,SNRNP200,crRNA0349:349-371_SM_P2-C:T,SM,-2.723421,-2.168753,-3.163318,AGCCTCCCGTAACTACTCTACTA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-19.8,-4.052203,-0.760030,-0.557244,-0.797968,2,9,9,-2.685164,True
3,SNRNP200,crRNA0349:349-371_SM_P3-C:A,SM,-1.698941,-2.151166,-0.916911,AGCCTCCCGTAACTACTCTAACA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-18.7,-4.052203,-0.760030,-0.557244,-0.797968,3,9,9,-1.589006,True
4,SNRNP200,crRNA0349:349-371_SM_P4-A:T,SM,-2.435223,-3.483880,-2.727288,AGCCTCCCGTAACTACTCTTCCA,TCGGAGGGCATTGATGAGATGGT,CGGGAGCAGAGATCTGCGGCCGTTTGCAGCTTGCGGTAGGGAGGCG...,GGGCATCATCTACAAGCCCAAAACTAAAGAGACTCGGGAGACCTAT...,...,-17.4,-4.052203,-0.760030,-0.557244,-0.797968,10,9,9,-2.882130,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118262,CCT7,crRNA1067:1086-1108,PM,-0.037639,0.239060,-0.230432,ACTCCTCCTAGACTTCTCCTGTT,TGAGGAGGATCTGAAGAGGACAA,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,TGATGGCCTGTGGAGGCTCAATCCAGACCAGTGTGAATGCTCTGTC...,...,-24.4,-2.975745,-0.574501,-0.210624,-1.639831,3,6,6,-0.009671,False
118263,CCT7,crRNA1576:1632-1654,PM,-1.877323,-2.398506,-2.439100,CCACGCCTAGTTACGCGACTGTC,GGTGCGGATCAATGCGCTGACAG,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,CAGCCTCTGAGGCTGCGTGCCTGATCGTGTCTGTAGATGAAACCAT...,...,-24.2,-6.231938,-0.663498,-1.973189,-0.726269,6,6,6,-2.238310,True
118264,CCT7,crRNA0299:299-321,PM,-0.839628,-1.834822,-0.205472,CTCCGTTTCGTTGTTAAAGATTA,GAGGCAAAGCAACAATTTCTAAT,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,GATGGGGCCACAATTCTGAAACTTCTTGATGTTGTCCATCCTGCAG...,...,-16.9,-1.870503,-0.402233,-1.353552,-1.252402,2,4,4,-0.959974,True
118265,CCT7,crRNA1573:1629-1651,PM,-2.477016,-4.210653,-1.269312,ATACCACGCCTAGTTACGCGACT,TATGGTGCGGATCAATGCGCTGA,AGAGTAGCGGAAGTGGTCCGTTCTCTTCCTCTCCCGGCCCAAGCTT...,CAGCAGCCTCTGAGGCTGCGTGCCTGATCGTGTCTGTAGATGAAAC...,...,-23.4,-5.529942,-0.156943,-0.841205,-0.854693,6,7,7,-2.652327,True
