In [1]:
import sys

from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm as tqdm_auto
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import pytorch_lightning as pl

In [2]:
tqdm.pandas()

In [3]:
sys.path.append("../verification/")
from shuffle_dinucl import shuffle_seq, shuffle_seq_dinucl

In [4]:
sys.path.append("../../regression_multiple")
import dataset_regression as utrdata
from lit_regressor import RNARegressor

## Creating dataset

In [5]:
SEQUENCE_N = 1_000_000

In [6]:
CELLTYPE_CODES_UTR3 = {"c1": 0,
                       "c2": 1,
                       "c4": 2,
                       "c6": 3,
                       "c17": 4,
                       "c13": 5,
                       "c10": 6}

CELLTYPE_CODES_UTR5 = {"c1": 0,
                       "c2": 1,
                       "c4": 2,
                       "c6": 3,
                       "c17": 4}

In [7]:
#src_seqs = pd.Series(pd.read_csv("../../data/raw/UTR5_sequence_counts_05_23_23.tsv", sep="\t", index_col=0, header=[0, 1, 2]).index.values)
src_seqs = pd.read_csv("../../data/UTR5_zinb_norm_singleref_2023-05-23.csv")["seq"].drop_duplicates()
src_seqs

0         ATTGCTGCAGACGCTCACCCCAGACACTCACTGCACCGGAGTGAGC...
10        TGGAAGGGCCGTGTTCGTGTTGGCAAAGAAGGTCGGCTGCTGAGCC...
20        ACTTCCGTTGAGTTCCGCCTCGCCGTTTGTCCCTTGCGGTACCCGT...
30        TTTGTCCCTTGCGGTACCCGTCCGCATACGAATCTAGCCCGGGAAC...
40        ATACGAATCTAGCCCGGGAACCGAGTTGCGGGAGTGCGGTCTGTGC...
                                ...                        
216040    CTCCGGCTCGACGCCGGCTCTCTTTTTGACGCCCCGCCGCCGGGGT...
216050    CGGCTGCGGCTGCGGCTGCGGCTGCTACTGCTACGCTCCTAGCTTG...
216060    CCTGGAGCCTCCGCGCCGGCTCAGCCTGGGGGCGGGCTCCGGTCCG...
216070    GCAGAGTCTGCGGACCCGGCGCCGAGGCGGCCACCCGAGACGCGGC...
216080    CCGTCGTCTCCTCCGCGTCCCCGCCCGCCAGCTGCTGTCGGAGGTT...
Name: seq, Length: 21609, dtype: object

In [None]:
np.random.seed(777)
sampled = src_seqs.sample(SEQUENCE_N, replace=True)
mononucl_shuffle = sampled.progress_apply(shuffle_seq)
dinucl_shuffle = sampled.progress_apply(shuffle_seq_dinucl)

In [9]:
seqs = pd.concat([pd.DataFrame({"seq": mononucl_shuffle, "shuffle": "mono"}), 
                  pd.DataFrame({"seq": dinucl_shuffle, "shuffle": "di"})]).reset_index(drop=True)
seqs

Unnamed: 0,seq,shuffle
0,CCGGAACGACGCGCAGAGCCCCCCCGTGGTAACCGCGGGGCCTGAC...,mono
1,GGTCTTACGAGGGTATCCTAAAGAGTCAGATAGGTGGCTCGTGCTC...,mono
2,CAACAGGCACGCATCTCATGGAAGCTCGCAGTGCATGAAGCACTAC...,mono
3,CGCCGCCTGTGTTGGCACGGGGGATCGAATAGGTATCGGGTCTCCT...,mono
4,GGACATTCTCCAAGGTGTGTTGATACAACGGGAGGTGCACCTGGGG...,mono
...,...,...
1999995,AGGAAGGTCGCTGGCCGTGCTCCGACCCTTGGCGCTAGGGAGAGAA...,di
1999996,GGGGATTGCTGCCGGGAAACGCGAGGGGTGGTGGGGCGGTGCGACC...,di
1999997,CCCGCACCTCACCGCTTCCCCCTTCAGGCCGCTCAGCTCGGCGTCC...,di
1999998,CAGTGCCCCACTGCTCCTGTACCTGGGGCGGCCGCGTGCTCAGTGC...,di


In [10]:
df = []
for ct_code in CELLTYPE_CODES_UTR5.keys():
    subdf = seqs.copy()
    subdf["cell_type"] = ct_code
    df.append(subdf)
df = pd.concat(df).sort_values(by=["shuffle", "seq", "cell_type"]).reset_index(drop=True)
df

Unnamed: 0,seq,shuffle,cell_type
0,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c1
1,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c17
2,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c2
3,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c4
4,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c6
...,...,...,...
9999995,TTTTTTTTTCCTTTTCCCTTCTACCGGCCCACGTAGACCCCCCCCT...,mono,c1
9999996,TTTTTTTTTCCTTTTCCCTTCTACCGGCCCACGTAGACCCCCCCCT...,mono,c17
9999997,TTTTTTTTTCCTTTTCCCTTCTACCGGCCCACGTAGACCCCCCCCT...,mono,c2
9999998,TTTTTTTTTCCTTTTCCCTTCTACCGGCCCACGTAGACCCCCCCCT...,mono,c4


In [11]:
df["diff"] = 0.0
df["mass_center"] = 0.0

In [12]:
batch_size = 1024

In [13]:
num_workers = 32

In [14]:
generated_set = utrdata.UTRData(
    df=df,
    features=("sequence", "positional", "conditions"),
    construct_type="utr5",
    augment=False,
    augment_test_time=False,
    augment_kws=dict(
        extend_left=0,
        extend_right=0,
        shift_left=0,
        shift_right=0,
        revcomp=False,
    ),
)

In [15]:
# Creating DataLoaders
dl_gen = DataLoader(
    generated_set,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=False,
    drop_last=False
)

In [16]:
ckpt_path = "../../regression_multiple/model_validation/model-utr5-deltas-epoch=9-step=840.ckpt"

In [17]:
progressbar_callback = pl.callbacks.TQDMProgressBar(refresh_rate=0.5)
trainer = pl.Trainer(
    callbacks=[progressbar_callback],
    logger=False,
    accelerator="gpu",
    devices=1,
    deterministic=True,
    # gradient_clip_val=1e-5,
    # gradient_clip_algorithm="norm",
)

loaded_model = RNARegressor.load_from_checkpoint(ckpt_path)
prediction = trainer.predict(model=loaded_model, dataloaders=dl_gen)

gen_pred, _ = zip(*prediction)
gen_pred = torch.concat(gen_pred)
gen_pred = gen_pred.numpy()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

In [18]:
gen_pred

array([[ 0.01219579,  2.1338458 ],
       [-0.06468936,  2.039298  ],
       [ 0.10998474,  2.257606  ],
       ...,
       [ 0.07770254,  2.543181  ],
       [ 0.03244496,  2.4952643 ],
       [-0.098713  ,  2.3697565 ]], dtype=float32)

In [21]:
score_df = df[["seq", "shuffle", "cell_type"]].copy()
score_df["pred_mass_center"] = gen_pred[:, 1]

In [22]:
score_df.to_csv("shuffle_result_utr5.csv.gz", index=False, compression="gzip")

In [23]:
score_df_sample = score_df.iloc[:500000]

In [24]:
score_df_sample

Unnamed: 0,seq,shuffle,cell_type,pred_mass_center
0,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c1,2.133846
1,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c17,2.039298
2,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c2,2.257606
3,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c4,2.136955
4,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c6,2.073413
...,...,...,...,...
499995,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c1,2.577312
499996,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c17,2.583788
499997,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c2,2.561730
499998,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c4,2.530760


In [25]:
score_df_sample["a"] = score_df_sample["seq"].str.count("A")
score_df_sample["c"] = score_df_sample["seq"].str.count("C")
score_df_sample["g"] = score_df_sample["seq"].str.count("G")
score_df_sample["t"] = score_df_sample["seq"].str.count("T")
score_df_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df_sample["a"] = score_df_sample["seq"].str.count("A")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df_sample["c"] = score_df_sample["seq"].str.count("C")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df_sample["g"] = score_df_sample["seq"].str.count("G")
A value is trying to b

Unnamed: 0,seq,shuffle,cell_type,pred_mass_center,a,c,g,t
0,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c1,2.133846,18,11,14,7
1,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c17,2.039298,18,11,14,7
2,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c2,2.257606,18,11,14,7
3,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c4,2.136955,18,11,14,7
4,AAAAAAAAATGCGGAGCTGGAGTGAAGCTCCACCCTAAATTCCGCA...,di,c6,2.073413,18,11,14,7
...,...,...,...,...,...,...,...,...
499995,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c1,2.577312,10,22,11,7
499996,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c17,2.583788,10,22,11,7
499997,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c2,2.561730,10,22,11,7
499998,AGATCAGAGCCTGCCAGCACTCTCCTCCAGGCTCCACCTACCGGCC...,di,c4,2.530760,10,22,11,7


---