In [1]:
import sys

from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm as tqdm_auto
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import pytorch_lightning as pl

In [2]:
# Register tqdm's pandas integration so Series/DataFrame objects gain
# `progress_apply`; `tqdm` here is the notebook variant imported above.
tqdm.pandas()

In [3]:
# Extend the import search path so the local `shuffle_dinucl` module can be found.
# NOTE(review): the path is relative, so this only resolves when the kernel's
# working directory is the notebook's own directory — confirm before re-running.
sys.path.append("../verification/")
from shuffle_dinucl import shuffle_seq, shuffle_seq_dinucl

In [4]:
# Extend the import search path so the project-local dataset and model modules
# can be imported.
# NOTE(review): relative path — depends on the kernel's working directory.
sys.path.append("../../regression_multiple")
import dataset_regression as utrdata
from lit_regressor import RNARegressor

## Creating dataset

In [5]:
# Number of source sequences drawn (with replacement) before shuffling; each draw
# yields one mononucleotide-shuffled and one dinucleotide-shuffled sequence.
SEQUENCE_N = 1_000_000

In [6]:
# Cell-type label -> integer code lookups. Codes are the positions of the labels
# in the tuples below, so the tuple order is significant.
# "c10" (which would take code 6) is intentionally left out of the UTR3 table.
CELLTYPE_CODES_UTR3 = dict(
    zip(("c1", "c2", "c4", "c6", "c17", "c13"), range(6))
)

# The UTR5 table shares the first five labels/codes of the UTR3 table.
CELLTYPE_CODES_UTR5 = dict(
    zip(("c1", "c2", "c4", "c6", "c17"), range(5))
)

In [7]:
# Pool of unique source sequences that will be sampled and shuffled below.
# Only the "seq" column is needed, so restrict parsing to it with `usecols`
# instead of loading every column of the table; the resulting Series (values
# and index labels) is the same.
#
# Alternative UTR5 source, kept for reference:
#src_seqs = pd.Series(pd.read_csv("../../data/raw/UTR5_sequence_counts_05_23_23.tsv", sep="\t", index_col=0, header=[0, 1, 2]).index.values)
src_seqs = pd.read_csv(
    "../../data/UTR3_zinb_norm_singleref_2023-05-23.csv",
    usecols=["seq"],
)["seq"].drop_duplicates()
src_seqs

0         TGCAGTTTTGACCTCCCAGGCTCAAGCGATCCTCCTGCCTCAGCCT...
11        ATCAAAAAGCAGGCCAGATTCTAATCAAAATCAGGTAAATTTTAAT...
22        ATTTTAGTTTGCCCAAATAATATCTTGAAAATGCTCTGAATTTTAC...
33        TTTCCCTTTTTCCCTCCCTCTATTCTCTTCACTAACATTGGAAATT...
44        TTCCTTCAGAGAATTCTCCTTTTCTTCTATGTTTCTGACTGCAATA...
                                ...                        
312631    AATGGTAAAGATTTTTACTCAGTGTGTGCTGAAACACTAAATTAAA...
312642    GCTAAAGCATTGCTTATTCAGTGGTATTCAGTAGATAAGATCTATT...
312653    TGTGCTTCCTAAGAGTACAAACCTGAGCATATGTCCAGGCTTGCAA...
312664    TAGGTGGTGATCTTAAATGGGTGAGATGGAACGAGAGCACACATTA...
312675    AGGAGGCAACTGTGGCATTGCTTCCTTAACCAGCTCATGGTGTGTG...
Name: seq, Length: 28426, dtype: object

In [8]:
# Seed NumPy's global RNG. `Series.sample` is called without `random_state`, in
# which case pandas draws from NumPy's global state, so the draw below is
# governed by this seed.
# NOTE(review): whether the shuffle helpers also draw from NumPy's global RNG is
# not visible here — if they use another generator (e.g. `random`), the shuffles
# are not reproducible from this seed alone. Confirm in shuffle_dinucl.
np.random.seed(777)
# Draw SEQUENCE_N source sequences with replacement (the same source sequence
# may be drawn more than once).
sampled = src_seqs.sample(SEQUENCE_N, replace=True)
# Apply each shuffle helper to every sampled sequence, with a progress bar
# (`progress_apply` is provided by the earlier `tqdm.pandas()` call).
# Statement order matters if the helpers consume shared RNG state.
mononucl_shuffle = sampled.progress_apply(shuffle_seq)
dinucl_shuffle = sampled.progress_apply(shuffle_seq_dinucl)

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [9]:
# Stack both shuffle variants into one long table with a fresh 0..N-1 index;
# the "shuffle" column records which procedure produced each sequence.
seqs = pd.concat(
    [
        mononucl_shuffle.to_frame(name="seq").assign(shuffle="mono"),
        dinucl_shuffle.to_frame(name="seq").assign(shuffle="di"),
    ],
    ignore_index=True,
)
seqs

Unnamed: 0,seq,shuffle
0,GAATGAAACGAATATAACTACTAATTAACTATATTAATTCTTTTAA...,mono
1,TTCACGTCTCGTTTATGCAGCCATTGTACTTTGACTTGTGAATAAG...,mono
2,CTATATACAGTCCACGATCTTTTTCCTGGATTTACCTCAGACTAAT...,mono
3,AATCCAAGGGCACCGAGGAAATCGGCCCCCCTGAGAAAACTCCACG...,mono
4,GAATATCCAGCGTGTGATCGGCATCTGTTGCTTGTATAAAATTAGA...,mono
...,...,...
1999995,GGGCTTGTAATGTCCTAATCAGAAAGTTAGAGACCCCCTCGAGTTC...,di
1999996,GTCCAGCAATCTGTGGTCCTTTAATCAACCCACCAGGCCTGGTCCA...,di
1999997,GCATTTATGTCAAATGATAAAAAACCTCATGGTTGACTAAGCTACT...,di
1999998,ACTGGTGCATTGTGTATTCCATCTGAACTGCCATTAGGTCAGCAAG...,di


In [10]:
# Cross every shuffled sequence with every UTR3 cell type (one row per
# sequence/cell-type pair), then order the rows by shuffle kind, sequence and
# cell type and renumber the index.
df = (
    pd.concat(
        [seqs.assign(cell_type=cell_type) for cell_type in CELLTYPE_CODES_UTR3]
    )
    .sort_values(by=["shuffle", "seq", "cell_type"])
    .reset_index(drop=True)
)
df

Unnamed: 0,seq,shuffle,cell_type
0,AAAAAAAAAAAAAAAGTTCTTTGTAGGACAAACCAGGCATCTTGGA...,di,c1
1,AAAAAAAAAAAAAAAGTTCTTTGTAGGACAAACCAGGCATCTTGGA...,di,c13
2,AAAAAAAAAAAAAAAGTTCTTTGTAGGACAAACCAGGCATCTTGGA...,di,c17
3,AAAAAAAAAAAAAAAGTTCTTTGTAGGACAAACCAGGCATCTTGGA...,di,c2
4,AAAAAAAAAAAAAAAGTTCTTTGTAGGACAAACCAGGCATCTTGGA...,di,c4
...,...,...,...
11999995,TTTTTTTTTTTTTATCGCGATTCTATTTTTATGTTTCTTTGAAACA...,mono,c13
11999996,TTTTTTTTTTTTTATCGCGATTCTATTTTTATGTTTCTTTGAAACA...,mono,c17
11999997,TTTTTTTTTTTTTATCGCGATTCTATTTTTATGTTTCTTTGAAACA...,mono,c2
11999998,TTTTTTTTTTTTTATCGCGATTCTATTTTTATGTTTCTTTGAAACA...,mono,c4


In [11]:
# Add zero-filled "diff" and "mass_center" columns.
# NOTE(review): presumably placeholder targets that the dataset class expects to
# find in the frame — the generated sequences have no measured values. Confirm
# against dataset_regression.UTRData.
for placeholder_column in ("diff", "mass_center"):
    df[placeholder_column] = 0.0

In [12]:
# Number of rows per batch fed to the model by the prediction DataLoader.
batch_size = 1024

In [13]:
# Number of DataLoader worker processes used to prepare batches.
# NOTE(review): hard-coded for the original machine — adjust to the available CPU cores.
num_workers = 32

In [14]:
# Dataset over the generated sequence/cell-type table. Augmentation is switched
# off and every augmentation setting is zero/False, so sequences are passed to
# the dataset without augmentation being requested.
no_augmentation_kws = {
    "extend_left": 0,
    "extend_right": 0,
    "shift_left": 0,
    "shift_right": 0,
    "revcomp": False,
}
generated_set = utrdata.UTRData(
    df=df,
    features=("sequence", "positional", "conditions"),
    construct_type="utr3",
    augment=False,
    augment_test_time=False,
    augment_kws=no_augmentation_kws,
)

In [15]:
# Prediction loader: batches are served in dataset order (no shuffling) and the
# final partial batch is kept, so predictions line up row-for-row with `df`.
dl_gen = DataLoader(
    dataset=generated_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
)

In [16]:
# Checkpoint of the trained model that is loaded for prediction below.
# NOTE(review): relative path — resolves against the kernel's working directory.
ckpt_path = "../../regression_multiple/model_validation/model-utr3-deltas-epoch=9-step=1330.ckpt"

In [17]:
# Run the trained model over every generated (sequence, cell type) row on a
# single GPU and collect the predictions as one NumPy array.

# `refresh_rate` is documented as an integer number of batches between progress
# bar updates; 1 refreshes on every batch (the original passed the float 0.5).
progressbar_callback = pl.callbacks.TQDMProgressBar(refresh_rate=1)
trainer = pl.Trainer(
    callbacks=[progressbar_callback],
    logger=False,
    accelerator="gpu",
    devices=1,
    deterministic=True,
)

loaded_model = RNARegressor.load_from_checkpoint(ckpt_path)
prediction = trainer.predict(model=loaded_model, dataloaders=dl_gen)

# `prediction` holds one output per batch, each of which is a pair; only the
# first element of every pair is kept. Indexing (rather than unpacking the
# second element into `_`) avoids rebinding `_`, which IPython uses for the
# last cell output.
gen_pred = torch.cat([batch_output[0] for batch_output in prediction]).numpy()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

In [18]:
# Inspect the raw prediction array (one row per dataset row, two columns).
# Column 1 is used below as the predicted mass center.
# NOTE(review): column 0 is presumably the predicted "diff" — confirm against the model.
gen_pred

array([[-0.02643219,  2.4683876 ],
       [-0.00552829,  2.5085034 ],
       [ 0.06912179,  2.585143  ],
       ...,
       [ 0.03560264,  2.8178961 ],
       [-0.09097424,  2.7718694 ],
       [ 0.05196464,  2.8543499 ]], dtype=float32)

In [19]:
# Long-format result table: the identifying columns of `df` plus the second
# prediction column, stored as the predicted mass center. Row order follows `df`.
score_df = df.loc[:, ["seq", "shuffle", "cell_type"]].assign(
    pred_mass_center=gen_pred[:, 1]
)

In [21]:
# Persist the long-format predictions (one row per sequence/cell-type pair) as a
# gzip-compressed CSV in the working directory, without the index column.
score_df.to_csv("shuffle_result_utr3.csv.gz", index=False, compression="gzip")

In [22]:
# Reshape to wide format: one row per (sequence, shuffle kind) and one column of
# predicted mass centers per cell type. `pivot` requires each
# (seq, shuffle, cell_type) combination to be unique.
score_df_pivot = score_df.pivot(
    index=["seq", "shuffle"],
    columns="cell_type",
    values="pred_mass_center",
)
# Turn the (seq, shuffle) index levels back into ordinary columns.
score_df_pivot = score_df_pivot.reset_index()
score_df_pivot

cell_type,seq,shuffle,c1,c13,c17,c2,c4,c6
0,AAAAAAAAAAAAAAAGTTCTTTGTAGGACAAACCAGGCATCTTGGA...,di,2.468388,2.508503,2.585143,2.486583,2.485378,2.455960
1,AAAAAAAAAAAAGGGTTTTGTTTAAAAAAAAATTCCCTAAACTCAC...,di,2.538426,2.535110,2.648535,2.563907,2.506713,2.688980
2,AAAAAAAAAAAAGTCAAAACAAATGTGGATGTCCCATGCTACTTCA...,di,2.294210,2.388838,2.361428,2.253601,2.519911,2.450523
3,AAAAAAAAAAAATTCAGGCACTGGCAAGGCTAACCGGTATTAACTC...,di,2.390013,2.446905,2.168969,2.307835,2.222588,2.430125
4,AAAAAAAAAAACACAGCCATTTATAATCTACTAAAGGATTCACATG...,di,2.546810,2.543269,2.653380,2.625014,2.599340,2.676578
...,...,...,...,...,...,...,...,...
1999995,TTTTTTTTTTTTCTTTTTCTATCTTGGACAGCTCCCCAGCTTCCGA...,di,2.725799,2.700554,2.996952,2.833940,2.924107,2.855114
1999996,TTTTTTTTTTTTTATCGCGATTCTATTTTTATGTTTCTTTGAAACA...,mono,3.016410,2.672373,2.847543,2.817896,2.771869,2.854350
1999997,TTTTTTTTTTTTTCAACCCAAGTCTGTGTCTAACCCTTGACAAAGT...,di,2.730782,2.662123,2.831269,2.724526,2.585554,2.535211
1999998,TTTTTTTTTTTTTGTCCCACAGAAGAATTTCCAGACACCCCTTTTA...,di,2.890640,2.736877,2.925937,2.866374,2.752781,2.941293


In [23]:
# Persist the wide-format (per-cell-type columns) predictions as a
# gzip-compressed CSV in the working directory, without the index column.
score_df_pivot.to_csv("shuffle_result_pivot_utr3.csv.gz", index=False, compression="gzip")

In [None]:
# Take the first 500,000 rows as an independent copy, so the subset can be
# modified without pandas raising SettingWithCopyWarning (or, depending on the
# pandas version, without any ambiguity about whether `score_df` is affected).
# NOTE(review): `score_df` inherits the row order of `df`, which is sorted by
# shuffle kind, then sequence, then cell type — so this slice is the start of
# that ordering, not a random sample. Use `score_df.sample(...)` if a
# representative subset is intended.
score_df_sample = score_df.iloc[:500000].copy()

In [None]:
# Preview the subset selected above.
score_df_sample

In [None]:
# Add per-sequence nucleotide counts as columns "a", "c", "g", "t" (occurrences
# of the upper-case letter in "seq"). `assign` builds a new frame instead of
# writing column-by-column into a frame that was obtained by slicing, which
# avoids SettingWithCopyWarning and replaces four copy-pasted statements.
score_df_sample = score_df_sample.assign(
    **{
        nucleotide.lower(): score_df_sample["seq"].str.count(nucleotide)
        for nucleotide in "ACGT"
    }
)
score_df_sample

---