In [None]:
from models.dnabert.models import sequence_classification

In [None]:
from transformers import BertModel
from data_dir import pretrained_3kmer_dir
from utils.data_generator import _data_generator_mtl
from multitask_learning import init_model_mtl
import os
# Smoke test for the multi-task model: run every batch through the full model
# and through its `shared_layer`, printing the resulting tensor shapes.
mtl_config_path = os.path.join('models', 'config', 'mtl.json')
dataloader = _data_generator_mtl()
model = init_model_mtl(pretrained_3kmer_dir, head="bert", config=mtl_config_path)
for step, batch in enumerate(dataloader):
    # A batch yields five items: two model inputs followed by three label
    # entries (presumably one per task -- confirm against the data generator).
    input_ids, attn_mask, label_prom, label_ss, label_polya = batch
    output = model(input_ids, attn_mask)
    bert = model.shared_layer
    bert_output = bert(input_ids, attn_mask)[0]
    print(bert_output.shape)
    print(output['prom'].shape)



In [None]:
import torch
# Apply CrossEntropyLoss sample by sample: `pred` is (1, 3, 2), so each `p` is a
# (3, 2) block of logits and each `l` holds the three matching class indices.
loss_fn = torch.nn.CrossEntropyLoss()
pred = torch.tensor([[[0.5, 0.25]] * 3])
label = torch.tensor([[1, 0, 0]])
for p, l in zip(pred, label):
    loss = loss_fn(p, l)
    print(loss)

In [None]:
import torch 
# Cross-entropy for one sample with two classes whose target is class 0.
loss_fn = torch.nn.CrossEntropyLoss()
pred = torch.tensor([0.0, 0.9]).unsqueeze(0)
label = torch.tensor([0])
loss = loss_fn(input=pred, target=label)
loss

In [None]:
from transformers import BertForMaskedLM
from data_dir import pretrained_3kmer_dir
# Load the masked-LM checkpoint and count the trainable parameters of its
# `bert` sub-module (only tensors with requires_grad=True are included).
model = BertForMaskedLM.from_pretrained(pretrained_3kmer_dir)
bert_layer = model.bert
sum(
    param.numel()
    for param in bert_layer.parameters()
    if param.requires_grad
)

In [None]:
import os
import pandas as pd

# Five toy rows: a 4-letter motif repeated 128 times (512 characters) followed
# by the three label values for that row.
mtl_seq = [
    [motif * 128, prom, ss, polya]
    for motif, prom, ss, polya in (
        ("ATGC", 0, 0, 1),
        ("TGCG", 1, 0, 0),
        ("GACT", 0, 1, 0),
        ("CACG", 0, 0, 0),
        ("CCAT", 0, 0, 0),
    )
]
df = pd.DataFrame(mtl_seq, columns=["sequence", "label_prom", "label_ss", "label_polya"])

# Write the sample file, creating its directory and dropping any stale copy first.
mtl_sample_csv = os.path.join("sample", "mtl", "sample.csv")
os.makedirs(os.path.dirname(mtl_sample_csv), exist_ok=True)
if os.path.exists(mtl_sample_csv):
    os.remove(mtl_sample_csv)
df.to_csv(mtl_sample_csv, index=False)

In [None]:
from random import randint
import os
import pandas as pd
# Five toy 512-character sequences, each paired with a random per-character
# label string made of 'E' and '.'. No seed is set, so labels differ per run.
sequences = [motif * 128 for motif in ('ATGC', 'TGAC', 'GATC', "AGCC", "TGGA")]
labels = [
    ''.join('E' if randint(0, 255) % 2 == 0 else '.' for _ in range(len(s)))
    for s in sequences
]

# Recreate the output file from scratch.
seq2seq_datasample_csv = os.path.join("sample", "seq2seq", "sample.csv")
os.makedirs(os.path.dirname(seq2seq_datasample_csv), exist_ok=True)
if os.path.exists(seq2seq_datasample_csv):
    os.remove(seq2seq_datasample_csv)

df = pd.DataFrame({'sequence': sequences, 'label': labels})
df.to_csv(seq2seq_datasample_csv, index=False)



In [None]:
from sequential_labelling import init_adamw_optimizer, init_seq2seq_model
from data_dir import pretrained_3kmer_dir
from utils.utils import load_checkpoint

# Build a fresh model/optimizer pair, then overwrite both with the state
# returned by `load_checkpoint` and print what came back.
checkpoint_path = os.path.join("result", "sample", "2022-03-17", "checkpoint-4.pth")

model = init_seq2seq_model(pretrained_3kmer_dir)
optimizer = init_adamw_optimizer(model.parameters())
model, optimizer, config = load_checkpoint(checkpoint_path, model, optimizer)

for restored in (model, optimizer, config):
    print(restored)


In [None]:
# Draw a fixed-seed random subset of rows from the MTL training sample and
# save it next to the source file. The row count is defined once so that the
# description, the sample size and the output filename cannot drift apart
# (the previous description said 100 while the code drew 150).
import os
import pandas as pd

n_samples = 150
src_df = pd.read_csv(os.path.join("workspace", "mtl", "train.sample.csv"))
# random_state pins the selection so re-running yields the same rows.
target_df = src_df.sample(n_samples, random_state=1337)
target_df.to_csv(os.path.join("workspace", "mtl", f"train.sample.{n_samples}.csv"), index=False)


In [None]:
import torch

# Ask PyTorch how many CUDA devices it reports for this process.
torch.cuda.device_count()

In [None]:
from transformers import BertTokenizer
import os  # imported here so the cell does not depend on an earlier cell's import

# Load the tokenizer stored with the pretrained 3-mer checkpoint.
tokenizer = BertTokenizer.from_pretrained(os.path.join("pretrained", "3-new-12w-0"))
# A notebook cell only displays its final expression; return both attributes
# as one tuple so the special tokens are shown alongside their ids.
tokenizer.all_special_tokens, tokenizer.all_special_ids

In [None]:
import pandas as pd
import os

# The annotation table lives in a directory that shares the file's own name.
annotation_name = "genome_ucsc_gene_annotation_tables"
gene_table = os.path.join("data", annotation_name, annotation_name)
df = pd.read_csv(gene_table)
df.head(n=10)

In [None]:
import pandas as pd
import os

# Read the tab-separated GFF variant that carries a header row.
gene_annotation = os.path.join(
    "data",
    "human",
    "ncbi-genomes-2022-07-21",
    "GCF_000001405.39_GRCh38.p13_genomic.gff",
    "GCF_000001405.39_GRCh38.p13_genomic.with-header.gff",
)
df = pd.read_csv(gene_annotation, delimiter='\t')
df.head(n=10)


In [None]:
# Keep only rows whose source is "BestRefSeq" and whose type is "gene".
best_ref_seq_genes = df[df["source"].eq("BestRefSeq") & df["type"].eq("gene")]
best_ref_seq_genes.head(n=30)

In [None]:
import torch
from torch.optim import AdamW  # the class is spelled `AdamW`; `Adamw` does not exist

# AdamW cannot be constructed without parameters to optimise, so a single
# placeholder parameter is supplied to make this smoke test runnable.
# Replace it with `model.parameters()` when optimising a real model.
placeholder_params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = AdamW(placeholder_params)

In [None]:
import os
import json
from transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification, BertConfig
# Location of the pretrained checkpoint directory (relative to the working directory).
pretrained_path = os.path.join("pretrained", "3-new-12w-0")

# Load the configuration stored with the checkpoint.
config = BertConfig.from_pretrained(pretrained_path)

# Adapt the loaded configuration for token classification with 8 labels.
# NOTE(review): indexing `architectures[0]` assumes the stored config has a
# non-empty `architectures` list -- confirm for this checkpoint.
config.architectures[0] = "BertForTokenClassification"
config.num_labels = 8


In [None]:
# Token classification model built from the checkpoint at `pretrained_path`
# with the modified `config`; both names must already exist in the kernel
# (they are assigned in a previous cell).
model = BertForTokenClassification.from_pretrained(pretrained_path, config=config)
model


In [None]:
from models import DNABertForTokenClassification

# Instantiate the project's DNABertForTokenClassification from the same
# checkpoint. NOTE(review): relies on `pretrained_path` and `config` left in
# the kernel by an earlier cell; `config` is reassigned by later cells, so
# out-of-order execution may pass a different object here.
m = DNABertForTokenClassification.from_pretrained(pretrained_path, config=config)
m

In [None]:
import torch
# Forward-pass smoke test on random token ids: 5 sequences of 512 tokens drawn
# from ids 0..68.
input_ids = torch.randint(0, 69, (5, 512))
# Attend to every position. The previous all-zero mask marked every token as
# padding, so the model was exercised with nothing to attend to.
attn_mask = torch.ones(5, 512)

output = m(input_ids, attn_mask)
print(output[0].shape)


In [None]:
from transformers import BertConfig, BertForMaskedLM
import os

# Load the masked-LM checkpoint together with its stored configuration and
# display the `bert` sub-module of the resulting model.
pretrained = os.path.join("pretrained", "3-new-12w-0")
config = BertConfig.from_pretrained(pretrained)
model = BertForMaskedLM.from_pretrained(pretrained, config=config)
model.bert

In [None]:
from transformers import BertConfig, BertForMaskedLM
import os
import json

from models.seqlab import DNABERT_SL

# Take the `bert` sub-module from the pretrained masked-LM checkpoint and wrap
# it in the project's DNABERT_SL model.
pretrained = os.path.join("pretrained", "3-new-12w-0")
config = BertConfig.from_pretrained(pretrained)
model = BertForMaskedLM.from_pretrained(pretrained, config=config)
bert = model.bert

# From here on `config` is the JSON config passed to DNABERT_SL, not the
# BertConfig above. The context manager closes the file deterministically
# instead of leaking the handle from a bare open().
model_config = os.path.join("models", "config", "seqlab", "base.json")
with open(model_config, "r") as config_file:
    config = json.load(config_file)
model = DNABERT_SL(bert, config)
model

In [None]:
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import BertTokenizer
import numpy as np

def merge_kmer_token(value):
    """Split ``value`` on single spaces and pass the pieces to ``merge_kmer``.

    Args:
        value: A string of space-separated tokens.

    Returns:
        Whatever ``data_preparation.merge_kmer`` returns for the token list.
    """
    # Imported lazily so the project module is only needed when this is called.
    from data_preparation import merge_kmer

    tokens = value.split(' ')
    return merge_kmer(tokens)


# Error-analysis log (loaded for reference; not used further in this cell).
log_path = os.path.join("prediction", "error_analysis_log_sorted.csv")
log_df = pd.read_csv(log_path)

tokenizer = BertTokenizer.from_pretrained(
    os.path.join("pretrained", "3-new-12w-0")
)

# Train / validation / test splits of the sequence-labelling data set.
data_dir = os.path.join("workspace", "seqlab-latest")
train_data_path = os.path.join(data_dir, "gene_index.01_train_validation_ss_all_pos_train.csv")
train_df = pd.read_csv(train_data_path)
validation_data_path = os.path.join(data_dir, "gene_index.01_train_validation_ss_all_pos_validation.csv")
validation_df = pd.read_csv(validation_data_path)
test_data_path = os.path.join("workspace", "seqlab-latest", "gene_index.01_test_ss_all_pos.csv")
test_df = pd.read_csv(test_data_path)

# Compare each encoded test sequence with every row of the saved training
# array and keep the max / min / mean cosine similarity per test sequence.
# NOTE(review): assumes every row of train_data.npy has the same length as
# tokenizer.encode(...) produces for a test sequence -- confirm how the
# array was generated.
train_npy = np.load(os.path.join("workspace", "seqlab-latest", "train_data.npy"))
arr_scores = []
arr_max = []
arr_min = []
arr_mean = []
for i, r in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Cosine Similarity"):
    test_input_ids = tokenizer.encode(r["sequence"])
    similarity = cosine_similarity([test_input_ids], train_npy)
    scores = similarity[0]
    arr_max.append(np.max(scores))
    arr_min.append(np.min(scores))
    arr_mean.append(np.mean(scores))
    # Full score vectors are kept as strings, although the compact report
    # below leaves them out.
    scores_str = " ".join([str(a) for a in scores])
    arr_scores.append(scores_str)

dataframe = pd.DataFrame(data={
    # "score": arr_scores,
    "max": arr_max,
    "min": arr_min,
    "mean": arr_mean
})
# to_csv does not create missing directories, so make sure the target exists
# before writing (same pattern as the sample-data cells).
output_path = os.path.join("error-analysis", "data-comparison", "cosine_similarity_compact.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
dataframe.to_csv(output_path, index=False)

    

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Toy check: one query vector compared against two reference vectors.
similarity = cosine_similarity(
    X=[[1, 2, 3, 4, 5]],
    Y=[[1, 3, 5, 2, 4], [2, 4, 6, 8, 1]],
)
similarity

In [None]:
import os
import pandas as pd

# Build the path from separate components. The previous single string used
# backslashes, which are invalid escape sequences (`\s`, `\g`) in a normal
# string literal and only resolve as a path on Windows.
path = os.path.join(
    "workspace",
    "seqlab-latest",
    "gene_index.01_train_validation_ss_all_pos_validation.csv",
)
df = pd.read_csv(path)

In [None]:
# Preview the first ten rows of the frame loaded in the previous cell.
df.head(n=10)

In [None]:
def compute_length(row):
    """Return how many space-separated tokens ``row['sequence']`` contains.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) with a string
            under the ``'sequence'`` key.

    Returns:
        The number of pieces produced by splitting on single spaces.
    """
    return len(row['sequence'].split(' '))

# Row-wise token count stored as a new "length" column.
df["length"] = df.apply(compute_length, axis=1)



In [None]:
import numpy as np

# Distinct values of the "length" column (added to `df` by the previous cell).
np.unique(df["length"])

In [None]:
from data_preparation import kmer

# Tokenise a sample sequence with the project's `kmer` helper
# (arguments 3 and 1 are presumably k and stride -- confirm in data_preparation).
s = "CTCCCTGGGAGGGCGTGGATGATGGTGGGAGAGGAGCCCCACTGTGGAAGTCTGACCCCCACATCGCCCCACCTTCCCCAG"
a = kmer(s, 3, 1)
# A bare `len(a)` in the middle of a cell is discarded; print it so the token
# count is visible alongside the joined string below.
print(len(a))
b = " ".join(a)
b

In [None]:
import tensorflow as tf
# One-hot encode class index 2 out of 3 classes.
ret = tf.keras.utils.to_categorical(2, num_classes=3)
ret

In [None]:
from run_baseline_basic import preprocessing
import os
import numpy as np
import tensorflow as tf
from tf_model.wisesty import bilstm

num_classes = 3

# All three splits live in the same baseline working directory.
work_dir = os.path.join("workspace", "baseline", "basic")
training_data_path, validation_data_path, test_data_path = (
    os.path.join(work_dir, filename)
    for filename in (
        "train_validation_train.csv",
        "train_validation_validation.csv",
        "test.csv",
    )
)

# Only the validation split is preprocessed here; train/test are left disabled.
# X_train, Y_train = preprocessing(training_data_path, num_classes=num_classes)
X_val, Y_val = preprocessing(validation_data_path, num_classes=num_classes)
# X_test, Y_test = preprocessing(test_data_path, num_classes=num_classes)

model = bilstm(num_classes=num_classes)

# Peek at the first two examples before fitting.
print(X_val[:2], Y_val[:2])

history = model.fit(X_val, Y_val)


In [None]:
import torch

# Cross-entropy where the only target is -100, the default `ignore_index` of
# CrossEntropyLoss; prints the loss produced when every target is ignored.
cross_entropy = torch.nn.CrossEntropyLoss()
pred = torch.tensor([0, 0, 0.5, 1]).unsqueeze(0)
target = torch.tensor([-100])
loss = cross_entropy(input=pred, target=target)
print(loss)

In [None]:
from data_preparation import kmer

# Try the project's `kmer` helper on a short alphabetic string.
# NOTE(review): the arguments 4 and 2 are presumably k and stride -- confirm
# against data_preparation.kmer.
s = "abcdefghijk"
kmer(s, 4, 2)

In [None]:
import pandas as pd
import os

# NOTE(review): hard-coded absolute path on drive X: -- this cell only runs on
# a machine where that drive is mounted; consider a path relative to the
# project's data directory.
df = pd.read_csv(os.path.abspath("X:/ss-stride.1/gene_test.csv"))
df.head(5)

In [None]:
from dna2vec.multi_k_model import MultiKModel
import os

# Load the pretrained dna2vec file and look up the vector for the 3-mer 'AAA'.
filepath = os.path.join("pretrained", "dna2vec", "dna2vec.w2v")
mk_model = MultiKModel(filepath)
mk_model.vector('AAA')

In [1]:
from models.dnabert import DNABERTLSTMForTokenClassification, RNNConfig
import os
import json

# Checkpoint locations; `lin1_pretrained_path` is defined but not used in this cell.
pretrained_path = os.path.join("pretrained", "3-new-12w-0")
lin1_pretrained_path = os.path.join("pretrained", "dnabert-sl-lin1")
# Two-layer bidirectional RNN with hidden size 768.
rnn_config = RNNConfig(hidden_size=768, num_layers=2, bidirectional=True)

# Read the classification-head configuration. The context manager closes the
# file deterministically instead of leaking the handle from a bare open().
head_config_path = os.path.join("models", "config", "seqlab", "base.lin1.json")
with open(head_config_path, "r") as head_config_file:
    head_config = json.load(head_config_file)

model = DNABERTLSTMForTokenClassification.from_pretrained(pretrained_path, rnn_config, head_config)
model

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at pretrained\3-new-12w-0 were not used when initializing DNABERTLSTMForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing DNABERTLSTMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DNABERTLSTMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
So

DNABERTLSTMForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [2]:
import torch
# Forward-pass smoke test: 5 random sequences of 512 token ids (ids 0..68),
# with every position attended to.
batch_shape = (5, 512)
input_ids = torch.randint(0, 69, batch_shape)
attention_mask = torch.ones(batch_shape)

# The model returns three items; only the first one's shape is printed.
output, rnn_output, bert_output = model(input_ids, attention_mask)
print(output.shape)

torch.Size([5, 512, 8])
