In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
import os
import gc
import helpers

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, DistilBertPreTrainedModel, DistilBertModel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from scipy.stats import spearmanr

2023-06-10 04:30:46.105812: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-10 04:30:46.209114: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
PATH='/lustre/isaac/proj/UTK0196/codon-expression-data/fullTableForTrainning/'
RUN=1
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [3]:
def compute_metrics(epred):
    print('in compute_metrics')
    # Computes metrics from specialized output from huggingface
    #print(epred)

    logits = epred[0]
    labels = epred[1].reshape(-1,1)

    print(logits.shape, labels.shape)
    metrics = {}

    metrics['mse'] = mean_squared_error(labels, logits)
    metrics['mae'] = mean_absolute_error(labels, logits)
    metrics['r2'] = r2_score(labels, logits)
    rho, pval = spearmanr(labels, logits)
    metrics['spearmanr'] = rho
    #metrics['single_squared_errors'] = ((logits - labels).flatten()**2).tolist()

    return metrics

In [4]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        print('in compute_loss')

        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        #print(outputs)
        
        logits = outputs.logits

        print(labels.shape, logits.shape)

        # compute custom loss
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(labels.float(), logits)
        return (loss, outputs) if return_outputs else loss

#https://stackoverflow.com/questions/67689219/copy-one-layers-weights-from-one-huggingface-bert-model-to-another
def setLayers(t, s, parts):
    target = dict(t.named_parameters())
    source = dict(s.named_parameters())

    #print(any('bert.embeddings.word_embeddings.weight' for val in source.keys()))

    for part in parts:
        target[part].data.copy_(source[part].data)  
        #target[part].requires_grad = False

In [5]:
parts = [
        'bert.embeddings.word_embeddings.weight',
        'bert.embeddings.position_embeddings.weight',
        'bert.embeddings.token_type_embeddings.weight',
        'bert.embeddings.LayerNorm.weight',
        'bert.embeddings.LayerNorm.bias',
        'bert.encoder.layer.0.attention.self.query.weight',
        'bert.encoder.layer.0.attention.self.query.bias',
        'bert.encoder.layer.0.attention.self.key.weight',
        'bert.encoder.layer.0.attention.self.key.bias',
        'bert.encoder.layer.0.attention.self.value.weight',
        'bert.encoder.layer.0.attention.self.value.bias',
        'bert.encoder.layer.0.attention.output.dense.weight',
        'bert.encoder.layer.0.attention.output.dense.bias',
        'bert.encoder.layer.0.attention.output.LayerNorm.weight',
        'bert.encoder.layer.0.attention.output.LayerNorm.bias',
        'bert.encoder.layer.0.intermediate.dense.weight',
        'bert.encoder.layer.0.intermediate.dense.bias',
        'bert.encoder.layer.0.output.dense.weight',
        'bert.encoder.layer.0.output.dense.bias',
        'bert.encoder.layer.0.output.LayerNorm.weight',
        'bert.encoder.layer.0.output.LayerNorm.bias',
        'bert.encoder.layer.1.attention.self.query.weight',
        'bert.encoder.layer.1.attention.self.query.bias',
        'bert.encoder.layer.1.attention.self.key.weight',
        'bert.encoder.layer.1.attention.self.key.bias',
        'bert.encoder.layer.1.attention.self.value.weight',
        'bert.encoder.layer.1.attention.self.value.bias',
        'bert.encoder.layer.1.attention.output.dense.weight',
        'bert.encoder.layer.1.attention.output.dense.bias',
        'bert.encoder.layer.1.attention.output.LayerNorm.weight',
        'bert.encoder.layer.1.attention.output.LayerNorm.bias',
        'bert.encoder.layer.1.intermediate.dense.weight',
        'bert.encoder.layer.1.intermediate.dense.bias',
        'bert.encoder.layer.1.output.dense.weight',
        'bert.encoder.layer.1.output.dense.bias',
        'bert.encoder.layer.1.output.LayerNorm.weight',
        'bert.encoder.layer.1.output.LayerNorm.bias',
        'bert.encoder.layer.2.attention.self.query.weight',
        'bert.encoder.layer.2.attention.self.query.bias',
        'bert.encoder.layer.2.attention.self.key.weight',
        'bert.encoder.layer.2.attention.self.key.bias',
        'bert.encoder.layer.2.attention.self.value.weight',
        'bert.encoder.layer.2.attention.self.value.bias',
        'bert.encoder.layer.2.attention.output.dense.weight',
        'bert.encoder.layer.2.attention.output.dense.bias',
        'bert.encoder.layer.2.attention.output.LayerNorm.weight',
        'bert.encoder.layer.2.attention.output.LayerNorm.bias',
        'bert.encoder.layer.2.intermediate.dense.weight',
        'bert.encoder.layer.2.intermediate.dense.bias',
        'bert.encoder.layer.2.output.dense.weight',
        'bert.encoder.layer.2.output.dense.bias',
        'bert.encoder.layer.2.output.LayerNorm.weight',
        'bert.encoder.layer.2.output.LayerNorm.bias',
        'bert.encoder.layer.3.attention.self.query.weight',
        'bert.encoder.layer.3.attention.self.query.bias',
        'bert.encoder.layer.3.attention.self.key.weight',
        'bert.encoder.layer.3.attention.self.key.bias',
        'bert.encoder.layer.3.attention.self.value.weight',
        'bert.encoder.layer.3.attention.self.value.bias',
        'bert.encoder.layer.3.attention.output.dense.weight',
        'bert.encoder.layer.3.attention.output.dense.bias',
        'bert.encoder.layer.3.attention.output.LayerNorm.weight',
        'bert.encoder.layer.3.attention.output.LayerNorm.bias',
        'bert.encoder.layer.3.intermediate.dense.weight',
        'bert.encoder.layer.3.intermediate.dense.bias',
        'bert.encoder.layer.3.output.dense.weight',
        'bert.encoder.layer.3.output.dense.bias',
        'bert.encoder.layer.3.output.LayerNorm.weight',
        'bert.encoder.layer.3.output.LayerNorm.bias',
        'bert.encoder.layer.4.attention.self.query.weight',
        'bert.encoder.layer.4.attention.self.query.bias',
        'bert.encoder.layer.4.attention.self.key.weight',
        'bert.encoder.layer.4.attention.self.key.bias',
        'bert.encoder.layer.4.attention.self.value.weight',
        'bert.encoder.layer.4.attention.self.value.bias',
        'bert.encoder.layer.4.attention.output.dense.weight',
        'bert.encoder.layer.4.attention.output.dense.bias',
        'bert.encoder.layer.4.attention.output.LayerNorm.weight',
        'bert.encoder.layer.4.attention.output.LayerNorm.bias',
        'bert.encoder.layer.4.intermediate.dense.weight',
        'bert.encoder.layer.4.intermediate.dense.bias',
        'bert.encoder.layer.4.output.dense.weight',
        'bert.encoder.layer.4.output.dense.bias',
        'bert.encoder.layer.4.output.LayerNorm.weight',
        'bert.encoder.layer.4.output.LayerNorm.bias',
        'bert.encoder.layer.5.attention.self.query.weight',
        'bert.encoder.layer.5.attention.self.query.bias',
        'bert.encoder.layer.5.attention.self.key.weight',
        'bert.encoder.layer.5.attention.self.key.bias',
        'bert.encoder.layer.5.attention.self.value.weight',
        'bert.encoder.layer.5.attention.self.value.bias',
        'bert.encoder.layer.5.attention.output.dense.weight',
        'bert.encoder.layer.5.attention.output.dense.bias',
        'bert.encoder.layer.5.attention.output.LayerNorm.weight',
        'bert.encoder.layer.5.attention.output.LayerNorm.bias',
        'bert.encoder.layer.5.intermediate.dense.weight',
        'bert.encoder.layer.5.intermediate.dense.bias',
        'bert.encoder.layer.5.output.dense.weight',
        'bert.encoder.layer.5.output.dense.bias',
        'bert.encoder.layer.5.output.LayerNorm.weight',
        'bert.encoder.layer.5.output.LayerNorm.bias',
        'bert.encoder.layer.6.attention.self.query.weight',
        'bert.encoder.layer.6.attention.self.query.bias',
        'bert.encoder.layer.6.attention.self.key.weight',
        'bert.encoder.layer.6.attention.self.key.bias',
        'bert.encoder.layer.6.attention.self.value.weight',
        'bert.encoder.layer.6.attention.self.value.bias',
        'bert.encoder.layer.6.attention.output.dense.weight',
        'bert.encoder.layer.6.attention.output.dense.bias',
        'bert.encoder.layer.6.attention.output.LayerNorm.weight',
        'bert.encoder.layer.6.attention.output.LayerNorm.bias',
        'bert.encoder.layer.6.intermediate.dense.weight',
        'bert.encoder.layer.6.intermediate.dense.bias',
        'bert.encoder.layer.6.output.dense.weight',
        'bert.encoder.layer.6.output.dense.bias',
        'bert.encoder.layer.6.output.LayerNorm.weight',
        'bert.encoder.layer.6.output.LayerNorm.bias',
        'bert.encoder.layer.7.attention.self.query.weight',
        'bert.encoder.layer.7.attention.self.query.bias',
        'bert.encoder.layer.7.attention.self.key.weight',
        'bert.encoder.layer.7.attention.self.key.bias',
        'bert.encoder.layer.7.attention.self.value.weight',
        'bert.encoder.layer.7.attention.self.value.bias',
        'bert.encoder.layer.7.attention.output.dense.weight',
        'bert.encoder.layer.7.attention.output.dense.bias',
        'bert.encoder.layer.7.attention.output.LayerNorm.weight',
        'bert.encoder.layer.7.attention.output.LayerNorm.bias',
        'bert.encoder.layer.7.intermediate.dense.weight',
        'bert.encoder.layer.7.intermediate.dense.bias',
        'bert.encoder.layer.7.output.dense.weight',
        'bert.encoder.layer.7.output.dense.bias',
        'bert.encoder.layer.7.output.LayerNorm.weight',
        'bert.encoder.layer.7.output.LayerNorm.bias',
        'bert.encoder.layer.8.attention.self.query.weight',
        'bert.encoder.layer.8.attention.self.query.bias',
        'bert.encoder.layer.8.attention.self.key.weight',
        'bert.encoder.layer.8.attention.self.key.bias',
        'bert.encoder.layer.8.attention.self.value.weight',
        'bert.encoder.layer.8.attention.self.value.bias',
        'bert.encoder.layer.8.attention.output.dense.weight',
        'bert.encoder.layer.8.attention.output.dense.bias',
        'bert.encoder.layer.8.attention.output.LayerNorm.weight',
        'bert.encoder.layer.8.attention.output.LayerNorm.bias',
        'bert.encoder.layer.8.intermediate.dense.weight',
        'bert.encoder.layer.8.intermediate.dense.bias',
        'bert.encoder.layer.8.output.dense.weight',
        'bert.encoder.layer.8.output.dense.bias',
        'bert.encoder.layer.8.output.LayerNorm.weight',
        'bert.encoder.layer.8.output.LayerNorm.bias',
        'bert.encoder.layer.9.attention.self.query.weight',
        'bert.encoder.layer.9.attention.self.query.bias',
        'bert.encoder.layer.9.attention.self.key.weight',
        'bert.encoder.layer.9.attention.self.key.bias',
        'bert.encoder.layer.9.attention.self.value.weight',
        'bert.encoder.layer.9.attention.self.value.bias',
        'bert.encoder.layer.9.attention.output.dense.weight',
        'bert.encoder.layer.9.attention.output.dense.bias',
        'bert.encoder.layer.9.attention.output.LayerNorm.weight',
        'bert.encoder.layer.9.attention.output.LayerNorm.bias',
        'bert.encoder.layer.9.intermediate.dense.weight',
        'bert.encoder.layer.9.intermediate.dense.bias',
        'bert.encoder.layer.9.output.dense.weight',
        'bert.encoder.layer.9.output.dense.bias',
        'bert.encoder.layer.9.output.LayerNorm.weight',
        'bert.encoder.layer.9.output.LayerNorm.bias',
        'bert.encoder.layer.10.attention.self.query.weight',
        'bert.encoder.layer.10.attention.self.query.bias',
        'bert.encoder.layer.10.attention.self.key.weight',
        'bert.encoder.layer.10.attention.self.key.bias',
        'bert.encoder.layer.10.attention.self.value.weight',
        'bert.encoder.layer.10.attention.self.value.bias',
        'bert.encoder.layer.10.attention.output.dense.weight',
        'bert.encoder.layer.10.attention.output.dense.bias',
        'bert.encoder.layer.10.attention.output.LayerNorm.weight',
        'bert.encoder.layer.10.attention.output.LayerNorm.bias',
        'bert.encoder.layer.10.intermediate.dense.weight',
        'bert.encoder.layer.10.intermediate.dense.bias',
        'bert.encoder.layer.10.output.dense.weight',
        'bert.encoder.layer.10.output.dense.bias',
        'bert.encoder.layer.10.output.LayerNorm.weight',
        'bert.encoder.layer.10.output.LayerNorm.bias',
        'bert.encoder.layer.11.attention.self.query.weight',
        'bert.encoder.layer.11.attention.self.query.bias',
        'bert.encoder.layer.11.attention.self.key.weight',
        'bert.encoder.layer.11.attention.self.key.bias',
        'bert.encoder.layer.11.attention.self.value.weight',
        'bert.encoder.layer.11.attention.self.value.bias',
        'bert.encoder.layer.11.attention.output.dense.weight',
        'bert.encoder.layer.11.attention.output.dense.bias',
        'bert.encoder.layer.11.attention.output.LayerNorm.weight',
        'bert.encoder.layer.11.attention.output.LayerNorm.bias',
        'bert.encoder.layer.11.intermediate.dense.weight',
        'bert.encoder.layer.11.intermediate.dense.bias',
        'bert.encoder.layer.11.output.dense.weight',
        'bert.encoder.layer.11.output.dense.bias',
        'bert.encoder.layer.11.output.LayerNorm.weight',
        'bert.encoder.layer.11.output.LayerNorm.bias',
]

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Tells the model we need to use the GPU

In [7]:
print('Reading data...')
filelist = os.listdir(PATH) 
df_list = [pd.read_csv(PATH+file) for file in filelist]
df = pd.concat(df_list)

s = []
for file, _df in zip(filelist, df_list):
    species = file.partition('_')[0]
    l = [species]*len(_df)
    s.extend(l)
    
df['species'] = s

SPECIES = 'Musmusculus'

df = df[df['species'] == SPECIES] #train on only yeast sequences

df = helpers.add_codons_to_df(df, 'Sequence')
labels = normalize([np.log(df['median_exp'])])[0]

#print(labels)

#labels = labels.type(torch.LongTensor)

classification_df = pd.DataFrame({'text' : df['codons_cleaned'], 'label' : labels})
#MAX = int(max([(len(elem) / 3) for elem in df['codons_cleaned']])) #get max sequence length for padding
#MED = int(np.median([(len(elem) / 3) for elem in df['codons_cleaned']])) #get median sequence length for padding
#print(MED)
#trunc_len = int((MAX + MED) / 2) #set truncation somewhere between max and median
trunc_len = 1064

df_train, df_test = train_test_split(classification_df, test_size=0.2, random_state=1234)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=1234)

#print(len(df_val))

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)

del classification_df
del df_train
del df_test
del df_val

print('Tokenizing...')
config = AutoConfig.from_pretrained('bert-base-uncased', max_position_embeddings=trunc_len, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained('./tokenizers/codonBERT', model_max_length=trunc_len, padding_side='left', truncation_side='right')


tokenized_ds_train = ds_train.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
tokenized_ds_val = ds_val.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
tokenized_ds_test = ds_test.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
del ds_train
del ds_val
del ds_test
gc.collect()

torch.cuda.empty_cache()

Reading data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['codons_cleaned'] = get_codon_list(df[col])


Tokenizing...


Map:   0%|          | 0/12330 [00:00<?, ? examples/s]

Map:   0%|          | 0/1371 [00:00<?, ? examples/s]

Map:   0%|          | 0/3426 [00:00<?, ? examples/s]

In [9]:
print('Building Model...')
pretrained_model = AutoModelForSequenceClassification.from_pretrained('./models/codonBERT_binary_reg_celegan-pre-norm2/checkpoint-74087/')
#model = AutoModelForSequenceClassification.from_config(config)

#setLayers(model, pretrained_model, parts) #setting weights from pretrained binary classifier except for last layers

Building Model...


In [11]:
training_args = TrainingArguments(
    output_dir='./models/codonBERT_binary_reg_{}-pre-norm2-1'.format(SPECIES),
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    optim="adamw_torch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="spearmanr",
    load_best_model_at_end=True,
    remove_unused_columns=True,
)

trainer = CustomTrainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [12]:
out = trainer.predict(test_dataset=tokenized_ds_test)
logits, labels, metrics = out

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


in compute_loss
torch.Size([8]) torch.Size([8, 1])


  return F.mse_loss(input, target, reduction=self.reduction)


in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8]) torch.Size([8, 1])
in compute_loss
torch.Size([8])

KeyboardInterrupt: 

In [None]:
with open('./results/codonBERT_binary_reg_scores_{}-pre-norm2.txt'.format(SPECIES),'w') as data: 
    data.write(str(metrics))

with open('./results/codonBERT_binary_reg_output_{}-pre-norm2.txt'.format(SPECIES),'w') as data:
    for val in logits:
        data.write(str(val)+'\n')