In [37]:
import pandas as pd
import numpy as np
import torch
import pickle
import os
import gc
import helpers

from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.nn import softmax
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score, accuracy_score, precision_score, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
PATH='/lustre/isaac/proj/UTK0196/codon-expression-data/fullTableForTrainning/'
RUN=1
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [42]:
def compute_metrics(epred):
    # Computes metrics from specialized output from huggingface
    logits = epred[0]
    probs = softmax(logits)
    preds = np.argmax(logits, axis=-1)
    labels = epred[1]

    metrics = {}
    #metrics['auprc'] = average_precision_score(labels, preds[:,1])
    metrics['auroc'] = roc_auc_score(labels, probs, multi_class='ovr', average='micro')
    metrics['accuracy'] = SparseCategoricalAccuracy()(labels, logits)
    metrics['precision'] = precision_score(labels, preds, average='micro')
    metrics['recall'] = recall_score(labels, preds, average='micro')
        
    return metrics

In [4]:
print('Reading data...')
filelist = os.listdir(PATH) 
df_list = [pd.read_csv(PATH+file) for file in filelist]
df = pd.concat(df_list)

df = helpers.add_codons_to_df(df, 'Sequence')
low, high = df.median_exp.quantile([0.33, 0.67])
high_l = np.where(df['median_exp'] > high, 2, 0)
low_l = np.where(df['median_exp'] > low, 0, 1)
labels = high_l+low_l

Reading data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['codons_cleaned'] = get_codon_list(df[col])


In [26]:
classification_df = pd.DataFrame({'text' : df['codons_cleaned'], 'label' : labels})
#MAX = int(max([(len(elem) / 3) for elem in df['codons_cleaned']])) #get max sequence length for padding
#MED = int(np.median([(len(elem) / 3) for elem in df['codons_cleaned']])) #get median sequence length for padding

#trunc_len = int((MAX + MED) / 2) #set truncation somewhere between max and median
trunc_len=1024

df_train, df_test = train_test_split(classification_df, test_size=0.001, random_state=1234)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=1234)

del classification_df

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)

del df_train
del df_val
del df_test

print('Tokenizing...')
config = AutoConfig.from_pretrained('distilbert-base-uncased', max_position_embeddings=trunc_len, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('../tokenizers/codonBERT', model_max_length=trunc_len, padding_side='left', truncation_side='right')

tokenized_ds_train = ds_train.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
tokenized_ds_val = ds_val.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
tokenized_ds_test = ds_test.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)

del ds_train
del ds_val
del ds_test
gc.collect()

torch.cuda.empty_cache()

Tokenizing...


Map:   0%|          | 0/63603 [00:00<?, ? examples/s]

Map:   0%|          | 0/7067 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

In [27]:
model = AutoModelForSequenceClassification.from_pretrained('../models/codonBERT-multi_1/checkpoint-99500', num_labels=3)

In [28]:
training_args = TrainingArguments(
    output_dir='./models/codonBERT-multi_{}'.format(RUN),
    learning_rate=2e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_val,
    tokenizer=tokenizer,
)

In [29]:
out = trainer.predict(test_dataset=tokenized_ds_test)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [30]:
out

PredictionOutput(predictions=array([[-2.5255616 ,  5.121209  , -3.4514704 ],
       [-2.1221962 ,  3.222921  , -1.7396102 ],
       [ 1.9877458 , -0.7484707 , -1.2706597 ],
       [ 2.665568  , -0.97397494, -1.8267919 ],
       [ 2.5613768 , -0.7155683 , -2.178398  ],
       [-0.8920546 , -2.7351243 ,  2.9274845 ],
       [ 0.7599996 , -0.4997601 , -0.10353538],
       [ 4.082304  , -2.512336  , -2.126844  ],
       [ 5.2003417 , -2.645346  , -2.6180964 ],
       [-0.6895545 ,  3.3915231 , -3.2710204 ],
       [ 1.2924222 ,  1.420453  , -4.0864916 ],
       [ 0.5179515 , -1.2427034 ,  0.84787834],
       [-4.1891    , -4.013499  ,  6.843054  ],
       [-1.3392583 ,  2.9082913 , -1.9447596 ],
       [-2.4363537 ,  5.179916  , -3.7035263 ],
       [ 0.07737833, -0.14201617,  0.13429223],
       [ 1.6633668 , -1.6811367 ,  0.24253425],
       [-2.6283293 , -1.9312181 ,  3.8073454 ],
       [ 3.7876704 , -1.9375799 , -2.237544  ],
       [-3.2756495 , -2.7044613 ,  5.829259  ],
       [-0.

In [43]:
scores = compute_metrics(out)
scores
# with open('./results/codonBERT-multi_{}.txt'.format(RUN),'w') as data: 
#       data.write(str(scores))

{'auroc': 0.70928387224757,
 'accuracy': <tf.Tensor: shape=(), dtype=float32, numpy=0.5492958>,
 'precision': 0.5492957746478874,
 'recall': 0.5492957746478874}