In [14]:
import pandas as pd
import numpy as np
import torch
import pickle
import os
import helpers

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

In [16]:
import importlib
importlib.reload(helpers)

<module 'helpers' from '/lustre/isaac/scratch/ababjac/codon-sentiment-score/helpers.py'>

In [36]:
def compute_metrics(epred):
    # Computes metrics from specialized output from huggingface

    probs = np.exp(epred[0]) / np.sum(np.exp(epred[0]), axis = 0)
    preds = np.round(probs[:,0])
    
    print(probs[:,1], preds)
    labels = epred[1]

    metrics = {}
    #metrics['auprc'] = average_precision_score(labels, preds[:,1])
    metrics['auroc'] = roc_auc_score(labels, probs[:,1])
    #metrics['accuracy'] = accuracy_score(labels, preds)
    #metrics['precision'] = precision_score(labels, preds)
    #metrics['recall'] = recall_score(labels, preds)

    return metrics

In [9]:
PATH='/lustre/isaac/proj/UTK0196/codon-expression-data/fullTableForTrainning/'

In [18]:
filelist = os.listdir(PATH) 
df_list = [pd.read_csv(PATH+file) for file in filelist]
df = pd.concat(df_list)

df = helpers.add_codons_to_df(df, 'Sequence')
labels = np.where(df['median_exp'] > np.median(df['median_exp'].values), 1, 0)

classification_df = pd.DataFrame({'text' : df['codons_cleaned'], 'label' : labels})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['codons_cleaned'] = get_codon_list(df[col])


In [19]:
trunc_len = 2048

df_train, df_test = train_test_split(classification_df, test_size=0.2, random_state=1234)
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=1234)

ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)


print('Tokenizing...')
config = AutoConfig.from_pretrained('distilbert-base-uncased', max_position_embeddings=2048)
tokenizer = AutoTokenizer.from_pretrained('./tokenizers/codonBERT', model_max_length=trunc_len, padding_side='left', truncation_side='right')


tokenized_ds_train = ds_train.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
tokenized_ds_val = ds_val.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)
tokenized_ds_test =  ds_test.map(lambda d : tokenizer(d['text'], truncation=True, padding=True), batched=True)

Tokenizing...


Map:   0%|          | 0/50932 [00:00<?, ? examples/s]

Map:   0%|          | 0/5660 [00:00<?, ? examples/s]

Map:   0%|          | 0/14149 [00:00<?, ? examples/s]

In [21]:
model = AutoModelForSequenceClassification.from_config(config) #randomly initialize it
RUN = 0
training_args = TrainingArguments(
    output_dir='./models/codonBERT-binary_{}'.format(RUN),
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_val,
    tokenizer=tokenizer,
)


print('Training...')
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, text, __index_level_0__. If token_type_ids, text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 50932
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1592
  Number of trainable parameters = 68134658
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...


Step,Training Loss


KeyboardInterrupt: 

In [23]:
#trainer.evaluate()
out = trainer.predict(test_dataset=tokenized_ds_test)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, text, __index_level_0__. If token_type_ids, text, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 14149
  Batch size = 32


In [37]:
scores = compute_metrics(out)

[6.9903341e-05 7.0558432e-05 7.0396345e-05 ... 7.0547801e-05 7.0555980e-05
 7.0764756e-05] [0. 0. 0. ... 0. 0. 0.]
