In [4]:
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import AutoTokenizer, EsmForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score, average_precision_score

In [2]:
# Base directory prefix for the project's data files. File names are appended
# with plain string concatenation, so the trailing slash must be kept.
PATH = "/lustre/isaac/proj/UTK0196/deep-surface-protein-data/"

In [5]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [25]:
# Load the training set. The first row is the header.
# NOTE(review): the file has a .tsv extension but is parsed as comma-separated;
# confirm the delimiter against the actual file.
training_set_path = PATH + 'M0059E_training_set.tsv'
df = pd.read_csv(training_set_path, sep=',', header=0)

In [26]:
# Draw a fixed-size, reproducible random subset of rows. The commented-out
# lines below are alternative subsets selected by the 'percent.identity' column.
N_SAMPLED_ROWS = 115000
SAMPLE_SEED = 1097253
df = df.sample(n=N_SAMPLED_ROWS, random_state=SAMPLE_SEED) #random set
#df = df[(df['percent.identity'] >= 74.5) & (df['percent.identity'] < 89.6)] #middle set
#df = df[(df['percent.identity'] >= 89.6) | (df['percent.identity'] < 74.5)] #edge set

In [27]:
surf_series = df['surf.sequence']
deep_series = df['deep.sequence']

# Stack the two sequence columns into one long 'text' column: all surface
# sequences first (label 0), then all deep sequences (label 1).
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True gives a fresh 0..N-1 index that
# lines up with the positional label list.
classification_df = pd.DataFrame({
    'text': pd.concat([surf_series, deep_series], ignore_index=True),
    'label': [0] * surf_series.size + [1] * deep_series.size,
})
#classification_df

Unnamed: 0,text,label
0,MDSLSQIVIPTVTETGARGERAYDIYSLLLRERIIFLGTAINDQVA...,0
1,MRDLFTGLYTEFTSVPASSFYTALGGRLYLEEAPQGTQFPYAVFSL...,0
2,MYKRLKKELNINIEEKDGQLFLGDKKKEMRLVMLRPNEIMEFCEFT...,0
3,MKTAKQIRSEFVDFFRGKNHEIVPSSPIVPIGDETLLFANAGMNQF...,0
4,TSAEEQPYQSEPAQLPVKAKVQMLEAGVVLDDTSQRSGTYIQMDNT...,0
...,...,...
229995,MRKLGRLLTAMVTPFDAEGRVDYQQAKNLAGALLDSGSDGLIVSGT...,1
229996,METLALIGEKTFRGGAHLPEKKELTSECAISAGPAIKQATVMLSQH...,1
229997,MNTHSGDRANTKDTITGRERVLRALAYKKVDRVPVDLGGTLCSGAH...,1
229998,MREDKGDKTLFSRSEKANIDSVAPVAFRMRPRNLDEFLGQRHFLGP...,1


In [28]:
def overlap_sequence(seq, word_length, overlap):
    """Lazily yield overlapping windows ("words") of a sequence.

    Consecutive windows start ``word_length - overlap`` positions apart and
    each is the slice ``seq[start:start + word_length]``, so the final window
    may be shorter than ``word_length``.

    Parameters:
        seq: Any sliceable object with a length (e.g. a string).
        word_length: Number of elements per window.
        overlap: Number of elements shared by adjacent windows; must be
            smaller than ``word_length``.

    Yields:
        Slices of ``seq``. If ``overlap >= word_length`` a message is printed
        (when iteration starts) and nothing is yielded.
    """
    stride = word_length - overlap
    if stride <= 0:
        print('Overlap must be less than word length')
        return

    # Starts stop `overlap` short of the end so that no window consists solely
    # of elements already covered by the previous window's tail.
    window_starts = range(0, len(seq) - overlap, stride)
    yield from (seq[start:start + word_length] for start in window_starts)
        
def get_overlap_array(seq, word_length=5, overlap=2):
    """Return the overlapping windows of ``seq`` as a NumPy array.

    The windows come from ``overlap_sequence(seq, word_length, overlap)``;
    they are collected into a list and converted with ``np.array``.
    """
    windows = list(overlap_sequence(seq, word_length, overlap))
    return np.array(windows)

def get_overlap_string(seq, word_length=5, overlap=2):
    """Return the overlapping windows of ``seq`` joined by single spaces.

    The windows come from ``overlap_sequence(seq, word_length, overlap)``.
    """
    windows = overlap_sequence(seq, word_length, overlap)
    return ' '.join(windows)

def compute_metrics(epred):
    """Compute AUPRC and AUROC from a Hugging Face prediction object.

    Parameters:
        epred: An indexable prediction result where ``epred[0]`` holds the
            logits, one row per example and one column per class, and
            ``epred[1]`` holds the true labels.

    Returns:
        dict with keys ``'auprc'`` (average precision) and ``'auroc'``
        (ROC AUC), both scored on the probability of class 1.
    """
    logits = np.asarray(epred[0], dtype=np.float64)
    labels = epred[1]

    # Softmax over the class axis (last axis), one distribution per example.
    # Normalising over axis 0 would divide by a sum across examples, which is
    # not a probability and changes the ranking the metrics are based on.
    # Subtracting the row maximum avoids overflow in exp() for large logits
    # without changing the result.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_logits = np.exp(shifted)
    preds = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

    positive_scores = preds[:, 1]

    metrics = {}
    metrics['auprc'] = average_precision_score(labels, positive_scores)
    metrics['auroc'] = roc_auc_score(labels, positive_scores)

    return metrics

In [9]:
#classification_df['text'] = classification_df['text'].transform(get_overlap_string)
#classification_df

In [29]:
# Convert the labelled pandas DataFrame ('text' / 'label' columns) into a
# Hugging Face `datasets.Dataset`.
ds = Dataset.from_pandas(classification_df)

In [30]:
# Hub id of the pretrained ESM-2 checkpoint whose tokenizer is loaded here.
ESM_CHECKPOINT = "facebook/esm2_t6_8M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(ESM_CHECKPOINT)

loading file vocab.txt from cache at /nfs/home/ababjac/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /nfs/home/ababjac/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/special_tokens_map.json
loading file tokenizer_config.json from cache at /nfs/home/ababjac/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/tokenizer_config.json


In [9]:
#tokenizer.is_fast

False

In [31]:
def _tokenize_batch(batch):
    """Tokenize the 'text' column of one batch of examples.

    No padding is applied here: the Trainer is constructed with `tokenizer=`,
    so it pads each training/eval batch to that batch's longest sequence.
    Padding inside `map` would instead pad every example to the longest
    sequence of its whole map chunk, wasting memory and compute.
    `return_tensors` is also omitted because `datasets` stores the result as
    Arrow columns, not as the tensors returned by the tokenizer.

    NOTE(review): no truncation is applied (as before); confirm the longest
    sequence fits in memory or add `truncation=True, max_length=...`.
    """
    return tokenizer(batch['text'])

tokenized_ds = ds.map(_tokenize_batch, batched=True)

100%|█████████████████████████████████████████| 230/230 [02:46<00:00,  1.38ba/s]


In [32]:
# Fixed seed so the train/validation/test membership is identical on every
# run; without it each execution produces a different split and results are
# not comparable between runs.
SPLIT_SEED = 1097253

# First hold out 20% of all examples as the test set.
init_splits = tokenized_ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)

tmp = init_splits['train']
test_ds = init_splits['test']

# Then hold out 10% of the remaining 80% for validation
# (overall: 72% train / 8% validation / 20% test).
splits = tmp.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_ds = splits['train']
val_ds = splits['test']

In [33]:
# Load the pretrained ESM-2 checkpoint with a sequence-classification head,
# then move the model to the selected device.
model = EsmForSequenceClassification.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = model.to(device)
#logits = model(**inputs).logits
#predicted_class_id = logits.argmax().item()

loading configuration file config.json from cache at /nfs/home/ababjac/.cache/huggingface/hub/models--facebook--esm2_t6_8M_UR50D/snapshots/97bbd7b3dc8acb222027cde0d8e911f3de78d5b8/config.json
Model config EsmConfig {
  "_name_or_path": "/tmp/facebook/esm2_t6_8M_UR50D",
  "architectures": [
    "EsmForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "emb_layer_norm_before": false,
  "esmfold_config": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 320,
  "initializer_range": 0.02,
  "intermediate_size": 1280,
  "is_folding_model": false,
  "layer_norm_eps": 1e-05,
  "mask_token_id": 32,
  "max_position_embeddings": 1026,
  "model_type": "esm",
  "num_attention_heads": 20,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "rotary",
  "token_dropout": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "use_cache": true,
  "vocab_list": null,
  "vocab_size": 33
}

loading weight

In [36]:
# Hyperparameters for fine-tuning; checkpoints are written under output_dir.
training_kwargs = {
    'output_dir': './ESM-random',
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 32,
    'per_device_eval_batch_size': 32,
    'num_train_epochs': 5,
    'weight_decay': 0.01,
}
training_args = TrainingArguments(**training_kwargs)

# Build the Trainer. `compute_metrics` is passed so that evaluation and
# prediction report AUPRC/AUROC in addition to the loss; without it
# `trainer.evaluate()` returns only the loss.
# Because `tokenizer` is supplied and no data_collator is given, the Trainer
# pads each batch with the tokenizer by default.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
# Fine-tune the model on the Trainer's training dataset.
trainer.train()

The following columns in the training set don't have a corresponding argument in `EsmForSequenceClassification.forward` and have been ignored: text. If text are not expected by `EsmForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 165600
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 207000


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the current model on the Trainer's eval dataset.
trainer.evaluate()

In [None]:
# Run the model on the held-out test split and keep the prediction output.
out = trainer.predict(test_dataset=test_ds)

In [None]:
# Score the test-set predictions and save the metrics dict (as its string
# representation) next to the data files.
scores = compute_metrics(out)
results_path = PATH + 'ESM-random-test.txt'
with open(results_path, 'w') as results_file:
    results_file.write(str(scores))

In [17]:
#trainer.save_pretrained('./models/initial-esm')  # fails: Trainer has no save_pretrained; use trainer.save_model('./models/initial-esm') instead

AttributeError: 'Trainer' object has no attribute 'save_pretrained'