In [12]:
import pandas as pd
import numpy as np
import torch
import pickle

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

In [9]:
# Directory prefix (with trailing slash) that is prepended to the training-set file name when loading data.
PATH = '/lustre/isaac/proj/UTK0196/deep-surface-protein-data/'

In [10]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible cells - confirm whether it is still needed.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
# Load the training table. Despite the .tsv extension the file is parsed as comma-separated,
# with the first row used as the header.
df = pd.read_csv(f'{PATH}M0059E_training_set.tsv', sep=',', header=0)

In [None]:
# Optional sub-setting of `df`. Every variant below is commented out, so the full frame is used as loaded.
# The "middle" and "edge" filters are exact complements on 'percent.identity' (boundaries 74.5 and 89.6),
# so enabling both at once would leave no rows.
#df = df.sample(115000, random_state=1097253) #random set
#df = df[(df['percent.identity'] >= 74.5) & (df['percent.identity'] < 89.6)] #middle set
#df = df[(df['percent.identity'] >= 89.6) | (df['percent.identity'] < 74.5)] #edge set

In [4]:
#df.columns

Index(['deep.ID', 'surf.ID', 'deep.sequence', 'surf.sequence',
       'percent.identity', 'alignment.length', 'bitscore'],
      dtype='object')

In [5]:
#df.shape

(5000, 7)

In [6]:
#df

Unnamed: 0,deep.ID,surf.ID,deep.sequence,surf.sequence,percent.identity,alignment.length,bitscore
0,SRR7066492_k141_369890_2,SRR7066493_k141_874768_2,MTENERKFTLVGLGEILWDVLPDGKQLGGAPANFAYHAQALGGRGI...,MTERGKYVCVGLGEILWDMLPEGKQLGGAPANFAYHAQALRGQGVV...,70.7,256,360.1
1,SRR7066492_k141_369890_2,SRR7066493_k141_1284874_2,MTENERKFTLVGLGEILWDVLPDGKQLGGAPANFAYHAQALGGRGI...,MTVDGKYLCVGLGEILWDMLPGGKQLGGAPANFAYHSQALGAQGVV...,70.4,250,355.1
2,SRR7066492_k141_443867_2,SRR7066493_k141_1612491_4,MKVALLGLLQSGKSSIFAGLSGKSIPPVGSTAIEEAIVPVPDERLD...,MKVALVGLLQSGKSTILASLSGKAIPAIGSASIEEAIVSVPDDRFD...,78.4,356,559.3
3,SRR7066492_k141_443867_2,SRR7066493_k141_1619980_3,MKVALLGLLQSGKSSIFAGLSGKSIPPVGSTAIEEAIVPVPDERLD...,MKVALIGLLQSGKSTILASLTGKAIPAIGSASIEETIVPVPDERFD...,78.1,356,555.1
4,SRR7066492_k141_443867_2,SRR7066493_k141_1671894_1,MKVALLGLLQSGKSSIFAGLSGKSIPPVGSTAIEEAIVPVPDERLD...,MKVALIGLLQSGKSTILASLTGKAVPAAGSASIEEAIVPVPDERFD...,77.5,356,550.4
...,...,...,...,...,...,...,...
4995,SRR7066492_k141_666341_2,SRR7066493_k141_242132_1,MDKDTVILVVDDEREHADGIAEAMEKLCGRAIAVYNGADALEIVRN...,MGQKEGVILIVDDERDHADGLAESLEKLCARAIAVYDGTDALQILR...,75.1,370,548.1
4996,SRR7066492_k141_666341_2,SRR7066493_k141_1618303_2,MDKDTVILVVDDEREHADGIAEAMEKLCGRAIAVYNGADALEIVRN...,MAQKAGVILVVDDERDHADGIVESLEKLCTQAIAVYNGTDALEIVR...,74.7,367,538.1
4997,SRR7066492_k141_666341_2,SRR7066493_k141_1439149_3,MDKDTVILVVDDEREHADGIAEAMEKLCGRAIAVYNGADALEIVRN...,MKQKANIILVVDDERDHADGIAEALEKLCTKAIAVYTGKDALEIVR...,73.8,370,535.4
4998,SRR7066492_k141_666341_2,SRR7066493_k141_1723797_3,MDKDTVILVVDDEREHADGIAEAMEKLCGRAIAVYNGADALEIVRN...,MAQKAGVILVVDDERDHADGIVESLEKLCTRAIAVYNGTDAMEIVR...,74.7,367,532.7


In [16]:
# Two-stage random split of the pair table with a fixed seed:
#   1. hold out 20% of the rows as the test set;
#   2. take 10% of the remaining rows as the validation set.
# Net proportions are roughly 72% train / 8% validation / 20% test.
# NOTE(review): rows are split independently, and the displayed table shows the same deep.ID
# (and deep.sequence) on several rows, so identical sequences may land in more than one split -
# confirm whether a grouped split is wanted.
df_train, df_test = train_test_split(df, random_state=1234, test_size=0.2)
df_train, df_val = train_test_split(df_train, random_state=1234, test_size=0.1)

In [17]:
# Load one split:
#split_no = 1
#inds = pickle.load(open('./splits/splits.pkl', 'rb'))

#for train_inds, test_inds in inds[split_no]:
#    train_set = df.iloc[train_inds,:]
#    test_set = df.iloc[test_inds,:]
    
#    train_seqs_list = train_set['surf.sequence'].tolist() + train_set['deep.sequence'].tolist()
#    train_seqs_labels = np.concatenate([np.zeros(train_set.shape[0]), np.ones(train_set.shape[0])])
# Pull the surface and deep sequence columns out of each split.
surf_series_train = df_train['surf.sequence']
deep_series_train = df_train['deep.sequence']

surf_series_val = df_val['surf.sequence']
deep_series_val = df_val['deep.sequence']

surf_series_test = df_test['surf.sequence']
deep_series_test = df_test['deep.sequence']


def _build_classification_frame(surf_series, deep_series):
    """Stack surface and deep sequences into one labelled frame.

    Parameters
    ----------
    surf_series : pandas.Series
        Sequences that receive label 0.
    deep_series : pandas.Series
        Sequences that receive label 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' (all surface sequences followed by all deep sequences,
        re-indexed from 0) and 'label' (0 for surface rows, 1 for deep rows).
    """
    # pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
    stacked_sequences = pd.concat([surf_series, deep_series], ignore_index=True)
    stacked_labels = [0] * surf_series.size + [1] * deep_series.size
    return pd.DataFrame({'text': stacked_sequences, 'label': stacked_labels})


classification_df_train = _build_classification_frame(surf_series_train, deep_series_train)
classification_df_val = _build_classification_frame(surf_series_val, deep_series_val)
classification_df_test = _build_classification_frame(surf_series_test, deep_series_test)

  classification_df_train = pd.DataFrame({'text' : surf_series_train.append(deep_series_train, ignore_index=True), 'label' : [0]*surf_series_train.size+[1]*deep_series_train.size})
  classification_df_val = pd.DataFrame({'text' : surf_series_val.append(deep_series_val, ignore_index=True), 'label' : [0]*surf_series_val.size+[1]*deep_series_val.size})
  classification_df_test = pd.DataFrame({'text' : surf_series_test.append(deep_series_test, ignore_index=True), 'label' : [0]*surf_series_test.size+[1]*deep_series_test.size})


In [18]:
def overlap_sequence(seq, word_length, overlap):
    """Lazily yield overlapping windows ("words") cut from ``seq``.

    Windows start every ``word_length - overlap`` positions, and start positions
    stop before ``len(seq) - overlap``. The final window may be shorter than
    ``word_length`` when the sequence runs out.

    If ``overlap`` is not smaller than ``word_length`` a message is printed and
    nothing is yielded.
    """
    if overlap >= word_length:
        print('Overlap must be less than word length')
        return

    stride = word_length - overlap
    window_starts = range(0, len(seq) - overlap, stride)
    yield from (seq[start:start + word_length] for start in window_starts)
        
def get_overlap_array(seq, word_length=5, overlap=2):
    """Return the overlapping windows of ``seq`` as a NumPy array."""
    windows = [window for window in overlap_sequence(seq, word_length, overlap)]
    return np.array(windows)

def get_overlap_string(seq, word_length=5, overlap=2):
    """Return the overlapping windows of ``seq`` joined by single spaces."""
    windows = overlap_sequence(seq, word_length, overlap)
    return ' '.join(windows)

def compute_metrics(epred):
    """Compute AUPRC and AUROC from a Hugging Face prediction output.

    Parameters
    ----------
    epred : sequence
        Indexable object where ``epred[0]`` holds the raw class scores (logits),
        one row per example and one column per class, and ``epred[1]`` holds
        the true labels.

    Returns
    -------
    dict
        ``{'auprc': ..., 'auroc': ...}``, both computed from the softmax
        probability of class 1.
    """
    logits = np.asarray(epred[0], dtype=np.float64)
    labels = epred[1]

    # Softmax must normalise across the class axis of each example. The previous
    # code summed over axis 0 (across examples), so rows were not probability
    # distributions and class 0's logit was ignored entirely.
    # Subtracting the row-wise max keeps exp() from overflowing without changing the result.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    preds = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

    positive_scores = preds[:, 1]
    return {
        'auprc': average_precision_score(labels, positive_scores),
        'auroc': roc_auc_score(labels, positive_scores),
    }

In [19]:
# Replace each raw sequence with its space-separated overlapping windows (default 5/2 windowing).
for frame in (classification_df_train, classification_df_val, classification_df_test):
    frame['text'] = frame['text'].transform(get_overlap_string)

# Median length of the windowed training strings.
# NOTE(review): len() here counts characters of the space-joined string, yet med_len is later
# passed to the tokenizer as model_max_length - confirm a character-based limit is intended.
med_len = int(np.median(list(map(len, classification_df_train['text']))))
#classification_df

In [20]:
# Wrap each labelled frame in a `datasets.Dataset`.
ds_train, ds_val, ds_test = (
    Dataset.from_pandas(frame)
    for frame in (classification_df_train, classification_df_val, classification_df_test)
)

In [21]:
# Load the tokenizer stored under 'tokenizers/AA-overlap-5_2'.
# Inputs are limited to med_len, padded on the left and truncated on the right.
tokenizer = AutoTokenizer.from_pretrained(
    'tokenizers/AA-overlap-5_2',
    model_max_length=med_len,
    padding_side='left',
    truncation_side='right',
)

In [22]:
def _tokenize_batch(batch):
    """Tokenize the 'text' column of a batch with truncation and padding enabled."""
    return tokenizer(batch['text'], truncation=True, padding=True)


# Tokenize every split in batched mode.
tokenized_ds_train = ds_train.map(_tokenize_batch, batched=True)
tokenized_ds_val = ds_val.map(_tokenize_batch, batched=True)
tokenized_ds_test = ds_test.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/331854 [00:00<?, ? examples/s]

Map:   0%|          | 0/36874 [00:00<?, ? examples/s]

Map:   0%|          | 0/92184 [00:00<?, ? examples/s]

In [24]:
#init_splits = tokenized_ds.train_test_split(test_size=0.2)

#tmp = init_splits['train']
#test_ds = init_splits['test']

#splits = tmp.train_test_split(test_size=0.1)
#train_ds = splits['train']
#val_ds = splits['test']

In [23]:
# Load the checkpoint at 'BERT-random/checkpoint-14500' with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    'BERT-random/checkpoint-14500',
    num_labels=2,
)

In [25]:
# Fine-tuning hyperparameters.
training_args = TrainingArguments(
    output_dir='./custom-model-overlap-5_2',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-4,
    weight_decay=0.01,
)

# Trainer wired to the tokenized training split, with the validation split used for evaluation.
# NOTE(review): compute_metrics is not passed here, so it only runs where it is called explicitly -
# confirm that is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_val,
    #data_collator=data_collator,
)

In [26]:
# Run fine-tuning. Left as a bare expression so the notebook displays the call's return value.
# NOTE: the saved output of this cell ends in KeyboardInterrupt, i.e. the recorded run was stopped manually.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 331854
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 103710
  Number of trainable parameters = 66955010
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: 

In [28]:
# Evaluate on the Trainer's eval_dataset. Left as a bare expression so the notebook displays the result.
# NOTE: the saved output of this cell ends in KeyboardInterrupt, i.e. the recorded run was stopped manually.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 36874
  Batch size = 32


KeyboardInterrupt: 

In [29]:
# Run inference on the held-out test split; `out` is what compute_metrics consumes afterwards.
out = trainer.predict(tokenized_ds_test)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 92184
  Batch size = 32


KeyboardInterrupt: 

In [30]:
# Score the test-set predictions, write the metrics dict (as text) to the results file, then echo it.
scores = compute_metrics(out)
with open('./results/BERT-custom-5_2-scores.txt', 'w') as scores_file:
    scores_file.write(str(scores))
print(scores)

NameError: name 'out' is not defined

In [None]:
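# NOTE: `Trainer` exposes `save_model(output_dir)`; `save_pretrained` belongs to the model/tokenizer objects.
#trainer.save_model('./models/initial')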