In [3]:
import pandas as pd
import torch
import os
import numpy as np
import helpers
from datasets import Dataset
from transformers import BertConfig, BertTokenizerFast,  BertForSequenceClassification,  PreTrainedTokenizerFast
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

In [2]:
import importlib
importlib.reload(helpers)

<module 'helpers' from '/lustre/isaac/scratch/ababjac/codon-sentiment-score/bert-scripts/helpers.py'>

In [4]:
PATH='../data/amino/'

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Tells the model we need to use the GPU

In [6]:
filelist = os.listdir(PATH) 
df_list = [pd.read_csv(PATH+file, index_col=0) for file in filelist]
df = pd.concat(df_list)

In [7]:
df

Unnamed: 0,GeneName,median_exp,Sequence,codons_cleaned,amino_acids
0,Gnai3,35.00,ATGGGCTGCACGTTGAGCGCCGAGGACAAGGCGGCGGTGGAGCGGA...,ATG GGC TGC ACG TTG AGC GCC GAG GAC AAG GCG GC...,MGCTLSAEDKAAVERSKMIDRNLREDGEKAAKEVKLLLLGAGESGK...
1,Cdc45,1.45,ATGTTCGTGACCGATTTCCGCAAGGAGTTCTACGAGACGGTCCACA...,ATG TTC GTG ACC GAT TTC CGC AAG GAG TTC TAC GA...,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...
2,Scml2,1.00,ATGGCAGAGCCTGCCACTGGAGTGCAACTTGCTGGTTCTGGAGAGC...,ATG GCA GAG CCT GCC ACT GGA GTG CAA CTT GCT GG...,MAEPATGVQLAGSGELVAEPGPSSTEAREPATGVQLAGSGQLVAEQ...
3,Apoh,0.10,ATGGTTTCCCCGGTGCTCGCCTTGTTCTCCGCCTTCCTCTGCCATG...,ATG GTT TCC CCG GTG CTC GCC TTG TTC TCC GCC TT...,MVSPVLALFSAFLCHVAIAGRICPKPDDLPFATVVPLKTSYDPGEQ...
4,Narf,10.00,ATGAAGTGTGAGCACTGCACACGAAAGGAATGTAGTAAAAAATCAA...,ATG AAG TGT GAG CAC TGC ACA CGA AAG GAA TGT AG...,MKCEHCTRKECSKKSKTDDQENVSSDGAQPSDGASPAKESEEKGEF...
...,...,...,...,...,...
5031,OPT2,8.00,ATGAGTGAAACAGTCAAAGATAAAGTTATAATTGATGAGAAGGTAT...,ATG AGT GAA ACA GTC AAA GAT AAA GTT ATA ATT GA...,MSETVKDKVIIDEKVSTKGTVDYAEGAEYSERLSNHSSDFSQWYTD...
5032,SGE1,60.00,ATGAAGAGTACTTTGAGTTTAACTTTATGTGTTATATCGCTTCTAT...,ATG AAG AGT ACT TTG AGT TTA ACT TTA TGT GTT AT...,MKSTLSLTLCVISLLLTLFLAALDIVIVVTLYDTIGIKFHDFGNIG...
5033,ARR1,39.00,ATGGCAAAACCGCGTGGAAGAAAAGGCGGCAGGAAGCCTTCACTTA...,ATG GCA AAA CCG CGT GGA AGA AAA GGC GGC AGG AA...,MAKPRGRKGGRKPSLTPPKNKRAAQLRASQNAFRKRKLERLEELEK...
5034,ARR2,13.00,ATGGTAAGTTTCATAACGTCTAGGCAACTCAAGGGCCTAATTGAAA...,ATG GTA AGT TTC ATA ACG TCT AGG CAA CTC AAG GG...,MVSFITSRQLKGLIENQRKDFQVVDLRREDFARDHITNAWHVPVTA...


In [32]:
#df = helpers.add_codons_to_df(df, 'Sequence')
labels = np.where(df['median_exp'] > np.median(df['median_exp'].values), 1, 0)
df['amino_acid_spaces'] = [' '.join(seq) for seq in df['amino_acids']]

classification_df = pd.DataFrame({'text' : df['amino_acid_spaces'], 'label' : labels})

In [9]:
MAX = int(np.median([(len(elem) / 3) for elem in df['codons_cleaned']]))
MAX

502

In [18]:
def compute_metrics(epred):
    # Computes metrics from specialized output from huggingface

    preds = np.exp(epred[0]) / np.sum(np.exp(epred[0]), axis = 0)
    labels = epred[1]

    metrics = {}
    metrics['auprc'] = average_precision_score(labels, preds[:,1])
    metrics['auroc'] = roc_auc_score(labels, preds[:,1])

    return metrics

In [33]:
ds = Dataset.from_pandas(classification_df)

In [20]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [21]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [22]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [23]:
tokenizer.mask_token="[MASK]"

In [24]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

In [25]:
trainer = trainers.WordPieceTrainer(special_tokens=special_tokens)

In [26]:
batch_size = 1000
all_texts = [ds[i : i + batch_size]['text'] for i in range(0, len(ds), batch_size)]

def batch_iterator():
    for i in range(0, len(ds), batch_size):
        yield ds[i : i + batch_size]['text']

In [27]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)






In [28]:
fast_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [29]:
fast_tokenizer.mask_token='[MASK]'
fast_tokenizer.pad_token='[PAD]'
fast_tokenizer.cls_token='[CLS]'
fast_tokenizer.unk_token='[UNK]'

In [30]:
fast_tokenizer.save_pretrained('../tokenizers/aminoBERT')

('../tokenizers/aminoBERT/tokenizer_config.json',
 '../tokenizers/aminoBERT/special_tokens_map.json',
 '../tokenizers/aminoBERT/vocab.txt',
 '../tokenizers/aminoBERT/added_tokens.json',
 '../tokenizers/aminoBERT/tokenizer.json')

In [34]:
#testing it
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('../tokenizers/aminoBERT')

In [35]:
tokenized_ds = ds.map(lambda d : tokenizer(d['text'], truncation=True, padding=True, max_length=512), batched=True)

Map:   0%|          | 0/70740 [00:00<?, ? examples/s]

In [36]:
tokenized_ds['input_ids']

[[16,
  11,
  7,
  22,
  15,
  21,
  6,
  9,
  8,
  14,
  6,
  6,
  23,
  9,
  20,
  21,
  14,
  16,
  13,
  8,
  20,
  17,
  15,
  20,
  9,
  8,
  11,
  9,
  14,
  6,
  6,
  14,
  9,
  23,
  14,
  15,
  15,
  15,
  15,
  11,
  6,
  11,
  9,
  21,
  11,
  14,
  21,
  22,
  13,
  23,
  14,
  19,
  16,
  14,
  13,
  13,
  12,
  9,
  8,
  11,
  25,
  21,
  9,
  8,
  9,
  7,
  14,
  19,
  25,
  14,
  23,
  23,
  23,
  25,
  21,
  17,
  22,
  13,
  19,
  21,
  13,
  13,
  6,
  13,
  13,
  20,
  6,
  16,
  11,
  20,
  15,
  14,
  13,
  8,
  10,
  11,
  9,
  21,
  6,
  20,
  6,
  8,
  8,
  6,
  20,
  19,
  15,
  10,
  23,
  15,
  6,
  11,
  21,
  6,
  9,
  9,
  11,
  23,
  16,
  22,
  21,
  9,
  15,
  6,
  11,
  23,
  13,
  14,
  20,
  15,
  24,
  20,
  8,
  11,
  11,
  23,
  19,
  6,
  7,
  10,
  21,
  20,
  21,
  20,
  9,
  25,
  19,
  15,
  17,
  8,
  21,
  6,
  21,
  25,
  25,
  15,
  17,
  8,
  15,
  8,
  20,
  13,
  21,
  19,
  22,
  17,
  25,
  13,
  18,
  22,
  19,
  19,
  8,
  23,
  