In [1]:
import pandas as pd
import torch
import os
import numpy as np
import helpers
from datasets import Dataset
from transformers import DistilBertConfig, DistilBertTokenizerFast,  DistilBertForSequenceClassification,  PreTrainedTokenizerFast
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

In [2]:
import importlib
importlib.reload(helpers)

<module 'helpers' from '/lustre/isaac/scratch/ababjac/codon-sentiment-score/bert-scripts/helpers.py'>

In [3]:
PATH='/lustre/isaac/proj/UTK0196/codon-expression-data/fullTableForTrainning/'

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Tells the model we need to use the GPU

In [5]:
filelist = os.listdir(PATH) 
df_list = [pd.read_csv(PATH+file) for file in filelist]
df = pd.concat(df_list)

In [6]:
df

Unnamed: 0,GeneName,median_exp,Sequence
0,Gnai3,35.00,ATGGGCTGCACGTTGAGCGCCGAGGACAAGGCGGCGGTGGAGCGGA...
1,Cdc45,1.45,ATGTTCGTGACCGATTTCCGCAAGGAGTTCTACGAGACGGTCCACA...
2,Scml2,1.00,ATGGCAGAGCCTGCCACTGGAGTGCAACTTGCTGGTTCTGGAGAGC...
3,Apoh,0.10,ATGGTTTCCCCGGTGCTCGCCTTGTTCTCCGCCTTCCTCTGCCATG...
4,Narf,10.00,ATGAAGTGTGAGCACTGCACACGAAAGGAATGTAGTAAAAAATCAA...
...,...,...,...
5031,OPT2,8.00,ATGAGTGAAACAGTCAAAGATAAAGTTATAATTGATGAGAAGGTAT...
5032,SGE1,60.00,ATGAAGAGTACTTTGAGTTTAACTTTATGTGTTATATCGCTTCTAT...
5033,ARR1,39.00,ATGGCAAAACCGCGTGGAAGAAAAGGCGGCAGGAAGCCTTCACTTA...
5034,ARR2,13.00,ATGGTAAGTTTCATAACGTCTAGGCAACTCAAGGGCCTAATTGAAA...


In [7]:
df = helpers.add_codons_to_df(df, 'Sequence')
labels = np.where(df['median_exp'] > np.median(df['median_exp'].values), 1, 0)

classification_df = pd.DataFrame({'text' : df['codons_cleaned'], 'label' : labels})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['codons_cleaned'] = get_codon_list(df[col])


In [8]:
MAX = int(np.median([(len(elem) / 3) for elem in df['codons_cleaned']]))
MAX

502

In [9]:
def compute_metrics(epred):
    # Computes metrics from specialized output from huggingface

    preds = np.exp(epred[0]) / np.sum(np.exp(epred[0]), axis = 0)
    labels = epred[1]

    metrics = {}
    metrics['auprc'] = average_precision_score(labels, preds[:,1])
    metrics['auroc'] = roc_auc_score(labels, preds[:,1])

    return metrics

In [10]:
ds = Dataset.from_pandas(classification_df)

In [11]:
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [12]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [13]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [14]:
tokenizer.mask_token="[MASK]"

In [15]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

In [16]:
trainer = trainers.WordPieceTrainer(special_tokens=special_tokens)

In [17]:
batch_size = 1000
all_texts = [ds[i : i + batch_size]['text'] for i in range(0, len(ds), batch_size)]

def batch_iterator():
    for i in range(0, len(ds), batch_size):
        yield ds[i : i + batch_size]['text']

In [18]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)






In [19]:
fast_tokenizer = DistilBertTokenizerFast(tokenizer_object=tokenizer)

In [20]:
fast_tokenizer.mask_token='[MASK]'
fast_tokenizer.pad_token='[PAD]'
fast_tokenizer.cls_token='[CLS]'
fast_tokenizer.unk_token='[UNK]'

In [21]:
fast_tokenizer.save_pretrained('../tokenizers/codonBERT2')

('../tokenizers/codonBERT2/tokenizer_config.json',
 '../tokenizers/codonBERT2/special_tokens_map.json',
 '../tokenizers/codonBERT2/vocab.txt',
 '../tokenizers/codonBERT2/added_tokens.json',
 '../tokenizers/codonBERT2/tokenizer.json')

In [22]:
#testing it
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('../tokenizers/codonBERT2')

In [23]:
tokenized_ds = ds.map(lambda d : tokenizer(d['text'], truncation=True), batched=True)

Map:   0%|          | 0/70741 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [24]:
tokenized_ds['input_ids']

[[40,
  56,
  80,
  84,
  53,
  63,
  46,
  30,
  42,
  32,
  82,
  82,
  44,
  30,
  92,
  63,
  32,
  40,
  48,
  42,
  86,
  43,
  53,
  92,
  30,
  42,
  88,
  30,
  36,
  82,
  46,
  36,
  29,
  44,
  32,
  37,
  37,
  37,
  68,
  56,
  47,
  49,
  29,
  57,
  65,
  36,
  72,
  59,
  48,
  44,
  36,
  38,
  40,
  36,
  48,
  41,
  73,
  30,
  42,
  56,
  67,
  66,
  30,
  42,
  29,
  83,
  36,
  38,
  67,
  36,
  87,
  54,
  69,
  61,
  63,
  39,
  60,
  41,
  38,
  64,
  48,
  41,
  55,
  48,
  78,
  91,
  46,
  40,
  49,
  92,
  53,
  32,
  41,
  35,
  50,
  88,
  29,
  57,
  46,
  70,
  55,
  35,
  35,
  46,
  91,
  38,
  77,
  50,
  54,
  77,
  47,
  88,
  72,
  47,
  29,
  29,
  49,
  69,
  40,
  60,
  66,
  29,
  85,
  55,
  56,
  44,
  41,
  36,
  90,
  77,
  76,
  91,
  35,
  56,
  88,
  87,
  38,
  55,
  80,
  50,
  63,
  89,
  64,
  89,
  29,
  67,
  38,
  68,
  39,
  35,
  57,
  47,
  66,
  61,
  61,
  85,
  39,
  35,
  53,
  35,
  70,
  78,
  64,
  38,
  59,
  43,
  61