<a href="https://colab.research.google.com/github/anihab/tokenization/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import core libraries
import pandas as pd
import numpy as np

# Install Biopython
try:
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
    # for byte pair encoding
    !pip install tokenizers
    !pip install transformers
except ImportError:
    pass

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/3.1 MB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully inst

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab,
# because the import below comes from the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Resources:
# https://www.tutorialspoint.com/biopython/biopython_sequence_io_operations.htm
# https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
# https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt
# https://huggingface.co/docs/transformers/model_doc/roberta
# https://huggingface.co/docs/tokenizers/pipeline
# https://www.philschmid.de/fine-tune-a-non-english-gpt-2-model-with-huggingface

In [None]:
import os

from Bio import SeqIO
from google.colab import files

# Token budget per sequence chunk; tokenize() derives the chunk length from it.
# NOTE(review): presumably a 512-position model limit minus two special tokens -- confirm.
MAX_TOKENS = 510

## Given a phage directory and a bacteria directory, tokenize all fasta files according to method of choice

def read_files(phage_dir, bacteria_dir, method, *args, **kwargs):
  """Tokenize every regular file found directly inside two directories.

  Files in phage_dir are tokenized with label 1 and files in bacteria_dir with
  label 0. Sub-directories are skipped. Each file is handed to tokenize().

  Input:
    phage_dir -- str, directory holding phage fasta files
    bacteria_dir -- str, directory holding bacteria fasta files
    method -- str, tokenization method forwarded to tokenize
    k -- int, kmer length, given as keyword k=... (or as the first extra
         positional argument); forwarded to tokenize
  """
  k = kwargs.get('k', args[0] if args else None)
  for directory, label in ((phage_dir, 1), (bacteria_dir, 0)):
    for filename in os.listdir(directory):
      path = os.path.join(directory, filename)
      if os.path.isfile(path):
        # tokenize reads k from its keyword arguments, so it must be passed by
        # name; a positional k would be silently dropped there.
        tokenize(path, label, method, k=k)

## Tokenize all sequences in a fasta file using the chosen method

"""\
Runs fasta files through tokenizer and adds the label of 1 for phage and
0 for bacteria. Then shuffles the rows in the dataframe and saves to CSV

Input:
  phage -- str, path to phage fasta file
  bacteria -- str, path to bacteria fasta file
  method -- str, tokenization method of choice
  k -- int, length of k if using kmer tokenization
"""
def tokenize(filepath, label, method, *args, **kwargs):
  sequences = []
  tokens = []

  k = kwargs.get('k', None)
  filename = os.path.basename(filepath)
  filename = filename.split('.')[0]

  if method == 'codon':
    max_length = MAX_TOKENS * 3
  elif method == 'kmer':
    max_length = MAX_TOKENS - (k - 1)
  elif method == 'bpe':
    max_length = MAX_TOKENS

  # Process data to get sequences of appropriate length
  df = preprocess_data(filepath, max_length)
  sequences = df['sequence'].values.tolist()

  if method == 'bpe':
    train_bpe_tokenizer(sequences)

  # Tokenize according to chosen method
  for seq in range(len(sequences)):
    if method == 'codon':
      tokens.append(seq2codon(sequences[seq]))
    elif method == 'kmer':
      tokens.append(seq2kmer(sequences[seq], k))
    elif method == 'bpe':
      tokens.append(seq2bpe(sequences[seq]))
  df['tokenized'] = tokens
  df['label'] = [label] * len(tokens)

  # Shuffle and save to csv
  df = df.sample(frac=1).reset_index(drop=True)
  write_csv(filename, df)
  return df

"""\
Read fasta file and truncate sequences to appropriate length, returns dataframe

Input:
  filepath -- str, path to fasta file
  max_length -- int, maximum sequence length

Returns:
  df -- dataframe, includes the > input line, start position, and sequence
"""
def preprocess_data(filepath, max_length):
  records = []
  for record in SeqIO.parse(filepath, 'fasta'):
    filename = os.path.basename(filepath)
    name = filename.split('.')[0]
    segment = str(record.name)
    seq = str(record.seq).upper()
    pos = 0
    # Truncate sequences if longer than max_length
    while len(seq) > max_length:
      records.append(                  # add subsequence up to max_length
        {
          'name': name,
          'segment': segment,
          'start': pos,
          'sequence': seq[:max_length]
        }
      )
      seq = seq[max_length:]           # sequence continuing from max_length
      pos += max_length
    records.append(
        {
          'name': name,
          'segment': segment,
          'start': pos,
          'sequence': seq
        }
    )
  df = pd.DataFrame(data=records)
  return df

"""\
Read in sequences and tokens to attach labels and return dataframe

Input:
  sequences -- list, original sequences
  tokens -- list, tokenized sequences
  label -- int, 1 for phage or 0 for bacteria

Returns:
  df -- dataframe
"""
def attach_labels(sequences, tokens, label):
  d = []
  for i in range(len(tokens)):
    d.append(
        {
          'sequence': sequences[i],
          'tokenized': tokens[i],
          'label': label
        }
    )
  df = pd.DataFrame(data=d)
  return df

"""\
Save the given dataframe to two separate csv files:
1. full_output.csv includes the name, start position, sequence, tokenized
   sequence, and label.
2. tokenized_output.csv includes the tokenized sequence and the label.

Input:
  df -- dataframe, full dataframe of tokenized sequences
"""
def write_csv(filename, df):
  df.to_csv(filename + '_full_output.csv', encoding='utf-8', index=False)
  tokenized = df[['tokenized', 'label']]
  tokenized.to_csv(filename + '_tokenized_output.csv', encoding='utf-8', index=False)
  #files.download(filename + '_full_output.csv')
  #files.download(filename + '_tokenized_output.csv')

## Different tokenization methods

"""\
Convert a sequence to codons

Input:
  seq -- str, original sequence

Returns:
  codons -- str, codons separated by space
"""
def seq2codon(seq):
  codon = [seq[i:i+3] for i in range(0,len(seq),3)]
  codons = " ".join(codon)
  return codons

"""\
Convert a sequence to kmers

Input:
  seq -- str, original sequence
  k -- int, kmer of length k

Returns:
  kmers -- str, kmers separated by space
"""
def seq2kmer(seq, k):
  kmer = [seq[i:i+k] for i in range(len(seq)+1-k)]
  kmers = " ".join(kmer)
  return kmers

In [None]:
# byte pair encoding

from tokenizers import Tokenizer, models, trainers, normalizers
from transformers import PreTrainedTokenizerFast

def train_bpe_tokenizer(sequences, vocab_size=50000, output_path="dna_tokenizer.json"):
  """Train a byte pair encoding tokenizer on DNA sequences and save it to disk.

  Input:
    sequences -- iterable of str, sequences used as the training corpus
    vocab_size -- int, vocabulary size requested from the BPE trainer
                  (default 50000)
    output_path -- str, file the trained tokenizer is saved to; an existing
                   file at that path is replaced. seq2bpe loads
                   "dna_tokenizer.json", so keep the default unless the
                   loading side is changed as well.
  """
  tokenizer = Tokenizer(models.BPE())

  # Only Unicode NFKC normalization is applied to the text before training.
  tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC()])

  # Train the tokenizer on the DNA sequences
  trainer = trainers.BpeTrainer(vocab_size=vocab_size)
  tokenizer.train_from_iterator(sequences, trainer=trainer)

  tokenizer.save(output_path)

"""\
Convert a sequence to byte pair encodings

Input:
  seq -- str, original sequence

Returns:
  output -- str, decoded tokens separated by a space
"""
def seq2bpe(sequence):
  tokenizer = PreTrainedTokenizerFast(tokenizer_file="dna_tokenizer.json")
  encoded_input = tokenizer(sequence, return_tensors="pt")
  token_ids = encoded_input.input_ids
  output = " ".join(tokenizer.batch_decode(token_ids))
  return output

In [4]:
## Model Configs
from transformers import AutoModel, AutoTokenizer

In [None]:
# DNABERT_6 -- load the pretrained model and tokenizer, then report the config
model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6")
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Hidden activation function", "hidden_act"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Token type vocabulary size", "type_vocab_size"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)

In [None]:
# DNABERT-2 -- load the pretrained model and tokenizer, then report the config
# NOTE(review): the recorded output reports checkpoint weights (e.g. Wqkv,
# mlp.gated_layers) as unused when initializing BertModel, so the printed
# module tree may not be the checkpoint's real architecture -- the model card
# presumably expects trust_remote_code=True; confirm before relying on it.
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M")
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Hidden activation function", "hidden_act"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Token type vocabulary size", "type_vocab_size"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/862 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of the model checkpoint at zhihan1996/DNABERT-2-117M were not used when initializing BertModel: ['bert.encoder.layer.2.mlp.wo.weight', 'bert.encoder.layer.9.attention.self.Wqkv.bias', 'bert.encoder.layer.2.mlp.gated_layers.weight', 'bert.encoder.layer.9.mlp.wo.weight', 'bert.encoder.layer.11.mlp.layernorm.weight', 'bert.encoder.layer.0.mlp.wo.bias', 'bert.encoder.layer.8.mlp.wo.weight', 'bert.encoder.layer.5.mlp.layernorm.weight', 'bert.encoder.layer.1.mlp.gated_layers.weight', 'bert.encoder.layer.3.mlp.wo.weight', 'bert.encoder.layer.7.mlp.wo.weight', 'bert.encoder.layer.8.attention.self.Wqkv.bias', 'cls.predictions.transform.dense.weight', 'bert.encoder.layer.9.mlp.layernorm.weight', 'bert.encoder.layer.11.attention.self.Wqkv.weight', 'bert.encoder.layer.7.mlp.gated_layers.weight', 'bert.encoder.layer.10.mlp.wo.bias', 'bert.encoder.layer.11.mlp.gated_layers.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.0.mlp.wo

Downloading (…)okenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

Model Configuration:
Vocabulary size: 4096
Hidden size: 768
Number of hidden layers: 12
Number of attention heads: 12
Intermediate size: 3072
Hidden activation function: gelu
Attention dropout probability: 0.0
Hidden dropout probability: 0.1
Token type vocabulary size: 2
Layer normalization epsilon: 1e-12
Padding token ID: 0
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4096, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features

In [5]:
# Nucleotide Transformer (NT) model configs
# NT 500m human ref -- load the pretrained model and tokenizer, then report the config
model = AutoModel.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref were not used when initializing EsmModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['esm.pooler.dense.weight', 'esm.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be ab

Downloading (…)okenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Model Configuration:
Vocabulary size: 4105
Hidden size: 1280
Number of hidden layers: 24
Number of attention heads: 20
Intermediate size: 5120
Attention dropout probability: 0.0
Hidden dropout probability: 0.0
Layer normalization epsilon: 1e-12
Padding token ID: 1
EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(4105, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1002, 1280, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-23): 24 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280,

In [6]:
# NT 500M_1000G -- load the pretrained model and tokenizer, then report the config
model = AutoModel.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-1000g were not used when initializing EsmModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-1000g and are newly initialized: ['esm.pooler.dense.weight', 'esm.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to us

Downloading (…)okenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Model Configuration:
Vocabulary size: 4105
Hidden size: 1280
Number of hidden layers: 24
Number of attention heads: 20
Intermediate size: 5120
Attention dropout probability: 0.0
Hidden dropout probability: 0.0
Layer normalization epsilon: 1e-12
Padding token ID: 1
EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(4105, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1002, 1280, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-23): 24 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280,

In [None]:
# NT 2B5_1000G -- load the pretrained model and tokenizer, then report the config
model = AutoModel.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-1000g")
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-1000g")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order.
# hidden_act and type_vocab_size are not printed for this model.
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-1000g were not used when initializing EsmModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-1000g and are newly initialized: ['esm.pooler.dense.weight', 'esm.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to us

Model Configuration:
Vocabulary size: 4105
Hidden size: 2560
Number of hidden layers: 32
Number of attention heads: 20
Intermediate size: 10240
Attention dropout probability: 0.0
Hidden dropout probability: 0.0
Layer normalization epsilon: 1e-12
Padding token ID: 1
EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(4105, 2560, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1002, 2560, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-31): 32 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=2560, out_features=2560, bias=True)
            (key): Linear(in_features=2560, out_features=2560, bias=True)
            (value): Linear(in_features=2560, out_features=2560, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=2560

In [None]:
# NT 2B5_multi_species -- load the pretrained model and tokenizer, then report the config
model = AutoModel.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-multi-species")
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-2.5b-multi-species")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order.
# hidden_act and type_vocab_size are not printed for this model.
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-multi-species were not used when initializing EsmModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing EsmModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-multi-species and are newly initialized: ['esm.pooler.dense.weight', 'esm.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task 

Model Configuration:
Vocabulary size: 4105
Hidden size: 2560
Number of hidden layers: 32
Number of attention heads: 20
Intermediate size: 10240
Attention dropout probability: 0.0
Hidden dropout probability: 0.0
Layer normalization epsilon: 1e-12
Padding token ID: 1
EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(4105, 2560, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
    (position_embeddings): Embedding(1002, 2560, padding_idx=1)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-31): 32 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=2560, out_features=2560, bias=True)
            (key): Linear(in_features=2560, out_features=2560, bias=True)
            (value): Linear(in_features=2560, out_features=2560, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=2560

In [None]:
# RoBERTa (the checkpoint loaded here is xlm-roberta-base) -- load the
# pretrained model and tokenizer, then report the config
model = AutoModel.from_pretrained("xlm-roberta-base")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

print("Model Configuration:")
# (printed caption, attribute read from model.config), in display order
for caption, field in (
    ("Vocabulary size", "vocab_size"),
    ("Hidden size", "hidden_size"),
    ("Number of hidden layers", "num_hidden_layers"),
    ("Number of attention heads", "num_attention_heads"),
    ("Intermediate size", "intermediate_size"),
    ("Hidden activation function", "hidden_act"),
    ("Attention dropout probability", "attention_probs_dropout_prob"),
    ("Hidden dropout probability", "hidden_dropout_prob"),
    ("Token type vocabulary size", "type_vocab_size"),
    ("Layer normalization epsilon", "layer_norm_eps"),
    ("Padding token ID", "pad_token_id"),
):
    print(f"{caption}:", getattr(model.config, field))
# Module tree of the loaded model
print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Model Configuration:
Vocabulary size: 250002
Hidden size: 768
Number of hidden layers: 12
Number of attention heads: 12
Intermediate size: 3072
Hidden activation function: gelu
Attention dropout probability: 0.1
Hidden dropout probability: 0.1
Token type vocabulary size: 1
Layer normalization epsilon: 1e-05
Padding token ID: 1
XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
  

In [None]:
# T5 (google/flan-t5-xxl) -- load the pretrained model and tokenizer, then report the config
# NOTE: the recorded output shows five weight shards of roughly 6-10 GB each,
# so this cell needs a lot of disk space and memory.
model = AutoModel.from_pretrained("google/flan-t5-xxl")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")

print("Model Configuration:")
# T5's config does not use the BERT-style attribute names read by the other
# cells (e.g. it has d_ff rather than intermediate_size), so each caption lists
# candidate attribute names: the BERT-style name first, then the T5 one. The
# first name present on the config is printed; "n/a" when none exists.
for caption, fields in (
    ("Vocabulary size", ("vocab_size",)),
    ("Hidden size", ("hidden_size", "d_model")),
    ("Number of hidden layers", ("num_hidden_layers", "num_layers")),
    ("Number of attention heads", ("num_attention_heads", "num_heads")),
    ("Intermediate size", ("intermediate_size", "d_ff")),
    ("Hidden activation function", ("hidden_act", "feed_forward_proj")),
    ("Attention dropout probability", ("attention_probs_dropout_prob", "dropout_rate")),
    ("Hidden dropout probability", ("hidden_dropout_prob", "dropout_rate")),
    ("Token type vocabulary size", ("type_vocab_size",)),
    ("Layer normalization epsilon", ("layer_norm_eps", "layer_norm_epsilon")),
    ("Padding token ID", ("pad_token_id",)),
):
    value = next(
        (getattr(model.config, field) for field in fields if hasattr(model.config, field)),
        "n/a",
    )
    print(f"{caption}:", value)
# Module tree of the loaded model
print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]