<a href="https://colab.research.google.com/github/ainesko/NLP-HW-Embeddings-and-Embedding-Atlas/blob/main/embed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the model

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

# Hugging Face Hub id of the DNABERT-S checkpoint; the tokenizer and the model
# must come from the same repository, so the id is spelled out only once.
MODEL_ID = "zhihan1996/DNABERT-S"

# trust_remote_code=True allows transformers to run the custom Python code that
# ships with the model repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# NOTE(review): intentionally left disabled. Presumably triton is removed so the
# model's remote code does not take a Triton-dependent path — confirm before
# re-enabling. Without `-y`, pip stops at an interactive "Proceed (Y/n)?" prompt.
#!pip uninstall triton

Found existing installation: triton 3.4.0
Uninstalling triton-3.4.0:
  Would remove:
    /usr/local/bin/proton
    /usr/local/bin/proton-viewer
    /usr/local/lib/python3.12/dist-packages/triton-3.4.0.dist-info/*
    /usr/local/lib/python3.12/dist-packages/triton/*
Proceed (Y/n)? Y
  Successfully uninstalled triton-3.4.0


In [2]:
# Sanity check: embed one short DNA sequence and confirm the embedding size.
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
inputs = tokenizer(dna, return_tensors = 'pt')["input_ids"]

# Inference only: disable autograd so no computation graph is built and the
# resulting embedding does not carry gradient history (same as the main loop).
with torch.no_grad():
    hidden_states = model(inputs)[0] # [1, sequence_length, 768]

# embedding with mean pooling over the sequence axis
embedding_mean = torch.mean(hidden_states[0], dim=0)
print(embedding_mean.shape) # expect to be 768

torch.Size([768])


# Data

In [3]:
!gdown 1I44T2alXrtXPZrhkuca6QP3tFHxDW98c
!unzip dnabert-s_eval.zip

Downloading...
From (original): https://drive.google.com/uc?id=1I44T2alXrtXPZrhkuca6QP3tFHxDW98c
From (redirected): https://drive.google.com/uc?id=1I44T2alXrtXPZrhkuca6QP3tFHxDW98c&confirm=t&uuid=dce35cdd-6716-4ef1-900e-6f77d2c73500
To: /content/dnabert-s_eval.zip
100% 1.69G/1.69G [00:32<00:00, 52.5MB/s]
Archive:  dnabert-s_eval.zip
   creating: reference/
  inflating: reference/eval_species_count_0.json  
  inflating: reference/binning_6.tsv  
  inflating: reference/clustering_0.tsv  
  inflating: reference/eval_species_count_1.json  
  inflating: reference/clustering_1.tsv  
  inflating: reference/binning_5.tsv  
   creating: plant/
  inflating: plant/clustering_4.tsv  
  inflating: plant/binning_6.tsv     
  inflating: plant/clustering_2.tsv  
  inflating: plant/clustering_0.tsv  
  inflating: plant/clustering_1.tsv  
  inflating: plant/clustering_3.tsv  
  inflating: plant/binning_5.tsv     
   creating: marine/
  inflating: marine/clustering_4.tsv  
  inflating: marine/binning_6.t

In [5]:
import pandas as pd

# Read marine/clustering_4.tsv (tab-separated), expose the `bin_id` column under
# the name `label`, then attach a sequential integer `id` to every row.
df_sample = (
    pd.read_csv("marine/clustering_4.tsv", sep="\t")
    # .sample(n=1000, random_state=42).reset_index(drop=True)  # optional: random subset of size n
    .rename(columns={"bin_id": "label"})
)
df_sample["id"] = range(len(df_sample))
df_sample.head()

Unnamed: 0,sequence,label,id
0,CTTTAGATTTTGGAGTTTATTCTTTTGATAAAATAAATTATACCGC...,Otu911.0,0
1,GTCCCCAAAAAAGATAAAAAAGACGACTATTATGATCGTTTTCGAG...,Otu767,1
2,AGTAATTACAACGTAGCTTTAATCACAAAAGATAAAGGCTTGTCCT...,Otu404.0,2
3,TACAACTTCATCTTCAGAAACATCTCTTGAAAGTCCAAATGCTATA...,Otu1080,3
4,ATGAATTGGGAACGCTAAGTATATATGAGGATGATAGGGAAATAGG...,Otu1446.0,4


# Computing embeddings

In [6]:
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing with a CUDA error on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the model architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(4096, 768, padding_idx=0)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertUnpadAttention(
          (self): BertUnpadSelfAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (Wqkv): Linear(in_features=768, out_features=2304, bias=True)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (mlp): BertGatedLinearUnitMLP(
          (gated_layers): Linear(in_features=768, out_features=6144, bias=False)
          (act): GELU(approximate='none')
  

In [7]:
from tqdm import tqdm

def embed_sequence(seq):
    """Return one mean-pooled embedding vector for a single DNA sequence.

    The sequence is tokenized, its token ids are moved to ``device``, and the
    model is run without gradient tracking. The per-token vectors of the
    model's first output are then averaged over the sequence axis.
    """
    token_ids = tokenizer(seq, return_tensors='pt')["input_ids"].to(device)
    with torch.no_grad():
        token_states = model(token_ids)[0]
    return token_states[0].mean(dim=0)


# One embedding tensor per row of df_sample, in row order, left on `device`.
# Appending inside the loop keeps partial results if the cell is interrupted.
embeddings_gpu = []
for seq in tqdm(df_sample["sequence"], desc="Computing embeddings"):
    embeddings_gpu.append(embed_sequence(seq))

Computing embeddings: 100%|██████████| 1000/1000 [04:18<00:00,  3.87it/s]


In [8]:
# Copy every embedding tensor to host memory and convert it to a NumPy array,
# preserving the original order.
embeddings = [
    device_vector.cpu().numpy()
    for device_vector in embeddings_gpu
]

In [9]:
# Assemble the export table: the identifying columns from df_sample plus one
# embedding array per row.
out = pd.DataFrame(
    dict(
        id=df_sample["id"],
        sequence=df_sample["sequence"],
        label=df_sample["label"],
        embedding=embeddings,
    )
)

# Adding PCA projections

In [10]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce the embeddings to two principal components and store them as 2-D
# coordinates next to each row of `out`.
# random_state is fixed so the projection is reproducible across runs even when
# scikit-learn selects its randomized SVD solver for a matrix of this size.
pca = PCA(n_components=2, random_state=42)
# np.vstack stacks the per-sequence vectors into one (n_sequences, n_dims) matrix.
projection = pca.fit_transform(np.vstack(embeddings))
projection_x, projection_y = projection[:, 0], projection[:, 1]

out["projection_x"] = projection_x
out["projection_y"] = projection_y

# Downloading the full Parquet file

In [12]:
# Persist the complete table (embeddings + projections) as Parquet, without the
# DataFrame index.
out.to_parquet(path="embeddings_full.parquet", index=False)

In [13]:
from google.colab import files
# Hand the saved Parquet file to Colab's download helper so it is sent to the browser.
files.download("embeddings_full.parquet")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Downloading the first 1000 samples

In [None]:
# Export a smaller file containing only the first 1000 rows of `out`
# (fewer if the table is shorter), then download it through Colab.
out_loaded_1000 = out.iloc[:1000]
out_loaded_1000.to_parquet("embeddings_1000.parquet", index=False)
files.download("embeddings_1000.parquet")