In [None]:
!pip install biopython


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
import pandas as pd
from Bio import SeqIO

In [None]:
# Mount Google Drive at /content/drive; the FASTA folders read below live under /content/drive/MyDrive.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the mount may not
# have completed in the last run — confirm the drive is mounted before continuing.
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: 

In [None]:
def load_sequences(directory, label):
    """Read every FASTA record found in ``directory`` and tag it with ``label``.

    Sub-directories and hidden files (names starting with ".") are ignored.
    Files are visited in sorted name order so the returned list -- and any
    seeded shuffle applied to it afterwards -- is reproducible; ``os.listdir``
    returns entries in arbitrary order.

    Args:
        directory (str): Folder containing the FASTA files.
        label: Class label stored alongside every sequence from this folder.

    Returns:
        list[dict]: One ``{'sequence': str, 'label': label}`` entry per record.

    A file that raises while being parsed is reported and skipped as a whole:
    none of its records are returned, which matches the "Skipping" message.
    """
    data = []
    for file in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, file)
        if os.path.isdir(filepath) or file.startswith("."):
            continue  # Skip directories and hidden files
        try:
            # Materialise this file's records before touching ``data`` so a failure
            # part-way through does not leave a partial set of sequences behind.
            file_records = [
                {'sequence': str(record.seq), 'label': label}
                for record in SeqIO.parse(filepath, "fasta")
            ]
        except Exception as e:
            print(f"Skipping {file}: {e}")
            continue
        data.extend(file_records)
    return data

In [None]:
# Sequences from the crabtree_positive folder get label 1, those from crabtree_negative label 0.
positive_data = load_sequences('/content/drive/MyDrive/crabtree_positive', 1)
negative_data = load_sequences('/content/drive/MyDrive/crabtree_negative/crabtree_negative', 0)

# Combine both classes and shuffle with a fixed seed so the row order is repeatable.
df = pd.DataFrame(positive_data + negative_data)
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# `head` has to be called: printing the bare attribute shows the bound-method repr
# (and with it the whole frame) instead of the first five rows.
print(shuffled_df.head())

<bound method NDFrame.head of                                              sequence  label
0   AGATAAGATTACCCCCCCCCCGCACCGCAGGTGCGGGGTATGTTAT...      0
1   ATATTATTAATAATGTAAGATTACTTATTAAATATACATTTAATTA...      0
2   TATATATTAATAATAATTAATATATAGTAAAAAAACTAAAATTACT...      1
3   TGAATCCCATTAAGAAGAAGTATTATATTAAAATTAATTCTCGCCA...      0
4   GAATTCACTCTTATAGAGTTTCATTCGAATATGGTAGGGTTTACTC...      1
..                                                ...    ...
59  TATACTTATATAATAAGGTGATATATTGAAAAAAGGTATATGAAAA...      0
60  CTGAGTACGCATCCATCCTCTTCCTTAGTACTCTAACCGCTATCCT...      1
61  TATAATAATAAGTAAACCAGTGATAAAGAGAATTTATCAAAGTTAA...      1
62  CTCTTATAAATTTTTTCTTATTTTATCTTTATTACATTACATTACA...      0
63  GCTTGTATAGTTTAAAGGTTAAAACATTTGTCTCATAAATAAATAA...      0

[64 rows x 2 columns]>


In [None]:
# Persist the shuffled, labelled sequences to output.csv in the working directory (index column omitted).
shuffled_df.to_csv('output.csv', index=False)

In [None]:
# Take the 'sequence' value of the row labelled 2 in the shuffled frame.
# Despite the plural name, this is a single sequence string, not a collection.
sequences = shuffled_df.loc[2, 'sequence']

In [None]:
# Sanity check: confirm what kind of object was selected above (a single str per the saved output).
type(sequences)

str

In [None]:
# NOTE(review): matplotlib is not imported anywhere in the visible cells — confirm this install is still needed.
!pip install matplotlib



In [None]:
import os

try:
    import nucleotide_transformer
except:
    !pip install numpy==1.23.5
    !pip install git+https://github.com/instadeepai/nucleotide-transformer@main |tail -n 1
    import nucleotide_transformer

if "COLAB_TPU_ADDR" in os.environ:
    from jax.tools import colab_tpu

    colab_tpu.setup_tpu()

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
arviz 0.22.0 requires numpy>=1.26.0, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
opencv-python 4.12.0.88 requires num

  Running command git clone --filter=blob:none --quiet https://github.com/instadeepai/nucleotide-transformer /tmp/pip-req-build-dniixqps
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
google-genai 1.27.0 requires pydantic<3.0.0,>=2.0.0, but you have pydantic 1.10.13 which is incompatible.
langchain-core 0.3.72 requires pydantic>=2.7.4, but you have pydantic 1.10.13 which is incompatible.
langchain 0.3.27 requires pydantic<3.0.0,>=2.7.4, but you have pydantic 1.10.13 which i

In [None]:
import haiku as hk
import jax
import jax.numpy as jnp
from nucleotide_transformer.pretrained import get_pretrained_model

In [None]:
def chunk_genome(genome_sequence, chunk_size=300, keep_remainder=False):
    """
    Splits a genome string into non-overlapping chunks.

    Args:
        genome_sequence (str): The full DNA sequence as a string.
        chunk_size (int): The desired length of each chunk; must be >= 1.
        keep_remainder (bool): When False (default), a trailing piece shorter
            than ``chunk_size`` is dropped so every chunk has the same length.
            When True, that shorter final piece is returned as the last chunk.

    Returns:
        List[str]: List of sequence chunks, in order of appearance.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Only the final window can be short, so the end of the scan is computed
    # once instead of slicing every window twice just to measure its length.
    total = len(genome_sequence)
    stop = total if keep_remainder else total - total % chunk_size
    return [genome_sequence[i:i + chunk_size] for i in range(0, stop, chunk_size)]



In [None]:
# Cut the selected sequence into non-overlapping 300-character chunks and report how many were produced.
chunk=chunk_genome(sequences,chunk_size=300)
print(f"total chunks:{len(chunk)}")


total chunks:75


In [None]:
# Select a model: this name is passed to get_pretrained_model in the next cell.
model_name = '50M_multi_species_v2'

In [None]:
# Get pretrained model; the result is unpacked into parameters, forward_fn, tokenizer and config.
parameters, forward_fn, tokenizer, config = get_pretrained_model(
    model_name=model_name,
    embeddings_layers_to_save=(2,),  # the loop cell later reads this layer back as outs["embeddings_2"]
    attention_maps_to_save=((1, 4), (7, 16)),  # presumably (layer, head) pairs; a later cell reads outs["attention_map_layer_1_number_4"]
    max_positions=300,  # NOTE(review): presumably the padded token length — confirm against the library docs
)
# Wrap the forward function with Haiku; later cells call it as forward_fn.apply(parameters, random_key, tokens).
forward_fn = hk.transform(forward_fn)

Downloading model's hyperparameters json file...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


hyperparams.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloaded model's hyperparameters.
Downloading model's weights...


pytree_ckpt.joblib:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloaded model's weights...


In [None]:
import numpy as np

# Embed the first chunk only. batch_tokenize takes a *list* of sequences: a bare
# string is iterated character by character, which embeds every nucleotide as its
# own one-letter sequence instead of producing one embedding for the chunk.
chunk_index = 0
sequences = [chunk[chunk_index]]

# Tokenize once; each entry is a (token strings, token ids) pair.
tokenized = tokenizer.batch_tokenize(sequences)
tokens_str = [b[0] for b in tokenized]
tokens_ids = [b[1] for b in tokenized]
tokens = jnp.asarray(tokens_ids, dtype=jnp.int32)

# Initialize random key
random_key = jax.random.PRNGKey(0)

# Infer
outs = forward_fn.apply(parameters, random_key, tokens)

# The model was loaded with embeddings_layers_to_save=(2,), so the saved layer is
# read as "embeddings_2"; "embeddings_12" was never requested.
embeddings = outs["embeddings_2"][:, 1:, :]  # removing CLS token
padding_mask = jnp.expand_dims(tokens[:, 1:] != tokenizer.pad_token_id, axis=-1)
masked_embeddings = embeddings * padding_mask  # multiply by 0 pad tokens embeddings
sequences_lengths = jnp.sum(padding_mask, axis=1)
mean_embeddings = jnp.sum(masked_embeddings, axis=1) / sequences_lengths
print(mean_embeddings)
np_array = np.array(mean_embeddings)

# The file name used to interpolate `i`, which is not defined at this point in the
# notebook; use the index of the chunk that was actually embedded.
np.save(f'mean_embeddings{chunk_index}.npy', np_array)


KeyboardInterrupt: 

In [None]:
# Mean-pool the saved layer over the non-padding tokens of each input row.
# "embeddings_2" is used because the model was loaded with embeddings_layers_to_save=(2,).
embeddings = outs["embeddings_2"][:, 1:, :]  # removing CLS token
padding_mask = jnp.expand_dims(tokens[:, 1:] != tokenizer.pad_token_id, axis=-1)
masked_embeddings = embeddings * padding_mask  # multiply by 0 pad tokens embeddings
sequences_lengths = jnp.sum(padding_mask, axis=1)
# The divisor was previously spelled `sequences_length`, which raised a NameError.
mean_embeddings = jnp.sum(masked_embeddings, axis=1) / sequences_lengths
np_array = np.array(mean_embeddings)


In [None]:
# Check the pooled result's shape: one row per tokenized input, one column per embedding dimension.
print(mean_embeddings.shape)

(300, 512)


In [None]:
# Show the pooled embeddings, then copy them into a NumPy array so they can be written with np.save.
print(mean_embeddings)
np_array = np.array(mean_embeddings)


[[-0.5250633  -0.23216476 -0.04807672 ...  0.09126548  0.7886499
  -0.24323574]
 [-0.5250633  -0.23216476 -0.04807672 ...  0.09126548  0.7886499
  -0.24323574]
 [-0.5250633  -0.23216476 -0.04807672 ...  0.09126548  0.7886499
  -0.24323574]
 ...
 [-0.26185364 -0.53783983 -0.22837885 ... -0.07485802  0.729882
  -0.28649193]
 [-0.5250633  -0.23216476 -0.04807672 ...  0.09126548  0.7886499
  -0.24323574]
 [-0.09502478 -0.03200917  0.04934499 ...  0.3012107   0.8727122
  -0.12038998]]


In [None]:
# Write the pooled embeddings to mean_embeddings.npy in the current working directory.
import numpy as np
np.save('mean_embeddings.npy', np_array)

In [None]:
# Shape of the attention map stored under layer 1 / number 4, matching the (1, 4) pair in attention_maps_to_save.
print(outs["attention_map_layer_1_number_4"].shape)

(300, 300, 300)


In [None]:
import os
import numpy as np
import jax
import jax.numpy as jnp

# Create the folder the files are actually written to. Previously a local
# "embeddings" folder was created while np.save targeted this Drive folder, so
# saving failed whenever the Drive folder did not exist yet.
OUTPUT_DIR = "/content/drive/MyDrive/embedding_3"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# First chunk to process. 0 covers every chunk on a fresh run; raise it to resume
# an interrupted run instead of hand-editing the loop bounds.
START_INDEX = 0

# The key never changes, so build it once rather than on every iteration.
random_key = jax.random.PRNGKey(0)

for i in range(START_INDEX, len(chunk)):
    # batch_tokenize takes a list of sequences; a bare string would be iterated
    # character by character and embedded as many one-letter sequences.
    sequences = [chunk[i]]

    # tokenize once; each entry is a (token strings, token ids) pair
    tokenized = tokenizer.batch_tokenize(sequences)
    tokens_ids = [b[1] for b in tokenized]
    tokens = jnp.asarray(tokens_ids, dtype=jnp.int32)

    # infer
    outs = forward_fn.apply(parameters, random_key, tokens)

    # get embeddings of the saved layer (layer 2) and drop CLS
    embeddings = outs["embeddings_2"][:, 1:, :]  # (batch, seq_len-1, hidden_dim)

    # build padding mask to ignore pad token embeddings
    padding_mask = jnp.expand_dims(tokens[:, 1:] != tokenizer.pad_token_id, axis=-1)  # (batch, seq_len-1, 1)
    masked_embeddings = embeddings * padding_mask  # zero out padding embeddings

    # sum over tokens and divide by actual length (mean pooling). The floor of 1
    # guards against division by zero for an all-padding row; jnp.maximum is used
    # because the a_min keyword of jnp.clip is deprecated in recent JAX releases.
    sequences_lengths = jnp.sum(padding_mask, axis=1)  # (batch, 1)
    mean_embeddings = jnp.sum(masked_embeddings, axis=1) / jnp.maximum(sequences_lengths, 1)  # (batch, hidden_dim)

    # move to numpy and save
    np_array = np.array(mean_embeddings)  # shape: (batch, hidden_dim)
    out_path = f"{OUTPUT_DIR}/mean_embeddings_{i}.npy"
    np.save(out_path, np_array)
    print(f"Saved embeddings for index {i} to {out_path}")


Saved embeddings for index 0 to /content/drive/MyDrive/embedding_3/mean_embeddings_0.npy
Saved embeddings for index 1 to /content/drive/MyDrive/embedding_3/mean_embeddings_1.npy
Saved embeddings for index 2 to /content/drive/MyDrive/embedding_3/mean_embeddings_2.npy
Saved embeddings for index 3 to /content/drive/MyDrive/embedding_3/mean_embeddings_3.npy
Saved embeddings for index 4 to /content/drive/MyDrive/embedding_3/mean_embeddings_4.npy
Saved embeddings for index 5 to /content/drive/MyDrive/embedding_3/mean_embeddings_5.npy
Saved embeddings for index 6 to /content/drive/MyDrive/embedding_3/mean_embeddings_6.npy
Saved embeddings for index 7 to /content/drive/MyDrive/embedding_3/mean_embeddings_7.npy
Saved embeddings for index 8 to /content/drive/MyDrive/embedding_3/mean_embeddings_8.npy
Saved embeddings for index 9 to /content/drive/MyDrive/embedding_3/mean_embeddings_9.npy
Saved embeddings for index 10 to /content/drive/MyDrive/embedding_3/mean_embeddings_10.npy
Saved embeddings fo

In [None]:
# Read back the file for chunk 0 from the Drive folder the embedding loop saves into.
# No visible cell writes into /content/embeddings, so the previous path had no
# file to load on a fresh run.
loaded_np = np.load('/content/drive/MyDrive/embedding_3/mean_embeddings_0.npy')

In [None]:
# Check the shape of the array that was read back from disk.
loaded_np.shape

(1, 512)

In [None]:
# Print the reloaded embedding values for a visual check.
print(loaded_np)