In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import OneHotEncoder
import pickle
import os

from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

In [2]:
# Load Instadeep's model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-100m-multi-species", trust_remote_code=True)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 120359c5-ac0e-4b20-96ee-a950cf7cd532)')' thrown while requesting HEAD https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species/resolve/main/tokenizer_config.json
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: da9d9263-8582-4dfa-a7e1-63f47621bbf5)')' thrown while requesting HEAD https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species/resolve/main/config.json
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: fd158713-25a9-4269-8a0d-950b6ee8f640)')' thrown while requesting HEAD https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-100m-multi-species/resolve/main/esm_config.py
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out

In [3]:
max_length = 5800 // 6
print(max_length)

966


In [4]:
# Load dataset

def filter_to_len(df, max_len):
    df['length'] = df['coding'].apply(len)
    return df[df['length'] <= max_len]

train_df = pd.read_csv('data/icodon/training.csv.gz', index_col=0, compression='gzip')
test_df = pd.read_csv('data/icodon/testing.csv.gz', index_col=0, compression='gzip')

train_df = filter_to_len(train_df, max_length * 6 - 6)
test_df = filter_to_len(test_df, max_length * 6 - 6)

n_samples = 30000
train_df = train_df[:n_samples]
test_df = test_df[:1000]

In [5]:
def get_tokens(df, tokenizer):
    sequences = df['coding'].values.tolist()
    token_ids = tokenizer.batch_encode_plus(sequences,
                                            return_tensors='pt',
                                            padding='max_length',
                                            max_length=max_length)['input_ids']
    return token_ids

In [6]:
train_df['length'].describe()

count    30000.000000
mean      1622.985300
std       1080.348414
min          3.000000
25%        840.000000
50%       1347.000000
75%       2115.000000
max       5790.000000
Name: length, dtype: float64

In [7]:
tokens = get_tokens(train_df, tokenizer)

In [8]:
device = torch.device('cuda')
model = model.to(device)
tokens = tokens.to(device='cuda')

In [9]:
def get_embeddings(tokens, model, batch_size=16):
    res = []
    for i in tqdm(range(0, tokens.shape[0], batch_size)):
        batch = tokens[i:min(i+batch_size, tokens.shape[0])]
        attention_mask = batch != tokenizer.pad_token_id
        embeddings = model(
            batch,
            attention_mask=attention_mask,
            encoder_attention_mask=attention_mask,
            output_hidden_states=True
        )['hidden_states'][-1].detach().cpu().numpy()
        res.append(embeddings)
    return np.concatenate(res)

In [10]:
embeddings = get_embeddings(tokens, model, batch_size=4)

100%|███████████████████████████████████████████████████████████| 7500/7500 [13:23<00:00,  9.33it/s]


In [11]:
torch.cuda.empty_cache()

In [12]:
print(embeddings.shape)

(30000, 966, 512)


In [13]:
def pool(embeddings, lengths):
    pooled = np.zeros((embeddings.shape[0], embeddings.shape[2]))
    for i in range(embeddings.shape[0]):
        pooled[i] = np.mean(embeddings[i, :lengths[i], :], axis=0)
    return pooled

In [14]:
# Species one hot encoder
if not os.path.isfile('species_encoder.pkl'):
    enc = OneHotEncoder().fit(train_df['specie'].values.reshape((-1, 1)))
    with open('species_encoder.pkl', 'wb') as pkl:
        pickle.dump(enc, pkl)
else:
    with open('species_encoder.pkl', 'rb') as pkl:
        enc = pickle.load(pkl)

def add_species(X, df, enc):
    X_spec = enc.transform(df['specie'].values.reshape((-1, 1))).toarray()
    return np.concatenate([X, X_spec], axis=1)

In [15]:
X = pool(embeddings, train_df['length'].values)
X = add_species(X, train_df, enc)
y = train_df['decay_rate'].values

clf = LassoCV(alphas=[0.001, 0.01, 0.1, 1])
clf.fit(X, y)
print(clf.alpha_)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


0.001


  model = cd_fast.enet_coordinate_descent(


In [16]:
# Testing
tokens_test = get_tokens(test_df, tokenizer)
tokens_test = tokens_test.to(device=device)
emb_test = get_embeddings(tokens_test, model, batch_size=4)
print(emb_test.shape)

100%|█████████████████████████████████████████████████████████████| 250/250 [00:25<00:00,  9.68it/s]


(1000, 966, 512)


In [17]:
torch.cuda.empty_cache()

In [18]:
X_test = pool(emb_test, test_df['length'].values)
X_test = add_species(X_test, test_df, enc)
y_test = test_df['decay_rate'].values

clf.score(X_test, y_test)

0.11648287184721917

In [19]:
with open(f'lasso_{n_samples}.pkl', 'wb') as pkl:
    pickle.dump(clf, pkl)