In [1]:
import pandas as pd
import torch
from transformers import EsmModel, EsmTokenizer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap.umap_ as umap  # Correct import for UMAP
from sklearn.manifold import TSNE
import numpy as np
from scipy.interpolate import griddata
from scipy.ndimage import gaussian_filter
from scipy.spatial import cKDTree

# Score columns used by this analysis, and the subset whose sign is flipped
# right after loading.
features = ['interface_score', 'total_score', 'efield_score','generation']
invert   = ['interface_score', 'total_score']

# Dataset name (used as `<dataset>.csv` for input and `<dataset>_embedded.csv`
# for output) and the ESM-2 checkpoint to embed with.
dataset = 'all_scores_negcntr'
esm2_model = "facebook/esm2_t6_8M_UR50D"
# Alternative checkpoints:
# esm2_model = "facebook/esm2_t33_650M_UR50D"
# esm2_model = "facebook/esm2_t36_3B_UR50D"
# esm2_model = "facebook/esm2_t48_15B_UR50D"

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read the dataset and keep only rows that have a sequence.
# `.copy()` makes the filtered frame independent of the one returned by
# read_csv, so the column assignment below does not raise
# SettingWithCopyWarning.
df = pd.read_csv(f'{dataset}.csv')
df = df[df['sequence'].notnull()].copy()
print(f'### Analyzing {len(df)} datapoints. ###')

# Flip the sign of every feature that is also listed in `invert`.
columns_to_invert = [name for name in features if name in invert]
df[columns_to_invert] = -df[columns_to_invert]

# Unique sequences in first-seen order (dict keys preserve insertion order).
sequences = list(dict.fromkeys(df['sequence']))
print(f'### Dataset contains {len(sequences)} unique sequences. ###')

# Tokenize dataset: each unique sequence is encoded separately into its own
# dict of PyTorch tensors.
tokenizer = EsmTokenizer.from_pretrained(esm2_model)
tokenized_sequence = list(
    map(lambda seq: tokenizer(seq, return_tensors='pt'), sequences)
)

# ESM embedding: load the checkpoint, move it to the selected device and
# switch it to evaluation mode.
model = EsmModel.from_pretrained(esm2_model)
model = model.to(device)
model.eval()
def get_last_hidden_states(model, tokenized_sequences):
    """Return one mean-pooled last-hidden-state vector per tokenized sequence.

    Parameters
    ----------
    model : callable torch module
        Called as ``model(**inputs)``; the result must expose a
        ``last_hidden_state`` tensor.
    tokenized_sequences : iterable of dict
        Each dict maps argument names to tensors for a single forward pass.
        NOTE(review): the pooling below assumes every entry holds exactly one
        sequence (leading batch dimension of size 1) - confirm if batching is
        ever introduced.

    Returns
    -------
    numpy.ndarray
        The pooled vectors stacked in input order, one row per entry.
    """
    # Send inputs to the device the model itself lives on instead of relying
    # on a module-level `device` variable; fall back to CPU for a
    # parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    pooled_vectors = []
    with torch.no_grad():
        for encoded in tokenized_sequences:
            inputs = {key: tensor.to(target_device) for key, tensor in encoded.items()}
            output = model(**inputs)
            # Drop the batch axis, then average over the remaining first
            # (token) axis. Every position in the tensor contributes to the mean.
            pooled = output.last_hidden_state.squeeze(0).mean(dim=0)
            pooled_vectors.append(pooled.cpu().numpy())
    return np.array(pooled_vectors)
# Embed every unique sequence once; rows follow the order of `sequences`.
all_hidden_states = get_last_hidden_states(model, tokenized_sequence)
print("### Embedding done ###")

# Dimensionality reduction
# A fixed seed keeps the stochastic projections (and the CSV written below)
# reproducible across runs.
random_seed = 42
# scikit-learn's t-SNE rejects a perplexity that is not smaller than the
# number of samples; clamp it so small datasets still run. 30 is the library
# default and is what larger datasets keep using.
tsne_perplexity = min(30.0, max(1.0, len(sequences) - 1))

umap_model = umap.UMAP(n_components=2, random_state=random_seed)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=random_seed)
pca = PCA(n_components=2, random_state=random_seed)

umap_df = pd.DataFrame(umap_model.fit_transform(all_hidden_states), columns=['umap_x', 'umap_y'], index=sequences)
tsne_df = pd.DataFrame(tsne.fit_transform(all_hidden_states), columns=['tsne_x', 'tsne_y'], index=sequences)
pca_df = pd.DataFrame(pca.fit_transform(all_hidden_states), columns=['pca_x', 'pca_y'], index=sequences)

# Expose the sequence as a regular column so it can serve as the merge key.
umap_df['sequence'] = sequences
tsne_df['sequence'] = sequences
pca_df['sequence'] = sequences

# Merge these dataframes back with the original dataframe; rows sharing a
# sequence receive the same 2-D coordinates.
df = pd.merge(df, umap_df, on='sequence', how='left')
df = pd.merge(df, tsne_df, on='sequence', how='left')
df = pd.merge(df, pca_df, on='sequence', how='left')

df.to_csv(f'{dataset}_embedded.csv', index=False)

In [2]:
!pip install importlib-metadata



In [4]:
from importlib.metadata import version, PackageNotFoundError
