In [1]:
import pandas as pd

# Read the Analysis/Attribute table from the CSV next to the notebook
# and preview its first five rows.
attributes = pd.read_csv(filepath_or_buffer="data.csv")
attributes.head(n=5)

Unnamed: 0,Analysis,Attribute
0,D_250475,IL2 INHIBITION ASSAY
1,D_95007196,PH
2,D_M00003744,ABATACEPT MAJOR BAND (REDUCED)
3,Y_SM_95011468_R,BIOASSAY
4,250684_CE_SDS_REDUC,SUM HEAVY AND LIGHT CHAIN


In [2]:
# Report the frame size, discard every row that contains a missing value,
# renumber the surviving rows from 0, then report the size again.
print("Before dropping NAs:", attributes.shape)
attributes = attributes.dropna().reset_index(drop=True)
print("After dropping NAs:", attributes.shape)

Before dropping NAs: (7730, 2)
After dropping NAs: (7726, 2)


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "distilbert-base-uncased" checkpoint
# name (the same checkpoint name is used when the model is loaded later on).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [4]:
# For each text column, store the word-piece tokens of every value as one
# space-separated string in a sibling "<column>_tokens" column (for inspection).
# Whole columns are built at once rather than writing cell by cell from
# iterrows(): this is much faster, and the columns also exist when the frame
# is empty, so the column selection in the next cell cannot fail.
for column in ("Analysis", "Attribute"):
    attributes[f"{column}_tokens"] = [
        " ".join(tokenizer.tokenize(text)) for text in attributes[column]
    ]

In [5]:
# Place each raw text column directly before its tokenized counterpart so the
# two can be compared side by side in the preview.
column_order = ["Analysis", "Analysis_tokens", "Attribute", "Attribute_tokens"]
attributes = attributes[column_order]
attributes.head()

Unnamed: 0,Analysis,Analysis_tokens,Attribute,Attribute_tokens
0,D_250475,d _ 250 ##47 ##5,IL2 INHIBITION ASSAY,il ##2 inhibition ass ##ay
1,D_95007196,d _ 950 ##0 ##7 ##19 ##6,PH,ph
2,D_M00003744,d _ m ##00 ##00 ##37 ##44,ABATACEPT MAJOR BAND (REDUCED),aba ##ta ##ce ##pt major band ( reduced )
3,Y_SM_95011468_R,y _ sm _ 950 ##11 ##46 ##8 _ r,BIOASSAY,bio ##ass ##ay
4,250684_CE_SDS_REDUC,250 ##6 ##8 ##4 _ ce _ sd ##s _ red ##uc,SUM HEAVY AND LIGHT CHAIN,sum heavy and light chain


In [6]:
# Encode every row as one (Analysis, Attribute) sequence pair and return
# PyTorch tensors, with padding and truncation enabled.
analysis_texts = attributes["Analysis"].tolist()
attribute_texts = attributes["Attribute"].tolist()
encodings = tokenizer(
    analysis_texts,
    attribute_texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [7]:
# Show which tensors the tokenizer call produced.
encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [8]:
import torch
from transformers import AutoModel

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModel.from_pretrained('distilbert-base-uncased')
model.to(device)
model.eval()

# The model and its inputs must live on the same device. Build device copies of
# the input tensors and leave `encodings` itself untouched (on the CPU) because
# later cells read encodings['attention_mask'] directly.
model_inputs = {name: tensor.to(device) for name, tensor in encodings.items()}

with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**model_inputs)

# Move the hidden states back to the CPU so they can be combined with the CPU
# attention mask and converted to NumPy in the following cells.
token_embeddings = outputs.last_hidden_state.cpu()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [12]:
# Inspect the dimensions of the hidden-state tensor.
token_embeddings.size()

torch.Size([7726, 37, 768])

In [9]:
# CLS pooling: keep only the hidden state at sequence position 0 of every row
# (the position where the CLS token is expected to sit).
cls_embeddings = token_embeddings[:, 0]

In [17]:
# Masked mean pooling: average the token vectors of each row while leaving
# padded positions out of both the sum and the count.
attention_mask = encodings['attention_mask']  # used to exclude padding tokens
mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()

# Zero out masked positions, then add up along the sequence axis.
sum_embeddings = (token_embeddings * mask_expanded).sum(dim=1)
# Number of unmasked tokens per row; the clamp guards against division by zero.
sum_mask = mask_expanded.sum(dim=1).clamp(min=1e-9)
mean_embeddings = sum_embeddings / sum_mask

In [18]:
from sklearn.cluster import KMeans

# K-Means needs a host-side NumPy array. detach()/cpu() keep the conversion
# working even when the pooled tensor lives on the GPU or carries autograd
# history; for a plain CPU tensor they are no-ops.
embeddings = mean_embeddings.detach().cpu().numpy()

# Perform K-Means clustering (fixed seed so the assignment is repeatable)
kmeans = KMeans(n_clusters=32, random_state=42)

# Fit and label in one pass instead of fit() followed by predict() on the
# very same data; `kmeans` is left fitted for later use.
cluster_labels = kmeans.fit_predict(embeddings)
print("Cluster labels:", cluster_labels)


Cluster labels: [26 10  2 ...  9  1 13]


In [19]:
# Attach each row's cluster label as a new "Cluster" column and preview it.
attributes = attributes.assign(Cluster=cluster_labels)
attributes.head()

Unnamed: 0,Analysis,Analysis_tokens,Attribute,Attribute_tokens,Cluster
0,D_250475,d _ 250 ##47 ##5,IL2 INHIBITION ASSAY,il ##2 inhibition ass ##ay,26
1,D_95007196,d _ 950 ##0 ##7 ##19 ##6,PH,ph,10
2,D_M00003744,d _ m ##00 ##00 ##37 ##44,ABATACEPT MAJOR BAND (REDUCED),aba ##ta ##ce ##pt major band ( reduced ),2
3,Y_SM_95011468_R,y _ sm _ 950 ##11 ##46 ##8 _ r,BIOASSAY,bio ##ass ##ay,2
4,250684_CE_SDS_REDUC,250 ##6 ##8 ##4 _ ce _ sd ##s _ red ##uc,SUM HEAVY AND LIGHT CHAIN,sum heavy and light chain,2


In [20]:
# Group rows of the same cluster together and renumber them from 0.
attributes = attributes.sort_values(by="Cluster", ignore_index=True)

# The labels were computed from the mean-pooled embeddings (`mean_embeddings`),
# so the export is named after that pooling strategy. If the clustering cell is
# switched to `cls_embeddings`, write to "cluster_cls_token.csv" instead.
attributes.to_csv("cluster_mean_embed.csv")