In [None]:
#Start

#### Code Summmary

The goal is to find similarities between sentences containing multiple RSIDs, in order to identify potential relationships between those RSIDs. We starts by loading a subset of tokenized RSID sentences data and the fine-tuned PubMedBERT model. The RSID tokens are also loaded.

Sentences containing multiple RSIDs are extracted from the subset. The model generates an embedding for each RSID in these sentences by masking the RSID token and passing it through the model. The embeddings for the different RSIDs within a sentence are aggregated by taking the average. This gives a single sentence embedding capturing information about all the RSIDs present.

Cosine similarity is then calculated between these sentence embeddings and all other sentence embeddings in the corpus. Highly similar sentence pairs likely indicate a relationship between the RSIDs in those sentences. A threshold is set for the cosine similarity scores to only keep highly similar pairs. The similarities are sorted and the top most similar pairs for each sentence are kept.

The similarities and sentence embeddings are saved to files. T

In summary, the key steps were:

- Load subset data and model
- Extract sentences with multiple RSIDs
- Generate embeddings for each RSID
- Aggregate embeddings into sentence embeddings
- Calculate cosine similarities
- Filter using a similarity threshold
- Save similarities and embeddings

This allows mining the corpus for potential RSID relationships in an unsupervised way by leveraging the pretrained language model. The output data can then be analyzed to surface meaningful relationships.

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!cp "/content/gdrive/My Drive/Ver S/New/tokenized_rsid_sentences.pkl" "./"
!cp -r "/content/gdrive/My Drive/Ver S/New/trained_model" "./"
!cp -r "/content/gdrive/My Drive/Ver S/New/trained_tokenizer" "./"
!cp "/content/gdrive/My Drive/Ver S/rsid_tokens.pkl" "./"

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m114.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
Co

In [None]:
import pickle
import random
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizerFast, BertModel, BertTokenizer
import torch
import numpy as np
from tqdm import tqdm

In [None]:
# Load tokenized_data
with open("./tokenized_rsid_sentences.pkl", 'rb') as f:
    tokenized_data = pickle.load(f)

In [None]:
# Randomly sample 1% of the data
num_samples = int(0.01 * len(tokenized_data))
all_pmids = list(tokenized_data.keys())
selected_pmids = random.sample(all_pmids, num_samples)

# Create a dictionary with the selected data
sampled_data = {pmid: tokenized_data[pmid] for pmid in selected_pmids}

In [None]:
with open("./sampled_data.pkl", 'wb') as f:
    pickle.dump(sampled_data, f)

print(f"Sampled {len(sampled_data)} pmid-sentences from the full corpus.")

Sampled 1076 pmid-sentences from the full corpus.


In [None]:
# Load the fine-tuned model and tokenizer
MODEL_PATH = "./trained_model"
TOKENIZER_PATH = "./trained_tokenizer"

In [None]:
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)
model = BertModel.from_pretrained(MODEL_PATH)
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of BertModel were not initialized from the model checkpoint at ./trained_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(189044, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Load RSID tokens and update tokenizer and model
with open('./rsid_tokens.pkl', 'rb') as f:
    rsid_tokens = pickle.load(f)

rsid_token_ids = tokenizer.encode(rsid_tokens, add_special_tokens=False)

In [None]:
print(type(tokenizer))
print(dir(tokenizer))

<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>


In [None]:
# Extract sentences with multiple RSIDs
sentences_with_multiple_rsids = []
rsid_token_ids_set = set(rsid_token_ids)

for pmid, values in tqdm(sampled_data.items(), desc="Extracting sentences with multiple RSIDs"):
    for input_ids in values["input_ids"]:
        rsid_count = len(set(input_ids) & rsid_token_ids_set)
        if rsid_count > 1:
            sentences_with_multiple_rsids.append((pmid, input_ids))

Extracting sentences with multiple RSIDs: 100%|██████████| 1076/1076 [00:00<00:00, 40386.15it/s]


In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

95

In [None]:
MASK_TOKEN = tokenizer.mask_token_id

In [None]:
model.to('cuda')

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(189044, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
!nvidia-smi

Sun Oct 22 15:04:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    32W /  70W |  10283MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Step 1: Calculate the embeddings for all the RSIDs within each sentence
sentence_embeddings = {}

for pmid, input_ids in tqdm(sentences_with_multiple_rsids, desc="Generating embeddings"):
    original_sentence = tokenizer.decode(input_ids, skip_special_tokens=True)
    embeddings = []

    input_ids_tensor = torch.tensor(input_ids).to('cuda')  # Convert to tensor
    attention_mask_tensor = (input_ids_tensor != tokenizer.pad_token_id).long()  # Create attention mask

    for rsid_id in rsid_token_ids:
        if rsid_id in input_ids_tensor:
            masked_input_ids = input_ids_tensor.clone()
            masked_input_ids[input_ids_tensor == rsid_id] = MASK_TOKEN
            with torch.no_grad():
                outputs = model(masked_input_ids.unsqueeze(0), attention_mask=attention_mask_tensor.unsqueeze(0))
            embeddings.append(outputs.last_hidden_state[0, masked_input_ids == MASK_TOKEN].mean(dim=0).cpu().numpy())

    # Step 2: Aggregate the embeddings (average them)
    if embeddings:
        average_embedding = np.mean(embeddings, axis=0)
        sentence_embeddings[pmid] = average_embedding

Generating embeddings: 100%|██████████| 571/571 [53:26<00:00,  5.62s/it]


In [None]:
# Save the sentence embeddings to a file
import pickle

with open("sentence_embeddings.pkl", "wb") as f:
    pickle.dump(sentence_embeddings, f)

In [None]:
# Step 3: Calculate cosine similarities with the embeddings from the rest of the corpus
similarities = {}

# Set the similarity threshold
similarity_threshold = 0.98

for pmid, embedding in tqdm(sentence_embeddings.items(), desc="Calculating similarities"):
    similarities[pmid] = {}
    for other_pmid, other_embedding in sentence_embeddings.items():
        if pmid != other_pmid:
            similarity = cosine_similarity([embedding], [other_embedding])[0][0]
            if similarity >= similarity_threshold:
                similarities[pmid][other_pmid] = similarity

Calculating similarities: 100%|██████████| 387/387 [00:35<00:00, 10.99it/s]


In [None]:
import json
import numpy as np

# Convert float32 values to float
similarities_float = {
    pmid: {pmid2: float(sim) for pmid2, sim in pmid_sim.items()}
    for pmid, pmid_sim in similarities.items()
}

# Store results in a JSON file
with open('similarities.json', 'w') as f:
    json.dump(similarities_float, f)

In [None]:
# Sort the similarities in descending order
sorted_similarities = {pmid: sorted(similarity_dict.items(), key=lambda x: x[1], reverse=True)[:5] for pmid, similarity_dict in similarities_float.items()}

# Print a sample of results
sample_pmid = list(sorted_similarities.keys())[0]
sample_results = {sample_pmid: sorted_similarities[sample_pmid]}

print(json.dumps(sample_results, indent=4))

{
    "36082566": [
        [
            "21304977",
            0.995354175567627
        ],
        [
            "35226426",
            0.9947450757026672
        ],
        [
            "11883940",
            0.9939204454421997
        ],
        [
            "12566567",
            0.993152916431427
        ],
        [
            "34482844",
            0.9931391477584839
        ]
    ]
}


In [None]:
!cp "./sampled_data.pkl" "/content/gdrive/My Drive/Ver S/New/"
!cp "./similarities.json" "/content/gdrive/My Drive/Ver S/New/"
!cp "./sentence_embeddings.pkl"  "/content/gdrive/My Drive/Ver S/New/"

In [None]:
# Store to SQL db

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


Method.


1. Load `similarities.json` and `rsid_sentences.json` from the local directory.
2. Extract sentences with multiple RSIDs.
3. Cross-check the top RSIDs and RSID pairs with the similarity scores.
4. Store the structured data (similarity scores associated with the top RSIDs and RSID pairs) in a database (in this example, we'll use SQLite for simplicity).



In [10]:
import json
import re
from collections import Counter
from itertools import combinations
import sqlite3

In [22]:
with open("/content/gdrive/My Drive/Ver S/New/similarities.json", "r") as file:
    similarities_data = json.load(file)

In [23]:
with open("/content/gdrive/My Drive/Ver S/rsid_sentences.json", "r") as file:
    rsid_sentences = {}
    for line in file:
        rsid_sentences.update(json.loads(line.strip()))

In [24]:
# 2. Extract sentences with multiple RSIDs
rsid_pattern = re.compile(r'rs\d+')
multi_rsid_sentences = {key: [sentence for sentence in sentences if len(rsid_pattern.findall(sentence)) >= 2] for key, sentences in rsid_sentences.items() if any(len(rsid_pattern.findall(sentence)) >= 2 for sentence in sentences)}


In [25]:
# 3. Cross-check the top RSID pairs (excluding pairs with identical RSIDs)
filtered_rsid_pair_list = [pair for pair in top_rsid_pair_list if pair[0] != pair[1]]

cross_checked_data = {}
for key, sentences in multi_rsid_sentences.items():
    for sentence in sentences:
        for rsid_pair in filtered_rsid_pair_list:
            if all(rs in sentence for rs in rsid_pair) and key in similarities_data:
                if rsid_pair not in cross_checked_data:
                    cross_checked_data[rsid_pair] = {}
                cross_checked_data[rsid_pair][key] = similarities_data[key]


In [26]:
# DB  store the structured data in a database
conn = sqlite3.connect("/content/gdrive/My Drive/Ver S/New/similarities.db")
cursor = conn.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS similarities (PMID INTEGER, rsid_pair TEXT, similarity_score REAL)''')

# Aggregate the data and compute the average similarity score
aggregated_data = {}
for rsid_pair, data in cross_checked_data.items():
    for PMID, similar_sentences in data.items():
        if (PMID, rsid_pair) not in aggregated_data:
            aggregated_data[(PMID, rsid_pair)] = []
        aggregated_data[(PMID, rsid_pair)].extend(similar_sentences.values())

# Calculate the average for each group and insert into the database
for (PMID, rsid_pair), scores in aggregated_data.items():
    avg_score = sum(scores) / len(scores)
    cursor.execute("INSERT INTO similarities (PMID, rsid_pair, similarity_score) VALUES (?, ?, ?)", (int(PMID), str(rsid_pair), avg_score))

# Order the data by PMID in ascending order
cursor.execute("SELECT * FROM similarities ORDER BY PMID ASC")

conn.commit()
conn.close()

print("db save success")

db save success


In [9]:
#End.