In [None]:
#Start

#### Code Summary

The sentences are tokenized and embeddings are extracted.  We Use unsupervised learning (kmeans) to segment tokenised sentences into 2 groups. The idea is that one group is positive associations and one group is negative
Use the labels 1 and 0 to fine tune the bert model (1 is not neccessarily positive). We later fine tune with the clustered data from KMeans.

A directed networkx graph is constructed to represent the relations. The RSIDs extracted from the sentences are added as nodes, and each relation's source/target RSIDs are connected by a directed edge. The Fruchterman-Reingold force-directed layout algorithm from networkx is used to position the nodes spatially in two dimensions. This helps reveal the network structure and connections.

To results are stored in local SQLite database file is initialized.

In [74]:
#Mount Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[0mCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
Coll

In [None]:
import pickle
import numpy as np
from sklearn.cluster import KMeans
from transformers import BertModel, BertTokenizer, BertTokenizerFast, BertForSequenceClassification, AdamW, BertConfig
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
import warnings
warnings.filterwarnings("ignore")

In [None]:
# 1. Loading data
with open('/content/drive/MyDrive/CSIRO Text Mining/tokenized_rsid_sentences.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

In [None]:
# 2. Pre-processing
rsid_embeddings = loaded_data
embeddings_array = np.array([np.array(embedding[0]) for embedding in rsid_embeddings.values()])
NUM_CLUSTERS = 2
kmeans = KMeans(n_clusters=NUM_CLUSTERS)
pseudo_labels = kmeans.fit_predict(embeddings_array)

rsids = list(rsid_embeddings.keys())
labeled_data = list(zip(rsids, pseudo_labels))
labeled_dict = dict(labeled_data)

input_ids = []
attention_masks = []
labels = []

In [None]:
for rsid, token_ids in loaded_data.items():
    input_ids.append(torch.tensor(token_ids[0]))
    attention_mask = [1 if id != 0 else 0 for id in token_ids[0]]
    attention_masks.append(torch.tensor(attention_mask))

    label = labeled_dict.get(rsid, None)
    if label is None:
        raise ValueError(f"No label found for RSID {rsid}")
    labels.append(label)

input_ids = torch.stack(input_ids, dim=0)
attention_masks = torch.stack(attention_masks, dim=0)
labels = torch.tensor(labels)

In [None]:
# 3. Splitting dataset
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)

In [None]:
# Load the rsid_tokens list from the specified file using pickle
file_path = "/content/drive/My Drive/CSIRO Text Mining/rsid_tokens.pkl"

with open(file_path, 'rb') as f:
    rsid_tokens = pickle.load(f)
print(f"Loaded rsid_tokens from {file_path}")

Loaded rsid_tokens from /content/drive/My Drive/CSIRO Text Mining/rsid_tokens.pkl


In [None]:
# Load tokenizer and model directly from the pre-trained version
tokenizer = BertTokenizerFast.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
model = BertModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Add RSID tokens to tokenizer and resize model embeddings
tokenizer.add_tokens(rsid_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(189044, 768)

In [None]:
# Saving the tokenizer and model locally on Google Colab
tokenizer.save_pretrained('./local_tokenizer/')
model.save_pretrained('./local_model/')

In [None]:
# Setting up the device to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# 4. Initializing model and optimizer
model_path = './local_model/'

In [None]:
import gc
from torch.nn.utils import clip_grad_norm_

In [None]:
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./local_model/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Parameters
BATCH_SIZE = 1  # Batch size set to 1
GRADIENT_ACCUMULATE_EVERY = 1  # Update weights after each batch
MAX_GRAD_NORM = 1.0  # Gradient clipping

In [None]:
# Constants
GRADIENT_ACCUMULATE_EVERY = 4
MAX_GRAD_NORM = 1.0

labels = torch.tensor(labels).long()

# Create DataLoader
from torch.utils.data import DataLoader, TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)

# Training loop with gradient accumulation
NUM_EPOCHS = 3
model.train()
optimizer.zero_grad()  # Reset gradients

for epoch in range(NUM_EPOCHS):
    total_loss = 0.0

    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device).long()  # Convert each batch of labels to long

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
        loss = outputs.loss
        loss = loss / GRADIENT_ACCUMULATE_EVERY  # Normalize loss
        loss.backward()

        total_loss += loss.item()

        if (step + 1) % GRADIENT_ACCUMULATE_EVERY == 0:
            clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)  # Gradient clipping
            optimizer.step()
            optimizer.zero_grad()  # Reset gradients for the next accumulation

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Training Loss: {avg_train_loss}")

    # Free memory and force garbage collection
    del batch_input_ids, batch_attention_masks, batch_labels
    gc.collect()

Epoch 1/3 - Training Loss: 0.04435492484444888
Epoch 2/3 - Training Loss: 0.020995943361771713
Epoch 3/3 - Training Loss: 0.00863374314675369


In [None]:
torch.cuda.empty_cache()

In [None]:
# 6. Evaluation loop
model.eval()
total_eval_accuracy = 0

for batch in val_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
    logits = outputs.logits

    predictions = torch.argmax(logits, dim=1)
    total_eval_accuracy += (predictions == batch_labels).sum().item()

avg_val_accuracy = total_eval_accuracy / len(val_dataset)
print(f"Validation Accuracy: {avg_val_accuracy:.2f}")

In [None]:
#Saving the model
model.save_pretrained("./my_model/")
tokenizer.save_pretrained("./my_tokenizer/")

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/vocab.txt',
 './my_tokenizer/added_tokens.json',
 './my_tokenizer/tokenizer.json')

In [None]:
!cp -r ./my_model/ "/content/drive/MyDrive/CSIRO Text Mining/"
!cp -r ./my_tokenizer/ "/content/drive/MyDrive/CSIRO Text Mining/"

In [None]:
# Predictions

In [None]:
import torch
import pickle
from torch.utils.data import DataLoader, TensorDataset
import gc
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig

In [None]:
if isinstance(loaded_data, dict):
    first_key = next(iter(loaded_data))
    print(first_key, ":", loaded_data[first_key])

8702993 : [[2, 42, 9510, 5953, 3056, 6903, 8637, 1786, 15, 116319, 15, 2029, 2488, 12010, 2108, 17, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Display the first few items
sample_items = 5
for i, (pmid, loaded_data) in enumerate(loaded_data.items()):
    if i >= sample_items:
        break
    print(f"PMID: {pmid}")
    for sentence in loaded_data:
        print(tokenizer.decode(sentence))
    print('-' * 50)

PMID: 8702993
[CLS] a naturally occurring genetic variant hlpl, rs6684819, has normal bridge function. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [None]:
with open('/content/drive/MyDrive/CSIRO Text Mining/tokenized_rsid_sentences.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

In [None]:
# extract relations from tokenized abstracts
def extract_relations(tokenized_abstracts, model, device):
    relations = []

    # Loop through the tokenized abstracts
    for pmid, abstracts in tokenized_abstracts.items():
        for sentence_tokens in abstracts:
            # Tokenize the sentence for prediction
            inputs = {
                "input_ids": torch.tensor(sentence_tokens).unsqueeze(0).to(device),  # Move to GPU
                "attention_mask": torch.ones(len(sentence_tokens)).unsqueeze(0).to(device),  # Move to GPU
            }

            # Make a prediction with the model
            with torch.no_grad():
                inputs["input_ids"] = inputs["input_ids"].to(device)
                inputs["attention_mask"] = inputs["attention_mask"].to(device)
                outputs = model(**inputs)

            # Process the model's output
            # binary classification
            predicted_label = torch.argmax(outputs.logits, dim=1).item()
            confidence_score = torch.softmax(outputs.logits, dim=1)[0][predicted_label].item()
            sentence_text = tokenizer.decode(sentence_tokens, skip_special_tokens=True)
            relation = {
                "PMID": pmid,
                "Sentence": sentence_text,
                "Predicted_Label": predicted_label,
                "Confidence_Score": confidence_score,
            }
            relations.append(relation)

    return relations

In [None]:
# Load the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('./my_tokenizer/')  # Load the saved tokenizer
model = BertForSequenceClassification.from_pretrained('./my_model/')  # Load the saved model

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(189044, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
extracted_relations = extract_relations(loaded_data, model, device)

In [None]:
extracted_relations[:5]

[{'PMID': '8702993',
  'Sentence': 'a naturally occurring genetic variant hlpl, rs6684819, has normal bridge function.',
  'Predicted_Label': 1,
  'Confidence_Score': 0.9288475513458252},
 {'PMID': '26733247',
  'Sentence': "in this study, the effect of astrocytic overexpression of a mutant app on the excitatory synaptic transmission was investigated using coculture system of the transgenic ( tg ) cortical astrocytes that express the human app695 polypeptide with the double mutation rs2122627 + rs910419302 found in a large swedish family with early onset alzheimer's disease, and wild - type hippocampal neuron.",
  'Predicted_Label': 1,
  'Confidence_Score': 0.9370633363723755},
 {'PMID': '17999203',
  'Sentence': 'study, we found that the rs11642015 allele was associated with polydipsia in our sample ( chi2 = 8. 00, df = 1, p = 0. 0047 ; or = 0. 53 ; 95 % ci = 0. 34 - 0. 83 ).',
  'Predicted_Label': 1,
  'Confidence_Score': 0.915960967540741},
 {'PMID': '12420099',
  'Sentence': 'clini

In [None]:
import pickle
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to save the extracted relations data
output_path = '/content/drive/MyDrive/CSIRO Text Mining/extracted_relations.pkl'

# Store extracted_relations data in a file
with open(output_path, 'wb') as f:
    pickle.dump(extracted_relations, f)

# Confirm that the data is saved
print(f"Extracted relations data saved to: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracted relations data saved to: /content/drive/MyDrive/CSIRO Text Mining/extracted_relations.pkl


In [None]:
#Output

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(extracted_relations)

In [None]:
random_sample = df.sample(n=10)
print(random_sample)

           PMID                                           Sentence  \
15970  24509856  the rs41317471 receptor is a class a / rhodops...   
10905  30538125  recently, the rs75319568 snp ( asp84asn substi...   
14437  31428969  293t cells transfected with jak2 cdna carrying...   
17431  17126328  a human pdx - 1 mutation associated with diabe...   
13177   9731023  they had either elevated sweat chloride concen...   
9285   14711599  in a retrospective study of hiv patients under...   
15130  21511889  conversely, the association between one of the...   
15622  25561229  kras mutations do not appear to influence the ...   
10886  23036980  following thorough examinations, he was diagno...   
6196   19897031  we found a repressive effect of the rs39812237...   

       Predicted_Label  Confidence_Score  
15970                1          0.928926  
10905                1          0.956529  
14437                1          0.934695  
17431                1          0.925384  
13177         

In [None]:
#Visualise

In [1]:
import pickle
from google.colab import drive

# Mount Google Drive (if not already mounted)
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# Define the path to the pkl file on Google Drive
pkl_file_path = '/content/drive/MyDrive/CSIRO Text Mining/extracted_relations.pkl'

# Read the extracted_relations from the pkl file
with open(pkl_file_path, 'rb') as pkl_file:
    extracted_relations = pickle.load(pkl_file)

In [41]:
import random
import re
import networkx as nx
import plotly.graph_objects as go

In [66]:
# Filter relations with high confidence scores, I'm using more that 96%
high_confidence_relations = [rel for rel in extracted_relations if rel['Confidence_Score'] > 0.96]

# Randomly select 10 relations from the high-confidence ones (or choose a different number)
random_10_relations = random.sample(high_confidence_relations, 10)

# Network visualization
G = nx.DiGraph()

# Add nodes (RSID) and edges (relations) to the graph
for relation in random_10_relations:
    sentence = relation['Sentence']
    rsids = re.findall(r'rs\d+', sentence)  # Extract RSIDs from the sentence
    source_rsid = rsids[0] if rsids else None  # Assuming the first RSID is the source
    target_rsid = rsids[1] if len(rsids) > 1 else None  # Assuming the second RSID is the target

    if source_rsid:
        G.add_node(source_rsid)

    if target_rsid:
        G.add_edge(source_rsid, target_rsid)

In [67]:
# Create a layout for the graph using the Fruchterman-Reingold layout
layout = nx.spring_layout(G, seed=42)

# Create node and edge traces for visualization
node_trace = go.Scatter(
    x=[layout[node][0] for node in G.nodes()],
    y=[layout[node][1] for node in G.nodes()],
    mode='markers+text',
    text=[node for node in G.nodes()],
    textposition='top center',
    marker=dict(
        size=10,
        color='blue',
    ),
)

edge_trace = go.Scatter(
    x=[layout[source_rsid][0] for source_rsid, target_rsid in G.edges()],
    y=[layout[source_rsid][1] for source_rsid, target_rsid in G.edges()],
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)

In [68]:
# Create the figure with node and edge traces - Fruchterman-Reingold Layout
fig = go.Figure(data=[edge_trace, node_trace])

# Add title and axis labels
fig.update_layout(
    title="Network Visualization of 10 Sample RSID Relations",
    xaxis_title="X Coordinate (Fruchterman-Reingold Layout)",
    yaxis_title="Y Coordinate (Fruchterman-Reingold Layout)"
)


# Show the interactive graph
fig.show()

The axes in the graph generated by the provided code represent the spatial positions of the nodes in a two-dimensional plane. Specifically:

- The **x-axis** represents the horizontal position of the nodes.
- The **y-axis** represents the vertical position of the nodes.

These positions are determined by the `nx.spring_layout(G, seed=42)` line in the code, which computes the layout for the graph using the Fruchterman-Reingold force-directed algorithm. This algorithm tries to position the nodes in such a way that:
- Nodes connected by an edge are drawn close together.
- Nodes not connected by an edge are repelled apart.

The idea is to visualize the network in a way that reveals the structure and relationships between the nodes, making it easier to understand.

The axes do not represent any specific real-world measurements or scales. Instead, they are just spatial coordinates to help visualize the relationships and structures in the graph.

In [None]:
#Store Results to database

In [69]:
import sqlite3

In [70]:
database_file = '/content/drive/MyDrive/CSIRO Text Mining/RSID-Assosiations-1.db'

# Create a connection to the SQLite database
conn = sqlite3.connect(database_file)
cursor = conn.cursor()

# Create a table in the database
cursor.execute('''
    CREATE TABLE IF NOT EXISTS extracted_relations (
        PMID TEXT,
        Sentence TEXT,
        Predicted_Label INTEGER,
        Confidence_Score REAL,
        RSID TEXT
    )
''')

<sqlite3.Cursor at 0x7cc1ce74afc0>

In [71]:
# Extract RSID from the sentence
def extract_rsid(sentence):
    rsid_match = re.search(r'rs\d+', sentence)
    if rsid_match:
        return rsid_match.group(0)
    else:
        return None

In [72]:
# Insert data into the table
for data in extracted_relations:
    rsid = extract_rsid(data['Sentence'])
    cursor.execute('INSERT INTO extracted_relations (PMID, Sentence, Predicted_Label, Confidence_Score, RSID) VALUES (?, ?, ?, ?, ?)',
                   (data['PMID'], data['Sentence'], data['Predicted_Label'], data['Confidence_Score'], rsid))

# Commit the changes and close the connection
conn.commit()
conn.close()

print("Data has been successfully inserted into the SQL database.")

Data has been successfully inserted into the SQL database.


In [73]:
#END