In [None]:
import pandas as pd
import networkx as nx

# Directory prefix (relative to the notebook) prepended to every parquet file name read or written below.
path = 'Data/'

In [None]:
import pickle5 as pickle
import time

In [None]:
# Load the query-result table. Later cells join it on 'doi' and read the columns
# bioActivity, collectionSpecie, collectionSite, collectionType and name from it.
df = pd.read_parquet('{}file-name_query03-05.parquet'.format(path))
df

In [None]:
# Load the per-document text table. Later cells read its 'text' and 'phrases'
# columns and join it with df on 'doi'.
texts_df = pd.read_parquet('{}topics03-05.parquet'.format(path))
texts_df

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def create_phrases_from_sentences(text, max_length=512):
    """Group the sentences of ``text`` into phrases of at most ``max_length`` characters.

    The text is split into sentences with ``sent_tokenize``; consecutive
    sentences are then joined with a single space for as long as the joined
    phrase stays within ``max_length`` characters.

    Args:
        text: Raw document text. Anything that is not a non-blank string
            (None, NaN, '') yields an empty list.
        max_length: Maximum number of characters of a joined phrase.
            Defaults to 512, the limit previously hard-coded here.

    Returns:
        list[str]: The phrases in document order. A single sentence longer
        than ``max_length`` is never split; it is emitted as a phrase of its
        own and is therefore the only kind of phrase that can exceed the limit.
    """
    # Missing cells would otherwise raise inside sent_tokenize.
    if not isinstance(text, str) or not text.strip():
        return []

    phrases = []
    current_phrase = []
    current_length = 0  # always equals len(' '.join(current_phrase))

    for sentence in sent_tokenize(text):
        sentence_length = len(sentence)

        # An over-long sentence is kept whole: flush what was accumulated so
        # far (to preserve order), then emit the sentence by itself.
        if sentence_length > max_length:
            if current_phrase:
                phrases.append(' '.join(current_phrase))
                current_phrase = []
                current_length = 0
            phrases.append(sentence)
            continue

        # The joining space only exists between sentences, so it is charged
        # only when the phrase already holds one. Charging it uniformly keeps
        # the packing independent of how the current phrase was started.
        separator = 1 if current_phrase else 0
        if current_length + separator + sentence_length <= max_length:
            current_phrase.append(sentence)
            current_length += separator + sentence_length
        else:
            # Only reachable with a non-empty current_phrase, because a lone
            # sentence of length <= max_length always fits.
            phrases.append(' '.join(current_phrase))
            current_phrase = [sentence]
            current_length = sentence_length

    # Flush the trailing phrase.
    if current_phrase:
        phrases.append(' '.join(current_phrase))

    return phrases

# Chunk every document's 'text' with create_phrases_from_sentences and store the
# resulting list of phrases in the new 'phrases_2' column, then display the table.
texts_df['phrases_2'] = texts_df['text'].apply(create_phrases_from_sentences)
texts_df

In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
from tqdm.notebook import tqdm
import logging

# Keep printed NumPy arrays short: arrays with more than 100 elements are summarized.
np.set_printoptions(threshold=100)

# Send INFO-level (and above) log records through sentence_transformers'
# LoggingHandler, prefixed with a 'YYYY-MM-DD HH:MM:SS' timestamp.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Load the pretrained multilingual DistilUSE sentence-embedding model by its
# model identifier (presumably fetched from the Hugging Face Hub on first use).
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

def model_encode(model, phrases_by_row=None):
    """Compute one embedding per row as the sum of its phrase embeddings.

    Args:
        model: Object exposing ``encode(text, show_progress_bar=...)`` and
            ``get_sentence_embedding_dimension()`` (a SentenceTransformer).
        phrases_by_row: Optional pandas Series whose cells are sequences of
            phrase strings. When omitted, the notebook-level
            ``texts_df['phrases']`` column is used, as before.

    Returns:
        list[numpy.ndarray]: One vector per row, in row order. A row without
        phrases (empty or missing cell) gets an all-zero vector.
    """
    if phrases_by_row is None:
        # NOTE(review): this embeds the 'phrases' column, not the 'phrases_2'
        # column built earlier in this notebook -- confirm which is intended.
        phrases_by_row = texts_df['phrases']

    # Size the accumulator from the model instead of assuming 512; fall back
    # to the previous constant if the model does not report a dimension.
    dimension = model.get_sentence_embedding_dimension() or 512

    embeddings = []
    for _, phrases in tqdm(phrases_by_row.items(), total=len(phrases_by_row)):
        embedding = np.zeros(dimension)
        # A missing cell (None / NaN) is not iterable; treat it as "no phrases".
        if isinstance(phrases, (list, tuple, np.ndarray)):
            for phrase in phrases:
                embedding += model.encode(phrase, show_progress_bar=False)
        embeddings.append(embedding)
    return embeddings

# Store the per-row vectors returned by model_encode in the 'embedding' column and display the table.
texts_df['embedding'] = model_encode(model)
texts_df

In [None]:
# Persist the table, including the columns added above, under the data directory.
texts_df.to_parquet('{}topics_embeddings.parquet'.format(path))

In [None]:
import pandas as pd
import numpy as np
import re






def find_element_positions(phrase, element):
    """Locate the first case-insensitive occurrence of ``element`` in ``phrase``.

    Matching is a plain substring search, so an element may also be found
    inside a longer word.

    Args:
        phrase: Text to search in.
        element: Entity surface form to look for.

    Returns:
        None when either argument is not a string, when ``element`` is blank,
        or when it does not occur in ``phrase``. Otherwise a dict with:

        * ``surface_char_pos``: array ``[start, end)`` of character offsets;
        * ``surface_form``: the lower-cased element;
        * ``surface_word_pos``: array ``[start, end)`` of whitespace-separated
          word indices covered by the match;
        * ``types``: an empty object array;
        * ``uri``: the lower-cased element.
    """
    if not isinstance(phrase, str) or not isinstance(element, str):
        return None
    if not element.strip():
        return None

    # Convert both to lowercase for case-insensitive matching
    phrase_lower = phrase.lower()
    element_lower = element.lower()

    char_pos = phrase_lower.find(element_lower)
    if char_pos == -1:
        return None

    # Word index of the first word touched by the match. Leading whitespace in
    # the element belongs to no word, so skip it before looking left.
    first_char = char_pos + len(element_lower) - len(element_lower.lstrip())
    start_word_pos = len(phrase_lower[:first_char].split())
    if first_char > 0 and not phrase_lower[first_char - 1].isspace():
        # The match starts in the middle of a word: that partially consumed
        # word was counted in the prefix, but it is the word the match lives
        # in, so step back onto it instead of pointing one word too far.
        start_word_pos -= 1
    end_word_pos = start_word_pos + len(element_lower.split())

    return {
        'surface_char_pos': np.array([char_pos, char_pos + len(element_lower)]),
        'surface_form': element_lower,
        'surface_word_pos': np.array([start_word_pos, end_word_pos]),
        'types': np.array([], dtype=object),
        'uri': element_lower
    }

def process_entities(df, texts_df):
    """Build phrase records and entity-mention records for every phrase.

    ``texts_df`` is left-joined with ``df`` on ``'doi'``. For each merged row,
    every phrase in ``'phrases_2'`` is normalised (digits and non-word
    characters collapsed to single spaces) and searched for each of the row's
    entity values.

    Args:
        df: Table with a ``'doi'`` column and the entity columns
            bioActivity, collectionSpecie, collectionSite, collectionType, name.
        texts_df: Table with ``'doi'`` and ``'phrases_2'`` (sequence of strings).

    Returns:
        tuple[list[dict], list[dict]]: ``(data, data_2)`` where

        * ``data`` holds one dict per entity found in a phrase, with keys
          ``'source_id'`` (phrase id), ``'entity_data'`` (result of
          ``find_element_positions``) and ``'uri'`` (lower-cased entity);
        * ``data_2`` holds one dict per phrase, with keys ``'label'``
          (phrase id) and ``'text_a'`` (lower-cased normalised phrase).
    """
    # NOTE(review): with how='left', a doi that occurs several times in df
    # repeats its phrases, each repetition under a new phrase id -- confirm
    # this duplication is wanted.
    merged_df = pd.merge(texts_df, df, on='doi', how='left')

    # Columns to check for entities
    entity_columns = ['bioActivity', 'collectionSpecie', 'collectionSite',
                      'collectionType', 'name']

    # Collapses every run of digits / non-word characters into one space.
    separator_run = re.compile("(\\d|\\W)+")

    data = []
    data_2 = []

    # Global counter for unique phrase IDs
    global_phrase_idx = 0

    for _, row in merged_df.iterrows():
        phrases = row['phrases_2']
        # ndarray is accepted too: list cells come back as arrays after a
        # parquet round trip.
        if not isinstance(phrases, (list, tuple, np.ndarray)):
            continue

        # The entity values do not depend on the phrase, so clean them once
        # per row. Non-string cells (NaN for rows the left join could not
        # match) are skipped instead of crashing the regex.
        elements = []
        for col in entity_columns:
            raw_value = row[col]
            if isinstance(raw_value, str):
                elements.append(separator_run.sub(" ", raw_value).strip())

        for phrase in phrases:
            if not isinstance(phrase, str):
                continue

            phrase = separator_run.sub(" ", phrase).strip()

            data_2.append({
                'label': global_phrase_idx,
                'text_a': phrase.lower()
            })

            for element in elements:
                positions = find_element_positions(phrase, element)
                if positions:
                    data.append({
                        'source_id': global_phrase_idx,
                        'entity_data': positions,
                        'uri': element.lower()
                    })

            # Increment the global counter after processing each phrase
            global_phrase_idx += 1

    return data, data_2

# Build the entity-mention records (data) and the phrase records (data_2) from the two loaded tables.
data, data_2 = process_entities(df, texts_df)

In [None]:
# Inspect the entity-mention records returned by process_entities.
data

In [None]:
# Inspect the phrase records returned by process_entities.
data_2

In [None]:
# One row per entity-mention record, one column per record key.
entities = pd.DataFrame(data)

In [None]:
# One row per phrase record, one column per record key.
texts = pd.DataFrame(data_2)

In [None]:
# Write the entity-mention table into the K-BERT scholarly dataset directory.
entities.to_parquet('../kims-bert/CODE/K-BERT/datasets/scholarly_dataset/scholarly__3000__train__entity_mapping_by_sentence.parquet')

In [None]:
# Write the phrase table as a tab-separated file (header row included, row index omitted).
texts.to_csv("../kims-bert/CODE/K-BERT/datasets/scholarly_dataset/scholarly__3000__train.tsv", sep="\t", index=False)

In [None]:
import pandas as pd
import pickle5 as pickle


# Accumulates the subject/predicate/object triples as tab-separated lines.
spo_triples = []


def add_triple_if_exists(subject, predicate, object_value):
    """Append ``subject<TAB>predicate<TAB>object_value`` to ``spo_triples``.

    Nothing is appended when either the subject or the object is missing
    (None / NaN) or an empty string, so that no triple with a blank end is
    ever written.

    Args:
        subject: Value placed in the first column of the triple.
        predicate: Relation name placed in the second column.
        object_value: Value placed in the third column.
    """
    def _is_present(value):
        return pd.notna(value) and value != ""

    if _is_present(subject) and _is_present(object_value):
        spo_triples.append(f"{subject}\t{predicate}\t{object_value}")

# Turn every row of df into up to five triples of the form
# (cleaned entity value, column name, doi).
spo_entity_columns = ['bioActivity', 'collectionSpecie', 'collectionSite',
                      'collectionType', 'name']
# Collapses every run of digits / non-word characters into one space.
spo_separator_run = re.compile("(\\d|\\W)+")

for _, row in df.iterrows():
    doi = row['doi']

    for column in spo_entity_columns:
        raw_value = row[column]
        # Missing cells (None / NaN) are not strings and carry no entity.
        if not isinstance(raw_value, str):
            continue

        # Same normalisation for every column: strip digits/punctuation,
        # trim, then lower-case.
        entity = spo_separator_run.sub(" ", raw_value).strip().lower()

        # A value made only of digits/punctuation cleans down to '', which
        # would otherwise produce a triple with an empty subject.
        if entity:
            add_triple_if_exists(entity, column, doi)

# Write the triples to the .spo file: a header line followed by one
# tab-separated triple per line (no trailing newline after the last one).
# NOTE(review): confirm the consumer of this file expects the header row.
spo_output_path = '../kims-bert/CODE/K-BERT/brain/kgs/KG_3cols.spo'
with open(spo_output_path, 'w', encoding='utf-8') as f:
    f.write("sub\tpred\tobj\n")  # Header
    f.write('\n'.join(spo_triples))

# Report the file that was actually written.
print("Knowledge graph has been created in {}".format(spo_output_path))