In [None]:
# Import necessary library
import pandas as pd

# Load the (head, relation, tail) triples from the tab-separated training file.
data = pd.read_csv('KGs/UMLS/train.txt', header=None, sep='\t', names=['Head', 'Relation', 'Tail'])

# For every triple, count how many triples in the file share its
# (Relation, Tail) pair; the triple itself is included in that count.
# One grouped transform broadcasts each group's size back onto its member
# rows in the original row order. This replaces a per-row scan of the whole
# frame, which was quadratic in the number of triples.
# NOTE(review): rows whose Relation or Tail was parsed as NaN are excluded
# from grouping; on current pandas they come back as NaN and are mapped to 0
# below, which is what an `==` comparison against NaN also yields — confirm
# if the input can contain empty fields.
counts = (
    data.groupby(['Relation', 'Tail'])['Relation']
    .transform('size')
    .fillna(0)
    .astype(int)
    .tolist()
)

# One count per input row, in input order.
counts_df = pd.DataFrame(counts, columns=['Count'])

# Write the counts as a single headerless column, one value per line.
counts_df.to_csv('KGs/UMLS/counts.txt', index=False, header=False)

# Print a confirmation message
print("Counts have been saved to counts.txt")


In [6]:
import re

# Source file containing URI-wrapped names, and destination for the cleaned copy
input_file = '/Users/abhayvaghasiya/Desktop/WORK/output_file_perturbed_KG_without_isA-1(1).txt'
output_file = '/Users/abhayvaghasiya/Desktop/WORK/cleaned_output.txt'

# Stream the input once and write only the well-formed rows to the output
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for raw_line in infile:
        # Unwrap each <https://dice-research.org/NAME> token down to NAME,
        # then break the line on any run of whitespace
        fields = re.sub(r'<https://dice-research.org/(\w+)>', r'\1', raw_line).split()

        # Rows that do not consist of exactly four fields are dropped silently
        if len(fields) != 4:
            continue

        # Emit the four fields tab-separated, one row per line
        outfile.write('\t'.join(fields) + '\n')

print(f"Cleaned and aligned data has been written to {output_file}")


Cleaned and aligned data has been written to /Users/abhayvaghasiya/Desktop/WORK/cleaned_output.txt


In [17]:
import pickle

# Unpickle the two lookup objects saved by the experiment run.
# They are used below with `in` and `[]`, i.e. as name -> index mappings.
# NOTE: pickle.load can execute arbitrary code while loading; only point
# these paths at files from a trusted source.
with open('/Users/abhayvaghasiya/Desktop/WORK/dice_repos/dice-embeddings_loss_function/Experiments/2024-09-26 01-49-44.486888/entity_to_idx.p', 'rb') as entity_file:
    entity_to_idx = pickle.load(entity_file)

with open('/Users/abhayvaghasiya/Desktop/WORK/dice_repos/dice-embeddings_loss_function/Experiments/2024-09-26 01-49-44.486888/relation_to_idx.p', 'rb') as relation_file:
    relation_to_idx = pickle.load(relation_file)

# Look up the index of an entity or relation name in a mapping
def get_index(mapping, item, item_type):
    """Return the value stored in ``mapping`` for a normalized ``item``.

    The name is lower-cased and its spaces are replaced with underscores
    before the lookup.

    Args:
        mapping: Container supporting ``in`` and ``[]`` keyed by normalized name.
        item: Raw name as a string.
        item_type: Label used only in the warning text (e.g. "Entity").

    Returns:
        ``mapping[normalized_name]`` when present; otherwise ``None`` after
        printing a warning that quotes the original, un-normalized ``item``.
    """
    lookup_key = item.lower().replace(' ', '_')

    # Guard clause: report the miss and bail out early
    if lookup_key not in mapping:
        print(f"Warning: {item_type} '{item}' not found in mapping.")
        return None

    return mapping[lookup_key]

# Define the output file path
output_file_path = '/Users/abhayvaghasiya/Desktop/WORK/mapped_tuples.txt'

# Read train.txt and, for every triple whose head, relation and tail all
# resolve through the mappings, write one line of the form
#   head<TAB>relation<TAB>tail<TAB>[head_idx, relation_idx, tail_idx]
# Triples with any unresolved part are omitted (get_index prints the warning).
# The output handle has its own name so it does not rebind `output_file`,
# which other cells use for a path string.
with open('/Users/abhayvaghasiya/Desktop/WORK/UMLS/train.txt', 'r') as train_file, \
     open(output_file_path, 'w') as mapped_file:

    for line_number, line in enumerate(train_file, start=1):
        stripped = line.strip()

        # Blank lines carry no triple; skip them instead of failing to unpack
        if not stripped:
            continue

        fields = stripped.split('\t')

        # A bare three-way unpack would raise ValueError on a malformed row and
        # abort the run with a partially written file; report and skip instead
        if len(fields) != 3:
            print(f"Warning: line {line_number} has {len(fields)} tab-separated fields (expected 3); skipped.")
            continue

        head, relation, tail = fields

        head_idx = get_index(entity_to_idx, head, "Entity")
        relation_idx = get_index(relation_to_idx, relation, "Relation")
        tail_idx = get_index(entity_to_idx, tail, "Entity")

        if head_idx is not None and relation_idx is not None and tail_idx is not None:
            # Write the tuple with tabs and format the indices as a list
            mapped_file.write(f"{head}\t{relation}\t{tail}\t[{head_idx}, {relation_idx}, {tail_idx}]\n")

print(f"Mapped tuples have been written to {output_file_path}")


Mapped tuples have been written to /Users/abhayvaghasiya/Desktop/WORK/mapped_tuples.txt


In [15]:
# Inputs: tuples with their index lists, and tuples with their values.
# Output: index list and value for every tuple present in both inputs.
file1 = '/Users/abhayvaghasiya/Desktop/WORK/mapped_tuples.txt'  # head\trelation\ttail\t[indices]
file2 = '/Users/abhayvaghasiya/Desktop/WORK/scaled_cleaned_output.txt'  # head\trelation\ttail\tvalue
output_file = '/Users/abhayvaghasiya/Desktop/WORK/matched_tuples.txt'

# Pass 1: index the first file by its tab-joined (head, relation, tail) text.
# A tuple that appears more than once keeps the indices from its last row.
mapped_tuples = {}
with open(file1, 'r') as f1:
    for record in f1:
        fields = record.strip().split('\t')
        if len(fields) != 4:
            continue  # not a "tuple + indices" row
        *triple, indices = fields
        mapped_tuples['\t'.join(triple)] = indices

# Pass 2: walk the second file and emit "indices<TAB>value" for each tuple
# that was also seen in the first file; unmatched rows are ignored.
with open(file2, 'r') as f2, open(output_file, 'w') as outfile:
    for record in f2:
        fields = record.strip().split('\t')
        if len(fields) != 4:
            continue  # not a "tuple + value" row
        *triple, value = fields
        lookup = '\t'.join(triple)
        if lookup in mapped_tuples:
            outfile.write(f"{mapped_tuples[lookup]}\t{value}\n")

print(f"Matched tuples have been written to {output_file}")


Matched tuples have been written to /Users/abhayvaghasiya/Desktop/WORK/matched_tuples.txt
