In [6]:
import re

# Source file to clean and destination for the cleaned copy
input_file = '/Users/abhayvaghasiya/Desktop/WORK/output_file_perturbed_KG_without_isA-1(1).txt'
output_file = '/Users/abhayvaghasiya/Desktop/WORK/cleaned_output.txt'

# Matches a wrapper of the form <https://dice-research.org/NAME>; group 1 captures NAME
iri_pattern = re.compile(r'<https://dice-research.org/(\w+)>')

# Stream the input line by line so the whole file is never held in memory
with open(input_file, 'r') as source, open(output_file, 'w') as sink:
    for raw_line in source:
        # Replace every wrapper with the bare name, then split on whitespace
        fields = iri_pattern.sub(r'\1', raw_line).split()

        # Only lines with exactly four whitespace-separated fields are kept
        if len(fields) != 4:
            continue

        # Emit the four fields tab-separated, one record per line
        sink.write('\t'.join(fields) + '\n')

print(f"Cleaned and aligned data has been written to {output_file}")


Cleaned and aligned data has been written to /Users/abhayvaghasiya/Desktop/WORK/cleaned_output.txt


In [56]:
import pandas as pd
import pickle

# ***** Highlight: Add your paths here *****

# Shared directory prefixes: change a location once here instead of in every path below
work_dir = '/Users/abhayvaghasiya/Desktop/WORK'
mapping_dir = work_dir + '/label_smoothing/KINSHIP/OUTPUT_file_KINSHIP_without_scalling/ComplEx_0.0_seed1'
pi_score_dir = work_dir + '/Without_coppel_Pi_score/KINSHIP_PI_score/0.2-0.8/0.0'

# Path to 'train.txt'
train_file_path = work_dir + '/Datasets_Perturbed_PI/KINSHIP/0.0/KINSHIP/train.txt'  # <-- Update this path

# Paths to entity and relation mappings (pickle files)
entity_to_idx_path = mapping_dir + '/entity_to_idx.p'  # <-- Update this path
relation_to_idx_path = mapping_dir + '/relation_to_idx.p'  # <-- Update this path

# Output paths
head_relation_tail_proportion_path = pi_score_dir + '/Relation_tail_proportion.txt'  # <-- Update this path
mapped_tuples_path = pi_score_dir + '/mapped_tuples.txt'  # <-- Update this path
matched_tuples_path = pi_score_dir + '/matched_tuples.txt'  # <-- Update this path

# ***** End of path definitions *****

# Step 1: Load 'train.txt' and compute proportions
# Read every column as plain text. dtype=str stops numeric-looking names from
# being converted to numbers, and keep_default_na=False stops names such as
# "NA" or "null" from being replaced with NaN.
data = pd.read_csv(
    train_file_path,
    header=None,
    sep='\t',
    names=['Head', 'Relation', 'Tail'],
    dtype=str,
    keep_default_na=False,
)

# Calculate the total counts of each relation in the dataset
relation_counts = data['Relation'].value_counts()

# For every row, the number of rows that share its (Relation, Tail) pair.
# One grouped pass replaces the former per-row scan of the whole frame.
relation_tail_counts = data.groupby(['Relation', 'Tail'])['Head'].transform('size')

# For every row, the total number of rows that share its Relation
relation_totals = data['Relation'].map(relation_counts)

# Share of a relation's rows that end in this row's tail. Each row counts
# itself, so the proportion lies in (0, 1].
proportions = relation_tail_counts / relation_totals

# Apply the transformation to the proportion: (0, 1] is mapped onto (0.8, 1.0]
transformed_proportions = 0.8 + 0.2 * proportions

# Write one line per input row: head, relation, tail and the transformed
# proportion (9 decimal places), tab-separated and in the original row order
with open(head_relation_tail_proportion_path, 'w') as file:
    file.writelines(
        f"{head}\t{relation}\t{tail}\t{score:.9f}\n"
        for head, relation, tail, score in zip(
            data['Head'], data['Relation'], data['Tail'], transformed_proportions
        )
    )

# Print a confirmation message
print(f"Details including head, relation, tail, and transformed proportion have been saved to {head_relation_tail_proportion_path}")

# Step 2: Load entity and relation mappings
# NOTE: unpickling can execute arbitrary code, so only load mapping files
# that come from a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

entity_to_idx = _read_pickle(entity_to_idx_path)
relation_to_idx = _read_pickle(relation_to_idx_path)

# Function to map an entity or relation to its index
def get_index(mapping, item, item_type):
    """Look up the index of an entity or relation in `mapping`.

    The item is normalised before the lookup: it is converted to text,
    lower-cased, and its spaces are replaced with underscores.

    Args:
        mapping: Dictionary-like object keyed by normalised name.
        item: Name to look up. Non-string values (for example integers that
            pandas inferred from a numeric-looking column) are converted
            with ``str`` instead of raising ``AttributeError``.
        item_type: Label used in the warning message, e.g. "Entity".

    Returns:
        The value stored in `mapping` under the normalised name, or ``None``
        (after printing a warning) when the name is absent.
    """
    formatted_item = str(item).lower().replace(' ', '_')

    # Guard clause: report the miss using the caller's original spelling
    if formatted_item not in mapping:
        print(f"Warning: {item_type} '{item}' not found in mapping.")
        return None

    return mapping[formatted_item]

# Map the tuples to indices and store in a dictionary.
# Keys are "head\trelation\ttail"; values are the text "[head_idx, relation_idx, tail_idx]".
# Repeated triples collapse onto a single key.
mapped_tuples = {}
for head, relation, tail in zip(data['Head'], data['Relation'], data['Tail']):
    # All three lookups always run so that every missing name is reported
    head_idx = get_index(entity_to_idx, head, "Entity")
    relation_idx = get_index(relation_to_idx, relation, "Relation")
    tail_idx = get_index(entity_to_idx, tail, "Entity")

    # Skip triples where any component has no index
    if head_idx is None or relation_idx is None or tail_idx is None:
        continue

    mapped_tuples[f"{head}\t{relation}\t{tail}"] = f"[{head_idx}, {relation_idx}, {tail_idx}]"

# Write mapped tuples to file, one "head\trelation\ttail\t[indices]" line each.
# The handle is deliberately not named `output_file`: that name holds a path
# string in the first cell, and rebinding it here would silently change its type.
with open(mapped_tuples_path, 'w') as mapped_file:
    for key, indices in mapped_tuples.items():
        # The key is already tab-separated, so it is written as-is
        mapped_file.write(f"{key}\t{indices}\n")

print(f"Mapped tuples have been written to {mapped_tuples_path}")

# Step 3: Match tuples and write to 'matched_tuples.txt'
# Load the proportions file written in step 1 into a dictionary keyed by its
# first three tab-separated fields; the fourth field is kept as text.
proportion_tuples = {}
with open(head_relation_tail_proportion_path, 'r') as proportions_file:
    for record in proportions_file:
        fields = record.strip().split('\t')

        # Lines without exactly four fields are ignored
        if len(fields) != 4:
            continue

        *triple_fields, proportion_text = fields
        proportion_tuples['\t'.join(triple_fields)] = proportion_text

# For every mapped triple that also has a proportion, write "<indices>\t<proportion>"
with open(matched_tuples_path, 'w') as matched_file:
    matched_file.writelines(
        f"{indices}\t{proportion_tuples[key]}\n"
        for key, indices in mapped_tuples.items()
        if key in proportion_tuples
    )

print(f"Matched tuples have been written to {matched_tuples_path}")


Details including head, relation, tail, and transformed proportion have been saved to /Users/abhayvaghasiya/Desktop/WORK/Without_coppel_Pi_score/KINSHIP_PI_score/0.2-0.8/0.0/Relation_tail_proportion.txt
Mapped tuples have been written to /Users/abhayvaghasiya/Desktop/WORK/Without_coppel_Pi_score/KINSHIP_PI_score/0.2-0.8/0.0/mapped_tuples.txt
Matched tuples have been written to /Users/abhayvaghasiya/Desktop/WORK/Without_coppel_Pi_score/KINSHIP_PI_score/0.2-0.8/0.0/matched_tuples.txt
