In [None]:

import json

# Load entity mappings from entity2text.txt: one "<entity_id>\t<entity_text>"
# pair per line, collected into entity_dict.
entity_dict = {}
with open('entity2text.txt', 'r', encoding='utf-8') as f:
    for line in f:
        record = line.strip()
        if not record:
            # Blank lines (e.g. a stray newline at the end of the file) carry
            # no mapping; skip them instead of failing the unpack below.
            continue
        # Split on the first tab only, so a description that itself contains
        # a tab stays in one piece instead of breaking the two-value unpack.
        entity_id, entity_text = record.split('\t', 1)
        entity_dict[entity_id] = entity_text

# Load relation mappings from alignment_clean.txt, which is parsed as JSON.
# NOTE(review): relation_dict is later queried with .get(), so the file
# presumably holds a JSON object keyed by relation ID - confirm.
with open('alignment_clean.txt', 'r', encoding='utf-8') as f:
    relation_dict = json.load(f)

def replace_in_file(input_file, output_file, max_lines=None):
    """Rewrite a tab-separated triple file with IDs replaced by readable text.

    Each input line is split on tabs into head, relation and tail. Head and
    tail are looked up in the module-level ``entity_dict`` and the relation in
    the module-level ``relation_dict``; an ID without a mapping is written
    unchanged. Every triple is emitted as a ``Head:`` / ``Relation:`` /
    ``Tail:`` record followed by an empty line.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: Path of the text file to write (opened in 'w' mode, so
            any existing content is replaced).
        max_lines: If not None, stop once this many input lines have been
            read. Skipped lines count toward the limit.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile):
            if max_lines is not None and i >= max_lines:
                break

            parts = line.strip().split('\t')
            if len(parts) < 3:
                # Blank or truncated line: there is no full triple to map.
                # Skip it (as the other parsers in this notebook do) rather
                # than raising IndexError on parts[1] / parts[2].
                continue

            # Map the head, relation, and tail entities
            head_entity = entity_dict.get(parts[0], parts[0])
            relation_description = relation_dict.get(parts[1], parts[1])
            tail_entity = entity_dict.get(parts[2], parts[2])

            # Write each entry to the output file
            outfile.write(f"Head: {head_entity}\nRelation: {relation_description}\nTail: {tail_entity}\n\n")

# Replace entities and relations in train.tsv; the readable triples are
# written to train_replaced.txt (max_lines is left at its default of None).
replace_in_file('train.tsv', 'train_replaced.txt')

# Replace entities and relations in test.tsv, writing test_replaced.txt
replace_in_file('test.tsv', 'test_replaced.txt')

In [None]:
def load_entities_from_train(train_file, max_lines=100000):
    """Gather the distinct head and tail IDs found at the start of a triple file.

    Args:
        train_file: Path of a tab-separated file whose first and third
            columns are the head and tail of each triple.
        max_lines: Number of leading lines to inspect; reading stops once
            this many lines have been consumed.

    Returns:
        A set with every first-column and third-column value seen on lines
        that have at least three tab-separated fields.
    """
    seen = set()
    with open(train_file, 'r', encoding='utf-8') as handle:
        for line_no, raw in enumerate(handle):
            if line_no >= max_lines:
                break
            fields = raw.strip().split('\t')
            if len(fields) < 3:
                # Not a full triple; nothing to record.
                continue
            seen.update((fields[0], fields[2]))
    return seen

def filter_test_file(test_file, output_file, train_entities):
    """Copy the triples of test_file whose head and tail are both known.

    Args:
        test_file: Path of the tab-separated triple file to read.
        output_file: Path of the file that receives the surviving lines,
            written exactly as they were read.
        train_entities: Container supporting ``in``; a line is kept only when
            both its first and third fields are members.
    """
    with open(test_file, 'r', encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as dst:
        for raw in src:
            fields = raw.strip().split('\t')
            if len(fields) < 3:
                continue
            if fields[0] not in train_entities or fields[2] not in train_entities:
                # At least one endpoint is unknown: drop the triple.
                continue
            dst.write(raw)

# Load entities from the first 100,000 rows of train.tsv (the default max_lines)
train_entities = load_entities_from_train('train.tsv')

# Disabled experiment: down-sample train_entities to 200 random IDs.
#train_entities = random.sample(train_entities, 200)

# Keep only the test.tsv lines whose head and tail both occur in
# train_entities; the surviving lines are written to filtered_test.tsv.
filter_test_file('test.tsv', 'filtered_test.tsv', train_entities)

In [None]:
import random

def load_entities_from_train(train_file, max_lines=100000):
    """Collect the unique head and tail IDs from a tab-separated triple file.

    Args:
        train_file: Path of the file to read. The first and third
            tab-separated fields of each line are taken as head and tail.
        max_lines: Maximum number of leading lines to read. ``None`` means
            no limit, the same convention ``replace_in_file`` uses for its
            ``max_lines`` argument.

    Returns:
        A set of every head and tail value found on lines with at least
        three fields; shorter lines are ignored.
    """
    entities = set()
    with open(train_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Guard the comparison so max_lines=None reads the whole file
            # instead of raising TypeError on `i >= None`.
            if max_lines is not None and i >= max_lines:
                break
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                head_entity, tail_entity = parts[0], parts[2]
                entities.add(head_entity)
                entities.add(tail_entity)
    return entities

def filter_test_file(test_file, output_file, train_entities):
    """Filter a triple file by known entities and count the relations kept.

    Args:
        test_file: Path of the tab-separated triple file to read.
        output_file: Path of the file that receives the surviving lines,
            written exactly as they were read.
        train_entities: Container supporting ``in``; a line survives only if
            both its first and third fields are members.

    Returns:
        The number of distinct second-field (relation) values among the
        lines that were written.
    """
    relations_kept = set()
    with open(test_file, 'r', encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as dst:
        for raw in src:
            fields = raw.strip().split('\t')
            if len(fields) < 3:
                continue
            head, rel, tail = fields[:3]
            if head in train_entities and tail in train_entities:
                dst.write(raw)
                relations_kept.add(rel)
    return len(relations_kept)

# Load entities from the first 100,000 rows of train.tsv
train_entities = load_entities_from_train('train.tsv')

# Iteratively sample entities until the filtered test set covers at least
# `desired_relations_count` unique relations, growing the sample by 10 each try.
desired_relations_count = 100
sample_size = 10

# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from Python 3.11. Sorting gives a stable population order.
# NOTE: no random seed is set, so the sampled entities differ between runs.
entity_pool = sorted(train_entities)

while True:
    # Randomly sample a subset of entities (never more than are available)
    sampled_entities = random.sample(entity_pool, min(sample_size, len(entity_pool)))

    # Filter test.tsv based on these sampled entities. A set is passed so each
    # per-line membership check is a hash lookup instead of a list scan.
    num_relations = filter_test_file('test.tsv', 'filtered_test.tsv', set(sampled_entities))

    # Check if we have at least the desired number of unique relations
    if num_relations >= desired_relations_count:
        print(f"Achieved {num_relations} unique relations with {len(sampled_entities)} sampled entities.")
        break

    # Once the sample already contains every entity, a bigger sample_size
    # cannot add relations; stop here instead of looping forever.
    if len(sampled_entities) >= len(entity_pool):
        print(f"Only {num_relations} unique relations are reachable with all {len(entity_pool)} entities; stopping.")
        break

    # Increase sample size incrementally to reach desired relation count
    sample_size += 10

# Output the final set of sampled entities
print("Final set of sampled entities:", sampled_entities)

In [None]:
# Load entity mappings from entity2text.txt
entity_dict = {}
with open('entity2text.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # ignore blank lines, which have no ID/text pair to unpack
            # maxsplit=1: only the first tab separates the ID from its text,
            # so a tab inside the text no longer breaks the unpack.
            entity_id, entity_text = line.split('\t', 1)
            entity_dict[entity_id] = entity_text

# Load relation mappings from alignment_clean.txt (parsed as JSON)
with open('alignment_clean.txt', 'r', encoding='utf-8') as f:
    relation_dict = json.load(f)

def replace_and_format_for_llm(input_file, output_file):
    """Turn a triple file into tail-prediction prompts.

    For every line with at least three tab-separated fields, the head ID is
    looked up in the module-level ``entity_dict`` and the relation ID in the
    module-level ``relation_dict`` (falling back to the raw ID when there is
    no mapping). The tail is never written; a fixed placeholder takes its
    place.

    Args:
        input_file: Path of the tab-separated triple file to read.
        output_file: Path of the prompt file to write.
    """
    with open(input_file, 'r', encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as dst:
        for raw in src:
            fields = raw.strip().split('\t')
            if len(fields) < 3:
                # Incomplete triple: produce no prompt for it.
                continue

            head_id = fields[0]
            rel_id = fields[1]

            # Swap IDs for their text, keeping the ID when it is unmapped
            head_text = entity_dict.get(head_id, head_id)
            rel_text = relation_dict.get(rel_id, rel_id)

            dst.write(f"Head: {head_text}\nRelation: {rel_text}\nTail: [Predict the missing entity]\n\n")

# Replace entities and relations in filtered_test.tsv and write the
# tail-prediction prompts for the LLM to llm_input.txt
replace_and_format_for_llm('filtered_test.tsv', 'llm_input.txt')

In [None]:
def extract_tails_from_predict(predict_file):
    """Read the predicted tails out of a prediction file.

    Args:
        predict_file: Path of a text file; only lines that begin with
            ``'Tail: '`` are used.

    Returns:
        A list, in file order, of the text following ``'Tail: '`` on each
        such line, with surrounding whitespace stripped.
    """
    prefix = 'Tail: '
    with open(predict_file, 'r', encoding='utf-8') as handle:
        return [row[len(prefix):].strip() for row in handle if row.startswith(prefix)]

def extract_tails_from_test(test_file, num_rows=55):
    """Read the tail column from the leading rows of a triple file.

    Args:
        test_file: Path of a tab-separated file whose third field is the tail.
        num_rows: Number of leading lines to inspect; lines with fewer than
            three fields still count toward this limit.

    Returns:
        A list, in file order, of the third field of every inspected line
        that has at least three fields.
    """
    collected = []
    with open(test_file, 'r', encoding='utf-8') as handle:
        for row_idx, raw in enumerate(handle):
            if row_idx >= num_rows:
                break
            fields = raw.strip().split('\t')
            if len(fields) < 3:
                continue
            collected.append(fields[2])
    return collected

def match_tails(predict_tails, test_tails):
    """Return the test tails that also appear among the predicted tails.

    Args:
        predict_tails: Container of predicted tails, queried with ``in``.
        test_tails: Iterable of expected tails.

    Returns:
        A list of the items of ``test_tails`` found in ``predict_tails``, in
        ``test_tails`` order; repeated test tails are kept as repeats.
    """
    return list(filter(lambda tail: tail in predict_tails, test_tails))

# Extract tails from predict.txt
predict_tails = extract_tails_from_predict('predict.txt')

# Extract tails from the first 55 rows (the default num_rows) of
# test_replaced1.tsv. The path is relative, like every other data file in this
# notebook, so the cell does not depend on an absolute /content/ directory.
test_tails = extract_tails_from_test('test_replaced1.tsv')

# Match tails between predict.txt and test_replaced1.tsv
matched_tails = match_tails(predict_tails, test_tails)

# Output the matched tails
print("Matched Tails:", matched_tails)

In [None]:
def extract_tails_from_predict(predict_file):
    """Collect the value of every ``Tail: `` line in a prediction file.

    Args:
        predict_file: Path of the text file to scan.

    Returns:
        A list with one entry per line starting with ``'Tail: '``: the rest
        of that line with leading and trailing whitespace removed. Entries
        keep the order in which the lines appear.
    """
    marker = 'Tail: '
    skip = len(marker)
    found = []
    with open(predict_file, 'r', encoding='utf-8') as src:
        for entry in src:
            if not entry.startswith(marker):
                continue
            found.append(entry[skip:].strip())
    return found

def extract_tails_from_test(test_file, num_rows=55):
    """Extract the tail column from a tab-separated triple file.

    Args:
        test_file: Path of the file to read; the third tab-separated field of
            each line is taken as the tail.
        num_rows: Maximum number of leading lines to read (lines with fewer
            than three fields count toward the limit). ``None`` reads the
            whole file, matching how ``replace_in_file`` treats
            ``max_lines=None``.

    Returns:
        A list of tails in file order, one per line that has at least three
        fields.
    """
    tails = []
    with open(test_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Guarded so num_rows=None means "no limit" rather than raising
            # TypeError on `i >= None`.
            if num_rows is not None and i >= num_rows:
                break
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                tail_entity = parts[2]
                tails.append(tail_entity)
    return tails

def match_tails(predict_tails, test_tails):
    """Return the test tails that also occur among the predicted tails.

    Args:
        predict_tails: Collection of predicted tails.
        test_tails: Iterable of expected tails.

    Returns:
        A list of the items of ``test_tails`` that are present in
        ``predict_tails``, in ``test_tails`` order (repeats are kept).
    """
    try:
        # Hash lookups make each membership check O(1) instead of a scan of
        # the whole prediction list for every test tail.
        predicted = set(predict_tails)
        return [test_tail for test_tail in test_tails if test_tail in predicted]
    except TypeError:
        # Some item is unhashable: fall back to the plain containment scan.
        return [test_tail for test_tail in test_tails if test_tail in predict_tails]

def calculate_accuracy(matched_tails, total_tails):
    """Express the number of matched tails as a percentage of the total.

    Args:
        matched_tails: Sized collection of matched tails; only its length
            is used.
        total_tails: Total number of tails to compare against.

    Returns:
        ``len(matched_tails) / total_tails * 100``, or ``0.0`` when
        ``total_tails`` is zero (avoiding a division by zero).
    """
    return 0.0 if total_tails == 0 else (len(matched_tails) / total_tails) * 100

# Extract the text after each "Tail: " prefix in predict.txt
predict_tails = extract_tails_from_predict('predict.txt')

# Extract tails (third tab-separated field) from the first 55 rows of
# test_replaced1.tsv; 55 is the default num_rows
test_tails = extract_tails_from_test('test_replaced1.tsv')

# Keep the test tails that also occur in predict_tails
# NOTE(review): the match is by membership anywhere in predict_tails, not row
# by row - confirm this is the intended definition of a correct prediction.
matched_tails = match_tails(predict_tails, test_tails)

# Calculate accuracy as the percentage of test tails that were matched
accuracy = calculate_accuracy(matched_tails, len(test_tails))

# Output the matched tails and accuracy
print("Matched Tails:", matched_tails)
print(f"Accuracy: {accuracy:.2f}%")