In [None]:
!git clone https://github.com/UKPLab/sentence-transformers

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install -U sentence-transformers

Before running this notebook, run the following notebooks in order:

1. BIO to json.ipynb

2. json to only certain label sentences.ipynb

3. extract only sentence from tokens.ipynb

Sentence-BERT (SBERT) retrieval

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Load the pretrained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 1: Load the test sentences (one per line; blank lines are dropped).
# The encoding is explicit so the read does not depend on the platform default.
with open('/content/only_sentences_impacts_test.txt', 'r', encoding='utf-8') as test_file:
    test_sentences = [line.strip() for line in test_file if line.strip()]

# Encode all test sentences in one call
test_embeddings = model.encode(test_sentences)

# Number of sentences retrieved from EACH training file
top_k = 5

# Directory containing the train files (labels directory)
labels_directory = '/content/labels'

# Step 2: Pre-compute embeddings for all training files
train_embeddings = {}
train_sentences_dict = {}

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# order of the retrieved sentences in the output differ between runs/machines.
for filename in sorted(os.listdir(labels_directory)):
    file_path = os.path.join(labels_directory, filename)
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as train_file:
        training_sentences = [line.strip() for line in train_file if line.strip()]

    # A file without sentences would yield an empty embedding matrix and make
    # cosine_similarity raise later on, so it is skipped up front.
    if not training_sentences:
        print(f"Skipping {filename}: no sentences found")
        continue

    # Encode the training sentences of the current file and keep both the
    # embeddings and the sentences for the retrieval step below
    training_embeddings = model.encode(training_sentences)
    train_embeddings[filename] = training_embeddings
    train_sentences_dict[filename] = training_sentences

# The output file is opened in write mode (overwritten on every run); results are
# written one test sentence at a time. The name is derived from top_k so that it
# stays correct when top_k is changed.
output_test_file_path = f'test_sentences_impacts_with_top_{top_k}_similar.txt'
with open(output_test_file_path, 'w', encoding='utf-8') as output_test_file:

    # Step 3: For each test sentence, find the top-k most similar sentences from each train file
    for idx, test_sentence in enumerate(test_sentences):
        # Row vector (1, dim) expected by cosine_similarity
        test_embedding = test_embeddings[idx].reshape(1, -1)

        # Top-k similar sentences from all files for the current test sentence
        all_retrieved_sentences = []

        # Step 4: Iterate through the precomputed embeddings of each training file
        for filename, training_embeddings in train_embeddings.items():
            training_sentences = train_sentences_dict[filename]

            # Step 5: Cosine similarity between the test sentence and every sentence of this file
            similarities = cosine_similarity(test_embedding, training_embeddings)

            # Step 6: Indices of the top-k scores, best first (fewer if the file is shorter than k)
            top_k_indices = np.argsort(similarities[0])[::-1][:top_k]

            # Tag each retrieved sentence with the file it came from
            retrieved_sentences = [f"{training_sentences[i]} (from {filename})" for i in top_k_indices]
            all_retrieved_sentences.extend(retrieved_sentences)

        # Step 7: Combine all retrieved sentences into the desired output format
        formatted_output = f"Input Sentence: {test_sentence}\nTop-{top_k} Similar Sentences from labels:\n" + "\n".join(all_retrieved_sentences) + "\n\n"

        # Write the formatted output for the current test sentence to the output file
        output_test_file.write(formatted_output)

        # Print progress
        print(f"Processed test sentence {idx + 1}/{len(test_sentences)}")

print(f"Results saved to {output_test_file_path}")


TF-IDF retrieval


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Step 1: Load the test sentences (one per line; blank lines are dropped).
# The encoding is explicit so the read does not depend on the platform default.
with open('/content/only_sentences_impacts_test.txt', 'r', encoding='utf-8') as test_file:
    test_sentences = [line.strip() for line in test_file if line.strip()]

# Number of sentences retrieved from EACH training file
top_k = 10

# Directory containing the train files (labels directory)
labels_directory = '/content/labels'

# Step 2: Load all training sentences from all files
all_training_sentences = []
train_sentences_dict = {}

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# order of the retrieved sentences in the output differ between runs/machines.
for filename in sorted(os.listdir(labels_directory)):
    file_path = os.path.join(labels_directory, filename)
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as train_file:
        training_sentences = [line.strip() for line in train_file if line.strip()]

    # A file without sentences would produce a 0-row matrix and make
    # cosine_similarity raise later on, so it is skipped up front.
    if not training_sentences:
        print(f"Skipping {filename}: no sentences found")
        continue

    all_training_sentences.extend(training_sentences)
    train_sentences_dict[filename] = training_sentences

# Fail early with a clear message instead of the vectorizer's "empty vocabulary" error
if not all_training_sentences:
    raise ValueError(f"No training sentences found in {labels_directory}")

# Step 3: Initialize the TF-IDF vectorizer and fit it on all training sentences,
# so every file is embedded in the same vocabulary / IDF space
vectorizer = TfidfVectorizer()
vectorizer.fit(all_training_sentences)

# Step 4: Transform the training sentences from each file and store their vectors
train_embeddings = {}

for filename, training_sentences in train_sentences_dict.items():
    training_embeddings = vectorizer.transform(training_sentences)
    train_embeddings[filename] = training_embeddings

# Step 5: Transform the test sentences using the same vectorizer
test_embeddings = vectorizer.transform(test_sentences)

# The output file is opened in write mode (overwritten on every run); results are
# written one test sentence at a time. The name is derived from top_k so that it
# stays correct when top_k is changed.
output_test_file_path = f'Impacts_test_sentences_with_top_{top_k}_similar_tf_idf.txt'
with open(output_test_file_path, 'w', encoding='utf-8') as output_test_file:

    # Step 6: For each test sentence, find the top-k most similar sentences from each train file
    for idx, test_sentence in enumerate(test_sentences):
        # TF-IDF row vector for the current test sentence
        test_embedding = test_embeddings[idx].reshape(1, -1)

        # Top-k similar sentences from all files for the current test sentence
        all_retrieved_sentences = []

        # Step 7: Iterate through the precomputed vectors of each training file
        for filename, training_embeddings in train_embeddings.items():
            training_sentences = train_sentences_dict[filename]

            # Step 8: Cosine similarity between the test sentence and every sentence of this file
            similarities = cosine_similarity(test_embedding, training_embeddings)

            # Step 9: Indices of the top-k scores, best first (fewer if the file is shorter than k)
            top_k_indices = np.argsort(similarities[0])[::-1][:top_k]

            # Tag each retrieved sentence with the file it came from
            retrieved_sentences = [f"{training_sentences[i]} (from {filename})" for i in top_k_indices]
            all_retrieved_sentences.extend(retrieved_sentences)

        # Step 10: Combine all retrieved sentences into the desired output format
        formatted_output = f"Input Sentence: {test_sentence}\nTop-{top_k} Similar Sentences from labels:\n" + "\n".join(all_retrieved_sentences) + "\n\n"

        # Write the formatted output for the current test sentence to the output file
        output_test_file.write(formatted_output)

        # Print progress
        print(f"Processed test sentence {idx + 1}/{len(test_sentences)}")

print(f"Results saved to {output_test_file_path}")


LSA (Latent Semantic Analysis)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Load the training sentences (one per line; blank lines are dropped).
# The encoding is explicit so the read does not depend on the platform default.
with open('/content/filtered_sentences_only_impacts_traind_dev.txt', 'r', encoding='utf-8') as file:
    training_sentences = [line.strip() for line in file if line.strip()]

# Step 2: Load the test sentences from the provided file
with open('/content/only_sentences_impacts_test.txt', 'r', encoding='utf-8') as test_file:
    test_sentences = [line.strip() for line in test_file if line.strip()]

# Step 3: Initialize the TF-IDF vectorizer and fit it to the training sentences
vectorizer = TfidfVectorizer()
training_tfidf = vectorizer.fit_transform(training_sentences)

# Step 4: Apply Latent Semantic Analysis (LSA) using Truncated SVD on the TF-IDF matrix.
# - The requested 100 dimensions are capped at the vocabulary size, because
#   TruncatedSVD rejects n_components larger than the number of features.
# - random_state pins the randomized solver so repeated runs retrieve the same sentences.
n_components = min(100, training_tfidf.shape[1])  # Number of latent semantic dimensions (this can be tuned)
svd = TruncatedSVD(n_components=n_components, random_state=42)
training_lsa = svd.fit_transform(training_tfidf)

# Step 5: Transform the test sentences using the same vectorizer and SVD model
test_tfidf = vectorizer.transform(test_sentences)
test_lsa = svd.transform(test_tfidf)

# Set the value of k for top-k retrieval
top_k = 10

# One formatted text block per test sentence
results = []

# Step 6: For each test sentence, calculate cosine similarity with the training sentences
for i, input_sentence in enumerate(test_sentences):
    # Similarity of the current test sentence to every training sentence
    similarities = cosine_similarity(test_lsa[i].reshape(1, -1), training_lsa)

    # Indices of the top-k scores, best first
    top_k_indices = np.argsort(similarities[0])[::-1][:top_k]

    # Fetch the top-k most similar sentences from the training set
    retrieved_sentences = [training_sentences[idx] for idx in top_k_indices]

    # Store the input sentence and its retrieved top-k similar sentences
    results.append(f"Input Sentence: {input_sentence}\nTop-{top_k} Similar Sentences:\n" + "\n".join(retrieved_sentences) + "\n\n")

# Step 7: Write the results to a new file (name derived from top_k so it stays accurate)
output_test_file_path = f'test_sentences_with_top_{top_k}_similar_LSA.txt'
with open(output_test_file_path, 'w', encoding='utf-8') as output_test_file:
    output_test_file.writelines(results)

print(f"Results saved to {output_test_file_path}")


Dense Passage Retrieval (DPR)

In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# torch is needed for torch.no_grad(); the tokenizers below already return
# PyTorch tensors (return_tensors="pt"), so it is an existing requirement.
import torch

# Step 1: Load the pretrained DPR model and tokenizer for context and question encoding
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# Step 2: Load the training sentences (one per line; blank lines are dropped)
with open('/content/filtered_sentences_only_impacts_traind_dev.txt', 'r', encoding='utf-8') as file:
    training_sentences = [line.strip() for line in file if line.strip()]

# np.vstack would fail with an opaque message on an empty list, so fail early and clearly
if not training_sentences:
    raise ValueError("No training sentences found; nothing to retrieve from")

# Step 3: Encode the training sentences using the DPR context encoder.
# no_grad(): this is inference only, so no autograd graph needs to be built.
training_embeddings = []
with torch.no_grad():
    for sentence in training_sentences:
        inputs = context_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        embeddings = context_model(**inputs).pooler_output
        training_embeddings.append(embeddings.numpy())

# Stack the per-sentence (1, dim) arrays into one (n_sentences, dim) matrix
training_embeddings = np.vstack(training_embeddings)

# Step 4: Load the test sentences from the provided file
with open('/content/only_sentences_impacts_test.txt', 'r', encoding='utf-8') as test_file:
    test_sentences = [line.strip() for line in test_file if line.strip()]

# Set the value of k for top-k retrieval
top_k = 10

# One formatted text block per test sentence
results = []

# Step 5: For each test sentence, encode it using the DPR question encoder
with torch.no_grad():
    for input_sentence in test_sentences:
        inputs = question_tokenizer(input_sentence, return_tensors="pt", padding=True, truncation=True)
        input_embedding = question_model(**inputs).pooler_output.numpy()

        # Step 6: Cosine similarity between the input sentence and all training sentences
        similarities = cosine_similarity(input_embedding, training_embeddings)

        # Step 7: Indices of the top-k scores, best first
        top_k_indices = np.argsort(similarities[0])[::-1][:top_k]

        # Fetch the top-k most similar sentences from the training set
        retrieved_sentences = [training_sentences[idx] for idx in top_k_indices]

        # Store the input sentence and its retrieved top-k similar sentences
        results.append(f"Input Sentence: {input_sentence}\nTop-{top_k} Similar Sentences:\n" + "\n".join(retrieved_sentences) + "\n\n")

# Step 8: Write the results to a new file. UTF-8 is explicit because the
# matching step further down reads its retrieval file as UTF-8.
output_test_file_path = f'Impacts_test_sentences_with_top_{top_k}_similar_dpr.txt'
with open(output_test_file_path, 'w', encoding='utf-8') as output_test_file:
    output_test_file.writelines(results)

print(f"Results saved to {output_test_file_path}")




Steps:

  1. For each test sentence, find the top-k most similar sentences.
  
  2. Convert each of the top-k similar sentences to its tokens-and-labels format.
  
  3. Insert the converted examples into the prompt.



In [None]:
## check if any of the similar sentences are smaller than k
def check_sentences(file_path, k=10):
    """Print every input sentence that has fewer than ``k`` similar sentences.

    The file is expected to consist of groups shaped like::

        Input Sentence: <sentence>
        Top-<k> Similar Sentences...:
        <similar sentence 1>
        ...
        <blank line>

    A group ends at a blank line, at the next "Input Sentence:" line, or at
    the end of the file, so the last group is checked even when the file has
    no trailing blank line. Groups with zero similar sentences are reported too.

    Args:
        file_path: Path of the UTF-8 retrieval-results file to check.
        k: Expected number of similar sentences per input sentence. It is also
            used to recognise the "Top-<k> Similar Sentences" header line,
            with or without a suffix such as " from labels:".

    Returns:
        None. Findings are printed, one line per incomplete group.
    """
    header_prefix = f"Top-{k} Similar Sentences"

    # Each entry is [input sentence, number of similar sentences counted so far]
    groups = []
    in_group = False

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()

            if line.startswith("Input Sentence:"):
                groups.append([line.replace("Input Sentence:", "").strip(), 0])
                in_group = True
            elif not line:
                # A blank line terminates the current group
                in_group = False
            elif in_group and not line.startswith(header_prefix):
                # Any other non-empty line inside a group is a similar sentence
                groups[-1][1] += 1

    for input_sentence, similar_sentences_count in groups:
        if similar_sentences_count < k:
            print(f"Input sentence with less than {k} similar sentences: {input_sentence}")

# Retrieval-results file to validate.
# NOTE(review): no visible cell in this notebook writes a "*_colbert.txt" file, so this
# input presumably comes from a separate run — confirm it exists before executing.
file_path = '/content/impacts_test_sentences_with_top_10_similar_colbert.txt'

# Print every input sentence whose group of similar sentences is incomplete
check_sentences(file_path)


In [None]:
import json
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Input / output locations
json_file_path = '/content/train_dev'
# NOTE(review): the DPR cell above writes 'Impacts_test_sentences_with_top_10_similar_dpr.txt',
# which does not match this name — confirm which retrieval file is meant to be read here.
test_file_path = '/content/test_sentences_with_top_10_similar_dpr.txt'
output_file_path = 'Impacts_output_with_matches_top_10_similar_dpr.txt'

# Load the JSON-lines data: one JSON object per line, each providing "text"
# (joined with spaces below, presumably a token list) and "label".
# Blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
with open(json_file_path, 'r', encoding='utf-8') as f:
    json_data = [json.loads(line) for line in f if line.strip()]

# Load pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Lower-cased sentence text for every record, embedded once up front
json_texts = [" ".join(item["text"]).lower() for item in json_data]
json_embeddings = model.encode(json_texts, convert_to_tensor=True)

# Cache: similar sentence -> result tuple returned by find_best_match
best_match_cache = {}

# Function to find the best match based on similarity threshold
def find_best_match(similar_sentence, json_data, json_embeddings, model, threshold=0.5):
    """Find the record in ``json_data`` that is most similar to ``similar_sentence``.

    Args:
        similar_sentence: Sentence text to look up (lower-cased before encoding).
        json_data: Sequence of records with "text" and "label" entries.
        json_embeddings: Embeddings of the records, row ``i`` belonging to
            ``json_data[i]``.
        model: Encoder exposing ``encode(text, convert_to_tensor=True)``.
        threshold: Minimum cosine similarity for a record to count as a match.

    Returns:
        ``(tokens, labels, matched_text, similarity)`` for the best-scoring
        record when its score is at least ``threshold``; otherwise
        ``(None, None, None, None)``. ``matched_text`` is the record's "text"
        joined with spaces and lower-cased.

    Results are memoised in the module-level ``best_match_cache``, keyed by
    ``similar_sentence`` only. NOTE(review): a cached result is therefore
    reused even if a later call passes a different ``threshold``.
    """
    # Check if similar_sentence already exists in the cache
    if similar_sentence in best_match_cache:
        return best_match_cache[similar_sentence]

    similar_embedding = model.encode(similar_sentence.lower(), convert_to_tensor=True)

    # Cosine similarity with all json_embeddings (1-D tensor, one score per record)
    similarities = util.pytorch_cos_sim(similar_embedding, json_embeddings)[0]

    result = (None, None, None, None)

    # Only the top-scoring record can be the first one above the threshold, so
    # a single argmax replaces sorting and scanning every score. The numel()
    # guard covers an empty corpus, for which argmax is undefined.
    if similarities.numel() > 0:
        best_index = int(similarities.argmax())
        best_score = similarities[best_index].item()
        if best_score >= threshold:
            record = json_data[best_index]
            # Rebuilt from the record itself instead of relying on a separate
            # module-level list staying aligned with json_data
            best_match = " ".join(record["text"]).lower()
            result = (record["text"], record["label"], best_match, best_score)

    # Cache the outcome (including "no match") so repeated sentences are not re-encoded
    best_match_cache[similar_sentence] = result
    return result

# Read the retrieval results first, so a missing or unreadable input is
# detected before the output file is created
with open(test_file_path, 'r', encoding='utf-8') as test_file:
    lines = [line.strip() for line in test_file]

# Group the lines into (input sentence, [similar sentences]) by their markers
# instead of assuming every group is exactly 13 lines long: a group with fewer
# than 10 similar sentences would otherwise shift all later groups or raise
# IndexError.
retrieval_groups = []
expect_header = False
for line in lines:
    if line.startswith("Input Sentence:"):
        retrieval_groups.append((line[len("Input Sentence:"):].strip(), []))
        expect_header = True
        continue

    if expect_header:
        expect_header = False
        # The line right after the input sentence is the "Top-k Similar Sentences" header
        if line.startswith("Top-"):
            continue

    # Remaining non-empty lines belong to the most recent input sentence
    if line and retrieval_groups:
        retrieval_groups[-1][1].append(line)

# Process the test sentences and similar sentences
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for original_sentence, similar_sentences in retrieval_groups:
        output_file.write(f"Original Sentence: {original_sentence}\n")

        # Process each similar sentence and find the first best match
        for similar_sentence in similar_sentences:
            tokens, labels, match, similarity = find_best_match(similar_sentence, json_data, json_embeddings, model)
            if tokens and labels:
                output_file.write(f"Best Match: {match} (Similarity: {similarity:.2f})\n")
                output_file.write(f"Tokens: {tokens}\n")
                output_file.write(f"Labels: {labels}\n\n")
            else:
                output_file.write(f"No match found for: {similar_sentence}\n\n")

        # Visual separator between input sentences
        output_file.write("\n" + "="*50 + "\n\n")

print(f"Output saved to {output_file_path}")


In [None]:
import json

# File paths
test_texts4_file = '/content/test_texts.txt'
output_file_path = '/content/Impacts_output_with_matches_top_10_similar_dpr.txt'
# Same file as output_file_path: the matches file is rewritten in place
final_output_path = '/content/Impacts_output_with_matches_top_10_similar_dpr.txt'

# Load the token strings: line i replaces the i-th "Original Sentence:" line
with open(test_texts4_file, 'r', encoding='utf-8') as f:
    tokens_data = [line.strip() for line in f]  # Read each line as a string of tokens

# Load the whole matches file into memory before it is overwritten below
with open(output_file_path, 'r', encoding='utf-8') as f:
    output_lines = f.readlines()

# Validate BEFORE opening the file for writing: opening with 'w' truncates it,
# so running out of token lines half-way would leave the file damaged.
original_sentence_count = sum(1 for line in output_lines if line.startswith("Original Sentence:"))
if original_sentence_count > len(tokens_data):
    raise ValueError(
        f"{output_file_path} has {original_sentence_count} 'Original Sentence:' lines "
        f"but {test_texts4_file} only provides {len(tokens_data)} token lines"
    )
if original_sentence_count < len(tokens_data):
    print(
        f"Warning: only the first {original_sentence_count} of {len(tokens_data)} "
        f"token lines will be used"
    )

# Replace original sentences with tokens and write the result back
token_idx = 0
with open(final_output_path, 'w', encoding='utf-8') as final_output:
    for line in output_lines:
        if line.startswith("Original Sentence:"):
            # Replace the original sentence with the corresponding tokens from tokens_data
            final_output.write(f"Original Sentence: {tokens_data[token_idx]}\n\n")
            token_idx += 1  # Move to the next set of tokens
        else:
            # Keep the rest of the lines unchanged
            final_output.write(line)

print(f"Processed file saved to {final_output_path}")


In [None]:
import re

# Format the Best Match text
def format_best_match_text(data):
    """Pair every token with its cleaned label as a quoted ``'token-label'`` string.

    ``data`` must provide 'Tokens' and 'Labels'. A label is cleaned by turning
    underscores into spaces and stripping surrounding single/double quotes.
    One string is produced per token; labels beyond the number of tokens are
    ignored, and a missing label raises IndexError.
    """
    words = data['Tokens']
    tags = data['Labels']

    # An 'O' label cleans to 'O', so one formatting rule covers every label
    return [
        "'{}-{}'".format(word, tags[position].replace('_', ' ').strip("'\""))
        for position, word in enumerate(words)
    ]

# Extract the Best Match section from the file
def extract_best_match_from_file(file_content):
    """Collect the tokens and labels of every "Best Match" block in ``file_content``.

    A block runs from "Best Match:" to the closing bracket of the next
    "Labels: [...]" and is expected to contain a "Tokens: [...]" line holding
    a Python list literal.

    Args:
        file_content: Full text of the matches file.

    Returns:
        A list of ``{'Tokens': [...], 'Labels': [...]}`` dicts in file order.
        'Tokens' holds the token strings. 'Labels' holds the raw label
        fragments exactly as they appear in the file (quotes included), so
        that joining them with ", " reproduces the original label text.
    """
    import ast  # local import: only this function needs it

    matches = []
    match_blocks = re.findall(r"(Best Match:[\s\S]+?Labels: \[(.*?)\])", file_content)

    for block, label_string in match_blocks:
        # Capture the whole bracketed list up to the end of the line, so tokens
        # that themselves contain ']' do not cut the list short.
        tokens_line = re.search(r"^Tokens: (\[.*\])[ \t\r]*$", block, re.MULTILINE)

        if not tokens_line:
            print("Tokens not found, skipping this block")
            continue

        list_literal = tokens_line.group(1)
        try:
            # Parse the list literal properly: tokens containing apostrophes,
            # commas or brackets are preserved as written.
            tokens = [str(token) for token in ast.literal_eval(list_literal)]
        except (ValueError, SyntaxError):
            # Not a valid literal: fall back to the simple quote-strip-and-split parsing
            tokens = list_literal[1:-1].replace("'", "").split(", ")

        labels = label_string.split(", ")
        matches.append({'Tokens': tokens, 'Labels': labels})

    return matches

# Step-by-step replace Labels in the file and write to the output file
def replace_and_write_file_content(input_file_path, output_file_path):
    """Rewrite every "Labels: [...]" list of a Best Match block as token-label pairs.

    The input file is read completely. For each Best Match block found by
    ``extract_best_match_from_file``, the text inside the brackets of its
    "Labels: [...]" entry is replaced with the ``'token-label'`` strings from
    ``format_best_match_text``. All other text is copied unchanged.

    Args:
        input_file_path: Path of the matches file to read.
        output_file_path: Path of the file to write.

    Returns:
        None. Progress and problems are printed.
    """
    # Read the input as UTF-8, the encoding the earlier cells use when writing these files
    with open(input_file_path, 'r', encoding='utf-8') as f:
        file_content = f.read()

    # Extract tokens and labels from the Best Match sections
    best_matches = extract_best_match_from_file(file_content)

    labels_prefix = "Labels: ["

    # Open the output file for step-by-step writing
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        current_pos = 0  # Everything before this offset has already been written

        # Iterate through all extracted Best Match blocks
        for idx, match in enumerate(best_matches):
            try:
                original_labels = ", ".join(match['Labels'])
                formatted_labels = ", ".join(format_best_match_text(match))

                # Search for the labels together with their "Labels: [" prefix and
                # closing bracket, so the same text occurring earlier (e.g. inside
                # the Tokens line) cannot be replaced by mistake.
                anchor = file_content.find(f"{labels_prefix}{original_labels}]", current_pos)
                if anchor == -1:
                    print(f"Original labels not found: {original_labels}")
                    continue
                pos = anchor + len(labels_prefix)

                # Copy everything up to the replacement point, then the new labels
                output_file.write(file_content[current_pos:pos])
                output_file.write(formatted_labels)

                # Continue after the replaced label text
                current_pos = pos + len(original_labels)

                # Print progress information
                print(f"Processed {idx + 1}/{len(best_matches)} 个 Best Match")

            except Exception as e:
                # Best effort: report the failing block and carry on with the next one
                print(f"An error occurred while processing {idx + 1} Best Match: {str(e)}")

        # Write the remaining unprocessed file content
        output_file.write(file_content[current_pos:])

    print(f"Updated file has been written {output_file_path}")

# Input: the matches file rewritten by the previous cell; output: a new file
# in which each Labels list is replaced by token-label pairs
input_file_path = '/content/Impacts_output_with_matches_top_10_similar_dpr.txt'
output_file_path = '/content/Impacts_tokens_with_updated_labels_top_10_dpr.txt'

# Call the function to process and write step by step
replace_and_write_file_content(input_file_path, output_file_path)
