In [None]:
# Fetch the ColBERT sources into /content/ColBERT, the location the later cells hard-code.
# The directory test makes the cell safe to re-run: git refuses to clone into an existing,
# non-empty directory, so an unguarded clone fails on every run after the first.
!test -d /content/ColBERT || git clone https://github.com/stanford-futuredata/ColBERT.git /content/ColBERT

In [None]:
# Make the cloned repository the working directory.
# The explicit %cd magic with an absolute path keeps the cell re-runnable: a relative
# `cd ColBERT` fails when the kernel is already inside the repository.
%cd /content/ColBERT

In [None]:
# Install ujson with the %pip magic so the package lands in the environment of the
# running kernel rather than whichever `pip` happens to be first on the shell PATH.
%pip install ujson

In [None]:
# NOTE(review): the retrieval library used by this notebook is installed by the
# `colbert-ai[...]` cell below. The PyPI distribution named plain `colbert` is presumably
# an unrelated project that could shadow the `colbert` import — confirm whether this
# install is needed at all before keeping it.
!pip install colbert

In [None]:
# Install ColBERT with its torch and faiss-gpu extras.
# The requirement is quoted because square brackets are glob characters to the shell;
# %pip installs into the environment of the running kernel.
%pip install "colbert-ai[torch,faiss-gpu]"

In [None]:
# Remove any previously extracted checkpoint directory before re-extracting it.
# -f keeps the cell from failing on a fresh runtime where the directory does not exist yet.
!rm -rf /content/colbertv2.0

In [None]:
# Unpack the gzip-compressed checkpoint archive into /content/, listing each extracted file.
# Assumes the archive is already present at this path; none of the visible cells fetch it.
!tar --extract --verbose --gzip --file /content/colbertv2.0.tar.gz --directory /content/

In [None]:
# Convert the training dataset to ColBERT collection format.
# Input: a text file with one sentence per line; blank lines are skipped.
# Output: a TSV file with one `id<TAB>sentence` row per sentence, ids numbered from 0.
with open('/content/filtered_sentences_only_impacts_traind_dev.txt', 'r', encoding='utf-8') as file:
    # Tab is the column separator of the output file, so a tab inside a sentence would
    # add a third column to its row; such tabs are replaced by a single space.
    training_sentences = [
        stripped.replace('\t', ' ')
        for stripped in (line.strip() for line in file)
        if stripped
    ]

# Write the training data in collection format, with the format: id\tSentence
collection_path = "/content/train_impacts_collection.tsv"
with open(collection_path, 'w', encoding='utf-8') as f:
    f.writelines(f'{idx}\t{sentence}\n' for idx, sentence in enumerate(training_sentences))


In [None]:
# Convert the test dataset to a ColBERT queries file.
# Input: a text file with one sentence per line; blank lines are skipped.
# Output: a TSV file with one `id<TAB>sentence` row per sentence, ids numbered from 0.
with open('/content/only_sentences_impacts_test.txt', 'r', encoding='utf-8') as file:
    # Tab is the column separator of the output file, so a tab inside a sentence would
    # add a third column to its row; such tabs are replaced by a single space.
    test_sentences = [
        stripped.replace('\t', ' ')
        for stripped in (line.strip() for line in file)
        if stripped
    ]

# Write the test data in the same two-column format: id\tSentence
query_path = "/content/queries.tsv"
with open(query_path, 'w', encoding='utf-8') as f:
    f.writelines(f'{idx}\t{sentence}\n' for idx, sentence in enumerate(test_sentences))

In [None]:
import logging
import sys

# Set logging format and output level
#logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Enable more detailed logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)


from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Indexer

if __name__ == '__main__':
    # Build the index for the training collection inside a ColBERT run context.
    run = Run()
    run_settings = RunConfig(
        nranks=1,
        experiment="my_experiment",
        avoid_fork_if_possible=True,
    )

    with run.context(run_settings):
        # 2-bit compression; experiment output files go under the given root directory.
        config = ColBERTConfig(root="/content/ColBERT/experiments", nbits=2)

        # The indexer is created from the model checkpoint directory and the settings above.
        indexer = Indexer(config=config, checkpoint="/content/colbertv2.0/")

        # Index the id<TAB>sentence collection file under the given index name.
        indexer.index(
            collection="/content/train_impacts_collection.tsv",
            name="my_train_impacts_collection.nbits=2",
        )


In [None]:
from colbert.data import Queries
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Searcher
import os

# For every test query, retrieve the top-k most similar sentences from each per-label
# collection index and write all matches, grouped by query, to one text report.

# Step 1: Configure the runtime environment and ColBERT settings
with Run().context(RunConfig(nranks=1, experiment="my_experiment")):

    config = ColBERTConfig(
        root="/content/ColBERT/experiments",  # Path for experiment results
    )

    # Step 2: Load test sentences into Queries
    queries = Queries('/content/queries.tsv')
    total_queries = len(queries.items())  # computed once instead of on every iteration

    top_k = 10  # Number of sentences to retrieve from each collection file

    # Define the directory containing training set files
    labels_directory = '/content/labels_collections'

    # Step 3: Create one Searcher per collection file, once, before the query loop.
    # Constructing a Searcher inside the query loop would repeat the same setup for
    # every (query, collection) pair although it does not depend on the query.
    # os.listdir returns entries in arbitrary order, so the listing is sorted to make
    # the order of the report reproducible.
    # NOTE(review): every Searcher stays alive until the loop finishes; if there are
    # enough collections for that to exhaust memory, iterate collections in the outer
    # loop instead.
    searchers = {}
    for filename in sorted(os.listdir(labels_directory)):
        file_path = os.path.join(labels_directory, filename)

        # Only regular .tsv files are collection files; skip everything else.
        if not (os.path.isfile(file_path) and filename.endswith('.tsv')):
            continue

        # Derive the index name from the file name, e.g., my_label1.nbits=2
        index_name = f"my_{filename.split('.')[0]}.nbits=2"
        searchers[filename] = Searcher(index=f"/content/ColBERT/experiments/my_experiment/indexes/{index_name}", config=config)

    results = []

    # Step 4: Perform retrieval for each query sentence
    for qid, query_text in queries.items():
        all_retrieved_sentences = []  # Similar sentences gathered from all collections

        for filename, searcher in searchers.items():
            # Perform retrieval for the current query sentence
            pids, ranks, scores = searcher.search(query_text, k=top_k)

            # Look up the text of each hit and tag it with its source collection file
            all_retrieved_sentences.extend(
                f"{searcher.collection[pid]} (from {filename})" for pid in pids[:top_k]
            )

        # The header count is based on the collections actually searched, not on every
        # entry of the directory (which may also hold non-collection files).
        results.append(f"Input Sentence: {query_text}\nTop-{top_k * len(searchers)} Similar Sentences:\n" + "\n".join(all_retrieved_sentences) + "\n\n")

        print(f"Processed test sentence {qid + 1}/{total_queries}")

    # Step 5: Save the results to a file
    output_test_file_path = '/content/Impacts_test_sentences_with_top_10_similar_colbert.txt'
    with open(output_test_file_path, 'w', encoding='utf-8') as output_test_file:
        output_test_file.writelines(results)

    print(f"Results saved to {output_test_file_path}")
