In [1]:
import pickle
import numpy as np
import pandas as pd

# Specify the file path where your embeddings were saved
#input_file = "data/scifact_evidence_embeddings.pkl"

input_file = "data/scifact_evidence_embeddings.pkl"
# Step 1: Load the pickled embeddings from disk
with open(input_file, "rb") as f:
    embeddings = pickle.load(f)

# Now, `embeddings` is a dictionary containing your evidence documents and their corresponding embeddings
# Example: Accessing an embedding
for doc, embedding in embeddings.items():
    doc_id, abstract = doc
    print("------")
    print(f"Document ID: {doc_id}")
    print(f"Abstract: {abstract}")
    print(f"Embedding: {embedding}\n")
    print(len(embedding))
    break
    

------
Document ID: 4983
Abstract: Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the close

In [2]:
import numpy as np
import faiss

# Assuming all embeddings have the same dimension
embedding_dim = len(next(iter(embeddings.values())))
index = faiss.IndexFlatL2(embedding_dim)

# Lists to store document IDs and abstracts
doc_ids = []
abstracts = []

# Add embeddings to the index
for doc, embedding in embeddings.items():
    doc_id, abstract = doc
    doc_ids.append(doc_id)
    abstracts.append(abstract)
    
    # Convert the embedding to a numpy array and reshape it
    embedding_np = np.array(embedding).reshape(1, -1).astype('float32')
    
    # Add the embedding to the index
    index.add(embedding_np)

# Now you have a FAISS index with all your embeddings
print(f"Total vectors in the index: {index.ntotal}")

# To perform a similarity search:
def similarity_search(query_embedding, index, doc_ids, abstracts, k=5):
    query_np = np.array(query_embedding).reshape(1, -1).astype('float32')
    distances, indices = index.search(query_np, k)
    
    results = []
    for i, idx in enumerate(indices[0]):
        results.append({
            'doc_id': doc_ids[idx],
            'abstract': abstracts[idx],
            'distance': distances[0][i]
        })
    
    return results

# Example usage:
query_embedding = embeddings[next(iter(embeddings))]  # Just using the first embedding as an example query
results = similarity_search(query_embedding,index, doc_ids, abstracts, k=5)

print("Search results:")
for result in results:
    print(f"Doc ID: {result['doc_id']}")
    print(f"Abstract: {result['abstract'][:100]}...")  # Printing first 100 characters of the abstract
    print(f"Distance: {result['distance']}")
    print()

# Optionally, save the index to disk
faiss.write_index(index, "your_faiss_index.idx")

# To load the index later:
# loaded_index = faiss.read_index("your_faiss_index.idx")

Total vectors in the index: 5183
Search results:
Doc ID: 4983
Abstract: Alterations of the architecture of cerebral white matter in the developing human brain can affect co...
Distance: 0.0

Doc ID: 19685306
Abstract: This paper proposes and tests a technique for imaging orientationally invariant indices of axon diam...
Distance: 0.7189399600028992

Doc ID: 1472815
Abstract: OBJECTIVE The purpose of our study was to investigate alterations of white matter integrity in adult...
Distance: 0.719285249710083

Doc ID: 1412089
Abstract: BACKGROUND Traditional T2 weighted MR imaging results are non-specific for the extent of underlying ...
Distance: 0.7198256254196167

Doc ID: 22107641
Abstract: OBJECTIVE The purpose of this study was to determine whether microstructural abnormalities in the wh...
Distance: 0.7959116697311401



In [3]:
import pandas as pd
claims_train = pd.read_json('scifact/claims_train.jsonl', lines=True)
# get value where id is 0
print(list(claims_train[claims_train['id'] == 0]['cited_doc_ids']))

[[31715818]]


In [4]:
input_file = "data/scifact_claim_embeddings.pkl"
with open(input_file, "rb") as f:
    claim_embeddings = pickle.load(f)
    
# Now, `claim_embeddings` is a dictionary containing your claim IDs and their corresponding embeddings

query_item = []
for claim_id, embedding in claim_embeddings.items():
    id_num, query = claim_id
    source = list(claims_train[claims_train['id'] == id_num]['cited_doc_ids'])[0]
    query_item.append((source, embedding))


    

In [31]:
def calculate_mAP(index, query_item, doc_ids, abstracts, max_items):
    output = []
    found_count = 0
    for item in query_item:
        source, embedding = item
        results = similarity_search(embedding,index, doc_ids, abstracts, k=max_items)
        source_count = len(source)
        count = 0
        result_count = 0
        precisions = []
        
        for result in results:
            result_count += 1
            if result['doc_id'] in source:
                count += 1
                found_count += 1
            
                precisions.append(count/result_count)
            if count == source_count:
                break
        if len(precisions) == 0:
            output.append(0)
        else:
            output.append(sum(precisions)/len(precisions))
    print(found_count / len(query_item))
    return sum(output)/len(output)
        
                
            

In [32]:
length = [1, 10, 50]
for l in length:
    print("mAP for top", l, "items:", calculate_mAP(index, query_item, doc_ids, abstracts, l))

0.5896168108776267
mAP for top 1 items: 0.5896168108776267
0.969097651421508
mAP for top 10 items: 0.6774373614299446
1.0852904820766378
mAP for top 50 items: 0.6785670461646436


cal

In [14]:
def calculate_MRR(index, query_item, doc_ids, abstracts, max_items):
    output = []
    for item in query_item:
        source, embedding = item
        results = similarity_search(embedding,index, doc_ids, abstracts, k=max_items)
        source_count = len(source)
        count = 0
        result_count = 0
        found = False
        for result in results:
            result_count += 1
            if result['doc_id'] in source:
                output.append(1/result_count)
                found = True
                break
                
        if not found:
            output.append(0)
        
    return sum(output)/len(output)

In [17]:
length = [1, 10, 50]
for l in length:
    print("MRR for top", l, "items:", calculate_MRR(index, query_item, doc_ids, abstracts, l))

MRR for top 1 items: 0.5896168108776267
MRR for top 10 items: 0.6816758883198929
MRR for top 50 items: 0.6858945383332591
