In [29]:
import json
# Open and read the corpus JSON file.
# A forward slash is used as the path separator: the previous backslash formed
# the invalid escape sequence "\c" inside a normal string literal and only
# resolved to a directory separator on Windows. Forward slashes work on every
# platform and match the path style used for train.json later in the notebook.
# JSON text is UTF-8, so decode explicitly instead of relying on the platform's
# default encoding.
with open('Hog RAGger Dataset/corpus.json', 'r', encoding='utf-8') as file:
    # Parsed JSON; iterated below as a sequence of article records.
    corpus_data = json.load(file)

  with open('Hog RAGger Dataset\corpus.json', 'r') as file:


In [30]:
def preprocess_article(article, max_length=512):
    """Lowercase an article body and split it into fixed-size word chunks.

    The only normalisation applied is lowercasing; punctuation and other
    special characters are kept. "Tokens" here are whitespace-separated
    words, not model sub-word tokens.

    Args:
        article: Mapping with a ``'body'`` entry holding the article text.
            A ``None`` body is treated as empty text.
        max_length: Maximum number of words per chunk. Must be a positive
            integer. Defaults to 512.

    Returns:
        A list of chunks in document order, each chunk being a list of
        lowercase word strings. Every chunk except possibly the last holds
        exactly ``max_length`` words. An empty body yields ``[]``.

    Raises:
        KeyError: If ``article`` has no ``'body'`` key.
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        # A negative step would make range() silently yield nothing.
        raise ValueError("max_length must be a positive integer")

    # str.split() with no arguments already discards leading/trailing
    # whitespace, so no separate strip() is needed.
    body = (article['body'] or "").lower()
    tokens = body.split()

    # Slice the word list into consecutive windows of at most max_length words.
    return [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

# Flatten the corpus into one record per chunk: each chunk's words are joined
# back into a single string and paired with the metadata of its parent article.
processed_corpus = [
    {
        "chunk": " ".join(words),
        "title": record['title'],
        "author": record['author'],
        "source": record['source'],
        "published_at": record['published_at'],
        "category": record['category'],
        "url": record['url'],
    }
    for record in corpus_data
    for words in preprocess_article(record)
]


In [31]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

import pickle

# Load the pre-trained sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialize FAISS index.
# The vector width is read from the model rather than hard-coded, so swapping
# the checkpoint above cannot leave the index with a mismatched dimension.
embedding_dim = model.get_sentence_embedding_dimension()
faiss_index = faiss.IndexFlatL2(embedding_dim)

# Metadata is stored in the same order the vectors are added, so
# metadata_store[i] describes the vector at FAISS position i.
metadata_store = [
    {
        "title": doc['title'],
        "author": doc['author'],
        "source": doc['source'],
        "published_at": doc['published_at'],
        "category": doc['category'],
        "url": doc['url'],
        "chunk": doc['chunk'],
    }
    for doc in processed_corpus
]

# Encode all chunks in batches and add them to the index in one call; encoding
# and adding one chunk at a time pays the model/index call overhead per chunk.
if metadata_store:  # nothing to encode or add for an empty corpus
    chunk_embeddings = model.encode(
        [doc['chunk'] for doc in metadata_store],
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    # FAISS expects a C-contiguous float32 matrix of shape (n, embedding_dim).
    faiss_index.add(np.ascontiguousarray(chunk_embeddings, dtype=np.float32))

# Save FAISS index to disk for future use
faiss.write_index(faiss_index, "corpus_faiss.index")

# Save metadata store for future retrieval.
# NOTE: pickle files must only ever be loaded back from a trusted location,
# since unpickling can execute arbitrary code.
with open('metadata_store.pkl', 'wb') as f:
    pickle.dump(metadata_store, f)


In [32]:
query_text = "Who is the individual associated with cryptocurrency fraud?"
query_embedding = model.encode(query_text)

# Search FAISS for top 4 results (the query must be a 2-D batch of one vector)
D, I = faiss_index.search(np.array([query_embedding]), k=4)

# Retrieve metadata for top results
for idx in I[0]:
    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    # Without this guard, -1 would silently select the LAST metadata entry.
    if idx < 0:
        continue
    result = metadata_store[idx]
    print(f"Title: {result['title']}")
    print(f"Author: {result['author']}")
    print(f"Source: {result['source']}")
    print(f"Published At: {result['published_at']}")
    print(f"Body: {result['chunk'][:200]}...")
    print("-" * 50)


Title: The jury finally hears from Sam Bankman-Fried
Author: Elizabeth Lopatto
Source: The Verge
Published At: 2023-10-28T00:12:41+00:00
Body: it is honestly kind of incredible to watch a man torpedo his own credibility on direct testimony. we’re not even at the cross yet, and the judge has already instructed him to answer the question he’s ...
--------------------------------------------------
Title: Sam Bankman-Fried was a terrible boyfriend
Author: Elizabeth Lopatto
Source: The Verge
Published At: 2023-10-10T23:50:21+00:00
Body: i’ve got some shitty ex-boyfriends, but none of them made me the ceo of their sin-eater hedge fund while refusing to give me equity and bragging about how there was a 5 percent chance they’d become th...
--------------------------------------------------
Title: Is Sam Bankman-Fried a bad ‘man’ or a good ‘boy’? Lawyers swap opening statements before first witnesses take the stand
Author: Ben Weiss
Source: Fortune
Published At: 2023-10-04T23:32:53+00:00
Body: 

In [33]:
def aggregate_evidence(results, query, max_fact_chars=200):
    """Build the evidence list for a set of retrieved chunks.

    Each retrieved record is reduced to its citation fields plus a ``"fact"``
    snippet. The snippet is simply the leading characters of the chunk; it is
    not selected by relevance to the query.

    Args:
        results: Iterable of metadata records, each providing the keys
            ``title``, ``author``, ``source``, ``published_at``, ``url`` and
            ``chunk``.
        query: The user query. Currently unused; kept so existing callers
            that pass it keep working.
        max_fact_chars: Maximum number of characters of the chunk to keep in
            ``"fact"``. ``None`` keeps the whole chunk. Defaults to 200.

    Returns:
        A list with one evidence dict per input record, in input order, with
        keys ``title``, ``author``, ``source``, ``published_at``, ``url`` and
        ``fact``.

    Raises:
        ValueError: If ``max_fact_chars`` is negative.
    """
    if max_fact_chars is not None and max_fact_chars < 0:
        # A negative slice bound would trim from the end instead of capping.
        raise ValueError("max_fact_chars must be non-negative or None")

    return [
        {
            "title": result['title'],
            "author": result['author'],
            "source": result['source'],
            "published_at": result['published_at'],
            "url": result['url'],
            "fact": result['chunk'][:max_fact_chars],
        }
        for result in results
    ]


In [34]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# The tokenizer and the generation model are both restored from the same
# 't5-small' checkpoint (tokenizer first, then the model weights).
t5_tokenizer, t5_model = (
    T5Tokenizer.from_pretrained('t5-small'),
    T5ForConditionalGeneration.from_pretrained('t5-small'),
)

def generate_answer(evidence_list, query):
    """Generate a short answer to ``query`` from the given evidence with T5.

    The ``"fact"`` strings of all evidence items are joined with spaces and
    placed after the query in a ``question: ... context: ...`` prompt. The
    prompt is tokenized with truncation at 512 tokens, and the answer is
    decoded from a 4-beam search limited to 50 tokens.

    Args:
        evidence_list: Sequence of dicts that each contain a ``"fact"`` string.
        query: The question text.

    Returns:
        The decoded answer string with special tokens removed.
    """
    facts = " ".join(item['fact'] for item in evidence_list)
    prompt = f"question: {query} context: {facts}"

    # Encode the prompt as a batch of one PyTorch tensor.
    prompt_ids = t5_tokenizer.encode(
        prompt,
        max_length=512,
        truncation=True,
        return_tensors='pt',
    )

    # Beam search; only the first returned sequence is decoded.
    generated = t5_model.generate(
        prompt_ids,
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

def format_output(query, answer, evidence_list):
    """Package a query, its answer and the supporting evidence into one dict.

    Args:
        query: The question text.
        answer: The answer produced for the query.
        evidence_list: The evidence records backing the answer; stored as-is.

    Returns:
        A dict with keys ``query``, ``answer``, ``question_type`` and
        ``evidence_list``. ``question_type`` is always ``"inference_query"``.
    """
    return dict(
        query=query,
        answer=answer,
        question_type="inference_query",  # fixed value, as per the problem definition
        evidence_list=evidence_list,
    )

In [35]:
# Step 1: Encode the query
query_text = "Which individual is implicated in both inflating the value of a Manhattan apartment to a figure not yet achieved in New York City's real estate history, according to 'Fortune', and is also accused of adjusting this apartment's valuation to compensate for a loss in another asset's worth, as reported by 'The Age'?"
query_embedding = model.encode(query_text)

# Step 2: Search FAISS for top 4 results
D, I = faiss_index.search(np.array([query_embedding]), k=4)

# Step 3: Aggregate evidence from the top-ranked results.
# FAISS pads the result with -1 when the index holds fewer than k vectors;
# those slots are dropped, since -1 would otherwise silently pick the LAST
# metadata entry as evidence.
top_docs = [metadata_store[idx] for idx in I[0] if idx >= 0]
aggregated_evidence = aggregate_evidence(top_docs, query_text)

# Step 4: Generate the answer using the aggregated evidence
answer = generate_answer(aggregated_evidence, query_text)

# Step 5: Format the output
final_output = format_output(query_text, answer, aggregated_evidence)

# Print the final output in the required format
import json
print(json.dumps(final_output, indent=4))



{
    "query": "Which individual is implicated in both inflating the value of a Manhattan apartment to a figure not yet achieved in New York City's real estate history, according to 'Fortune', and is also accused of adjusting this apartment's valuation to compensate for a loss in another asset's worth, as reported by 'The Age'?",
    "answer": "trump",
    "question_type": "inference_query",
    "evidence_list": [
        {
            "title": "The $777 million surprise: Donald Trump is getting richer",
            "author": "Tom Maloney",
            "source": "The Age",
            "published_at": "2023-11-07T22:22:05+00:00",
            "url": "https://www.theage.com.au/business/companies/the-777-million-surprise-donald-trump-is-getting-richer-20231108-p5eicf.html?ref=rss&utm_medium=rss&utm_source=rss_business",
            "fact": "million the residential condo tower on manhattan\u2019s upper east side, formerly the hotel delmonico, isn\u2019t one of trump\u2019s best-known buildi

In [36]:
import json
from transformers import pipeline

# Load train.json dataset.
# JSON text is UTF-8, so decode explicitly instead of relying on the platform's
# default encoding.
with open('Hog RAGger Dataset/train.json', 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# NOTE: a "typeform/distilbert-base-uncased-mnli" text-classification pipeline
# used to be loaded into `classifier` at this point. That name is rebound to a
# YesNoClassifier instance at the end of this cell before the pipeline is ever
# called, so the load only cost download time and memory and has been dropped.

class YesNoClassifier:
    """Rule-based detector for yes/no questions.

    A query counts as a yes/no question when its first word is one of a fixed
    set of auxiliary/modal verbs, optionally in a negated form such as
    "isn't", "can't" or "cannot".
    """

    # Verbs that open a yes/no question.
    YES_NO_KEYWORDS = frozenset([
        'is', 'are', 'do', 'does', 'did', 'can', 'will', 'was', 'were',
        'has', 'have', 'had', 'shall', 'should', 'could',
    ])

    # Negation endings accepted after a keyword (ASCII and typographic apostrophes).
    _NEGATION_SUFFIXES = ("n't", "n\u2019t", "'t", "\u2019t", "not")

    def __init__(self):
        # Initialize a text classification pipeline with a pretrained model.
        # NOTE(review): is_yes_no_question() never calls this pipeline; it is
        # kept only so the `classifier` attribute stays available. Confirm
        # whether anything reads it before removing the (costly) load.
        self.classifier = pipeline("text-classification", model="mrm8488/bert-small-finetuned-squadv2")

    def is_yes_no_question(self, query):
        """Return True when ``query`` starts with a yes/no question verb.

        The check is made on the first whole word, case-insensitively, after
        ignoring leading whitespace and trailing punctuation on that word.
        Matching whole words (rather than a raw prefix) keeps queries such as
        "Israel ...", "Canada ..." or "Donald ..." from being misclassified.

        Args:
            query: The question text.

        Returns:
            bool: True for a yes/no question, False otherwise (including for
            an empty or whitespace-only query).
        """
        words = query.lower().split(None, 1)
        if not words:
            return False

        first_word = words[0].rstrip("?!.,;:")
        if first_word in self.YES_NO_KEYWORDS:
            return True

        # Accept negated forms: "isn't" -> "is", "can't" -> "can", "cannot" -> "can".
        return any(
            first_word.endswith(suffix) and first_word[:-len(suffix)] in self.YES_NO_KEYWORDS
            for suffix in self._NEGATION_SUFFIXES
        )

class YesNoEvidenceClassifierLLM:
    """Confirms a yes/no answer against evidence using a text classifier.

    NOTE(review): the checkpoint loaded in ``__init__`` reports, when loaded,
    that its classification head weights are newly initialized, so its
    predictions are untrained. Its label names are presumably generic
    (e.g. ``LABEL_0``/``LABEL_1``) rather than positive/negative - confirm
    and replace it with a model fine-tuned for this task.
    """

    def __init__(self):
        # Initialize a binary classification pipeline with a suitable model
        self.classifier = pipeline("text-classification", model="mrm8488/bert-small-finetuned-squadv2")

    def classify(self, data):
        """Check whether the classifier's label agrees with the given answer.

        Args:
            data: Dict with ``"query"`` (str), ``"answer"`` (str) and
                ``"evidence_list"`` (sequence of dicts each holding a
                ``"fact"`` string).

        Returns:
            ``"YES"`` when the answer is "yes" and the predicted label is
            positive, ``"NO"`` when the answer is "no" and the predicted
            label is negative, otherwise ``"Insufficient Information"``.
        """
        answer = data["answer"].strip().lower()  # Normalize the answer
        query = data["query"]
        evidence_list = data["evidence_list"]

        # Concatenate the query and evidence facts into a single context
        evidence_texts = " ".join(evidence["fact"] for evidence in evidence_list)
        full_context = f"Question: {query} Evidence: {evidence_texts} Answer: {answer}"

        # Run classification; truncate so a long context cannot exceed the
        # model's maximum input length.
        result = self.classifier(full_context, truncation=True)

        # Label casing differs between checkpoints ('POSITIVE' vs 'positive'),
        # so compare case-insensitively.
        label = result[0]['label'].lower()

        # Determine if the model agrees with the expected answer
        if label == 'positive' and answer == 'yes':
            return "YES"
        if label == 'negative' and answer == 'no':
            return "NO"
        return "Insufficient Information"

# Build the two helpers used by the evaluation loop: `yesNO` checks a yes/no
# answer against evidence, `classifier` detects yes/no questions. The evidence
# classifier is constructed first, then the question detector.
yesNO, classifier = YesNoEvidenceClassifierLLM(), YesNoClassifier()


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mrm8488/bert-small-finetuned-squadv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [None]:
# Initialize counters for accuracy
correct_predictions = 0
total_queries = len(train_data)

# Loop through each query in the dataset
for item in train_data:
    query = item['query']
    true_answer = item['answer']

    # Encode the query, search FAISS, and aggregate evidence
    query_embedding = model.encode(query)
    D, I = faiss_index.search(np.array([query_embedding]), k=5)
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # drop those slots, since -1 would silently select the LAST metadata entry.
    top_docs = [metadata_store[idx] for idx in I[0] if idx >= 0]
    aggregated_evidence = aggregate_evidence(top_docs, query)

    # Step 1: Check if the query is a yes/no question
    if classifier.is_yes_no_question(query):
        # Generate an answer, then let the evidence classifier confirm it
        answer = generate_answer(aggregated_evidence, query)
        final = format_output(query, answer, aggregated_evidence)
        final_answer = yesNO.classify(final)
    else:
        # If not a yes/no question, use the plain generation pipeline
        final_answer = generate_answer(aggregated_evidence, query).strip().lower()

    # Step 2: Check if the generated answer matches the true answer.
    # Both sides are normalized: the yes/no branch returns "YES"/"NO"/
    # "Insufficient Information", which could never equal a lowercased
    # reference answer when compared verbatim.
    if final_answer.strip().lower() == true_answer.strip().lower():
        correct_predictions += 1

    # Print results for inspection
    print(f"Actual: {true_answer}, Predicted: {final_answer}")

# Step 3: Calculate accuracy (reported as 0% for an empty dataset instead of
# raising ZeroDivisionError)
accuracy = correct_predictions / total_queries * 100 if total_queries else 0.0
print(f"Accuracy: {accuracy:.2f}%")
