# In [None]:
import json, os
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List, Dict, Optional

from llama_index import Document

# Dataset switch read by the __main__ entry point: when true the run uses the
# data/sample-*.json files, otherwise data/corpus.json and data/rag.json.
STAGING=True

# Helper functions for file handling (same as before)
def save_list_to_json(lst, filename):
    """Serialize ``lst`` as JSON and write it to ``filename``.

    Missing parent directories of ``filename`` are created first, so results
    are not lost merely because the output folder does not exist yet. An
    existing file at ``filename`` is overwritten.

    Args:
        lst: JSON-serializable object to store.
        filename: Destination path of the JSON file.

    Raises:
        TypeError: If ``lst`` contains a value ``json`` cannot serialize.
        OSError: If the directory or the file cannot be created or written.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # A bare file name has no directory part; os.makedirs('') would raise.
        os.makedirs(parent_dir, exist_ok=True)

    # json.dump escapes non-ASCII characters by default; the explicit encoding
    # only makes the written bytes independent of the platform locale.
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(lst, file)

def rm_file(file_path):
    """Remove ``file_path`` from disk when it exists; otherwise do nothing.

    A confirmation line is printed only when a file was actually removed.
    """
    if not os.path.exists(file_path):
        # Nothing to clean up.
        return
    os.remove(file_path)
    print(f"File {file_path} removed successfully.")

class JSONReader:
    """Reader that turns a JSON or JSON Lines file into ``Document`` objects.

    Every record must be a mapping with the keys ``title``, ``published_at``,
    ``source`` and ``body``. ``body`` becomes the document text and the other
    three values are stored as metadata.
    """

    def __init__(self, is_jsonl: Optional[bool] = False) -> None:
        """Initialize the reader.

        Args:
            is_jsonl: When truthy, ``load_data`` parses the input as JSON
                Lines (one JSON record per non-blank line). Otherwise the
                whole file is parsed as a single JSON array of records.
        """
        super().__init__()
        self.is_jsonl = is_jsonl

    def load_data(self, input_file: str) -> List[Document]:
        """Load the records of ``input_file`` as a list of documents.

        Args:
            input_file: Path to the JSON (or JSON Lines) file to read.

        Returns:
            One ``Document`` per record, in file order.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the content is not valid JSON.
            KeyError: If a record lacks one of the required keys.
        """
        # JSON text is UTF-8 by specification; do not depend on the locale.
        with open(input_file, 'r', encoding='utf-8') as file:
            if self.is_jsonl:
                # Blank lines (e.g. a trailing newline) carry no record.
                records = [json.loads(line) for line in file if line.strip()]
            else:
                records = json.load(file)

        documents = []
        for record in records:
            metadata = {
                "title": record['title'],
                "published_at": record['published_at'],
                "source": record['source'],
            }
            documents.append(Document(text=record['body'], metadata=metadata))
        return documents


    
def gen_tfidf(corpus, queries, output_name, top_k=10):
    """Run TF-IDF retrieval for every query and save the rankings as JSON.

    The corpus is vectorized once with ``TfidfVectorizer``. Each query is then
    projected into the same space and the ``top_k`` best-scoring corpus texts
    are stored together with the query's answer and gold evidence.

    Args:
        corpus: Path to the corpus JSON file (read through ``JSONReader``).
        queries: Path to a JSON file holding a list of query records. Each
            record needs ``query`` and ``answer``; ``question_type`` and
            ``evidence_list`` are optional.
        output_name: Path of the JSON file the results are written to. A
            previous file at this path is removed before the run starts.
        top_k: Number of highest-scoring documents kept per query.

    Raises:
        ValueError: If the corpus file contains no documents.
        KeyError: If a query record lacks ``query`` or ``answer``.
    """
    # Function-scope import: only this function needs heapq.
    import heapq

    print('Remove save file if exists.')
    rm_file(output_name)

    # Read the corpus json file
    reader = JSONReader()
    documents = reader.load_data(corpus)
    if not documents:
        # Fail with a clear message instead of an IndexError on the preview.
        raise ValueError(f"Corpus file {corpus!r} contains no documents.")

    print('Corpus Data')
    print('--------------------------')
    print(documents[0])
    print('--------------------------')

    corpus_texts = [doc.text for doc in documents]

    # Create TF-IDF matrix for the corpus
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus_texts)

    print('TF-IDF Initialized ...')

    # Parse the queries
    with open(queries, 'r', encoding='utf-8') as file:
        query_data = json.load(file)

    print('Query Data')
    print('--------------------------')
    # An empty query file is valid and simply yields an empty result file.
    print(query_data[0] if query_data else query_data)
    print('--------------------------')

    retrieval_save_list = []
    print("Running TF-IDF Retrieval ...")

    for entry in tqdm(query_data):
        query_vector = vectorizer.transform([entry['query']])

        # TfidfVectorizer L2-normalizes its rows by default, so this dot
        # product is the cosine similarity between the query and each document.
        scores = (X @ query_vector.T).toarray().flatten()

        # Same ordering as sorted(..., reverse=True)[:top_k] (ties fall back
        # to comparing the text) without sorting the whole corpus per query.
        top_results = heapq.nlargest(top_k, zip(scores, corpus_texts))

        retrieval_list = [
            # float() turns the NumPy scalar into a plain Python float.
            {'text': text, 'score': float(score)}
            for score, text in top_results
        ]

        # Save query, answers, and retrieved documents
        retrieval_save_list.append({
            'query': entry['query'],
            'answer': entry['answer'],
            'question_type': entry.get('question_type', None),
            'retrieval_list': retrieval_list,
            'gold_list': entry.get('evidence_list', []),
        })

    print('Retrieval complete. Saving Results...')
    save_list_to_json(retrieval_save_list, output_name)

if __name__ == '__main__':
    # Staging runs use the small sample files; otherwise the full dataset.
    corpus, queries = (
        ("data/sample-corpus.json", "data/sample-rag.json")
        if STAGING
        else ("data/corpus.json", "data/rag.json")
    )
    output_name = "output/tfidf-retrieval.json"

    gen_tfidf(corpus, queries, output_name)
