In [139]:
import numpy as np
from collections import Counter

class BM25:
    def __init__(self, documents: list[str]):
        self.documents = documents
        self.bag_of_words = [term for doc in self.documents for term in doc.split()]
        self.document_count = len(documents)
        self.avg_document_length = sum(len(doc.split()) for doc in documents) / self.document_count
        self.term_counts = self.calculate_term_counts()
        self.k1 = 1.2
        self.b = 0.75
        self.alpha = 0.01

    def calculate_term_counts(self):
        term_counts = Counter()
        for document in self.documents:
            term_counts.update(document)
        return term_counts

    def calculate_idf(self, term: str):
        document_with_term_count = self.term_counts[term]
        return np.log((self.document_count - document_with_term_count + 0.5) / (document_with_term_count + 0.5) + self.alpha)

    def calculate_bm25_score(self, query: str, document: list[str]):
        score = 0.0
        document_length = len(document)
        tokens = query.split()
        query_terms = Counter(tokens)
        for term in query_terms:
            if term not in self.bag_of_words:
                continue
                print(f'{term} not in {self.documents}')
            idf = self.calculate_idf(term)
            term_frequency = document.count(term)
            numerator = term_frequency * (self.k1 + 1)
            denominator = term_frequency + self.k1 * (1 - self.b + self.b * (document_length / self.avg_document_length))
            score += idf * (numerator / denominator)

        return score

    def rank_documents(self, query: str):
        document_scores = []
        for document in self.documents:
            score = self.calculate_bm25_score(query, document)
            document_scores.append((document, score))
        
        ranked_documents = sorted(document_scores, key=lambda x: x[1], reverse=True)
        return ranked_documents

In [115]:
import json
from preprocessing import FileIO
path = '../vsa_practice/practice_data/impact-theory-new-ft-model-256.parquet'
data = FileIO().load_parquet(path)

Shape of data: (26448, 12)
Memory Usage: 2.42+ MB


In [140]:
contents = [d['content'] for d in data]
docs = [contents[0], contents[1000], contents[10000], contents[20000]]
docs

["You said these are dangerous times. The world order is shifting before our eyes. We also both know that with hyper disruptive technologies like AI on the horizon, a good outcome is not guaranteed. Why do you think big tech will become the third superpower and what are the dangers and opportunities if it does? Big tech is essentially sovereign over the digital world. The fact that former President Trump was de-platformed from Facebook and from Twitter when he was president, you know, most powerful political figure on the planet. And he's just taken off of those networks and as a consequence, hundreds of millions of people that would be regularly engaging with him in real time suddenly can't see it. That wasn't a decision that was made by a government. It wasn't a decision made by a judge or by a regulatory authority or even by a multinational organization like, you know, the UN. It was made by individuals that own tech companies. The same thing is true in the decision to help Ukraine 

In [141]:
test = BM25(docs)
test.calculate_idf('the')

2.1983350716202463

In [138]:
Counter('power in war'.split())

Counter({'power': 1, 'in': 1, 'war': 1})

In [150]:
query = 'sadness melancholy'

In [151]:
test.rank_documents(query)

[("But if you're not focusing on it, then patience happens by default. Like sadness, for example, like that was really helped me to find, figure out just even defining the word in terms of operational perspective, helped me get out of those funks faster, which is sadness comes from a lack of options, a perceived lack of options, which is why it feels like hopelessness. But if it comes from a perceived lack of options, then it means that you solve that with knowledge because it's perceived lack of options, which is an ignorance problem, which means it's solvable, which all of a sudden gives me something to do. So then all of a sudden I do have an option and then you can get out of the funk. And like anxiety is the reverse of that, which is I have many options and I don't know which one to pick, which means I don't have priorities. So like you solve sadness through knowledge. You solve anxiety through decisions. And so like helping me just spell those out to myself. I'm like, ah, I feel 

In [32]:
def calculate_bm25_score(query: str, document: list[str]):
    score = 0.0
    document_length = len(document)
    query_terms = Counter(query)

    for term in query_terms:
        if term not in self.documents:
            continue
        idf = self.calculate_idf(term)
        term_frequency = document.count(term)
        numerator = term_frequency * (self.k1 + 1)
        denominator = term_frequency + self.k1 * (1 - self.b + self.b * (document_length / self.avg_document_length))
        score += idf * (numerator / denominator)

In [33]:
calculate_bm25_score('apple', documents)

NameError: name 'self' is not defined