In [8]:
# References
# https://en.wikipedia.org/wiki/Okapi_BM25
# https://stackoverflow.com/questions/20510768/count-frequency-of-words-in-a-list-and-sort-by-frequency

import pandas as pd
import numpy as np
import re
from collections import Counter

# class IMDB_Query_Class
class IMDB_Query_Class:
    '''
    A class that implements the BM25 retrieval function for IMDB reviews.
    '''
    def __init__(self, test_docs):
        self.test_docs = self.preprocess_docs(test_docs) # preprocess test docs

    def fit(self, train_docs):
        self.train_docs = self.preprocess_docs(train_docs) # train docs
        self.avgdl = self.train_docs['review'].apply(lambda x: len(x.split())).mean() # compute avgdl
        self.N = train_docs.size # N total number of docs

    def idf(self, q_i):
        N = self.N # get N
        n_qi = len(self.train_docs[self.train_docs['review'].str.contains(q_i)]) # total number of train docs that include token qi
        idf = np.log ( (N - n_qi + 0.5) / (n_qi + 0.5) + 1 ) # idf calculation
        return idf

    def f_qi(self, q_i, D):
        counts = Counter(D) # count the words
        return counts[q_i] # return the count of q_i

    def score(self, q_doc):                
        Q = self.preprocess_doc(q_doc) # query doc
        self.test_docs['score'] = self.test_docs['review'].apply(self.bm25, args=(Q,)) # apply bm25 formula and append to score column

    def bm25(self, x, Q):
        x = x.split() # get tokens of D doc
        bm25 = 0
        b = self.b # get b
        avgdl = self.avgdl # get avgdl
        k = self.k # get k
        Q_tokens = Q.split() # split query doc in tokens
        for qi in Q_tokens: # loop through tokens
            f_qi = self.f_qi(qi, x) # compute f(qi, N)
            bm25 += self.idf(qi)*( (f_qi*(k+1)) / (f_qi+k*(1-b+b* (len(x)/avgdl) )) ) # compute bm25 formula
        return bm25

    def get_top_ten(self):
        # get the top ten matching reviews
        self.test_docs = self.test_docs.sort_values(by='score', ascending=False) # sort by score descending
        return self.test_docs.head(10).reset_index(drop=True)

    def set_k_b(self, k, b):
        # set the free variables
        self.k = k
        self.b = b
    
    def preprocess_docs(self, docs):
        # run preprocess function for each doc
        x = docs
        x['review'] = docs['review'].apply(lambda x: self.preprocess_doc(x))
        return x
    
    def preprocess_doc(self, doc):
        # remove special chars and punctuations
        return re.sub(r'[^\w\s]','', doc)

# set query doc and datasets
q_doc = "Well, since it's called Porno Holocaust and directed by Joe D'Amato, I went into this film expecting sleaze...and while I somewhat got it, Porno Holocaust was a massive disappointment as it's just so damned BORING. The title suggests that the film will feature porn, and that's not wrong - Porno Holocaust is pretty much just porn, and most of it is just the same stuff over and over again, I was fast forwarding before the end. The first sex scene is between two women and it got my hopes up, but after that it just degenerates into normal porn, and the rest of the film (for the first hour!) is made up of talking, and you can imagine how much fun that is to sit through! The plot focuses on a deserted island where, believe it or not, something strange is going on. Naturally, it's not long before a group of people - made up of a few men and some scientists, who all happen to be sexy women, land on the island. They have sex a few times and some strange things happen, then over an hour later they're attacked by a mutant zombie creature with an eye for the ladies...<br /><br />This must have seemed like a good idea for an original porno - a zombie who likes to get it on, but unsurprisingly it doesn't work well at all. The film clocks in at just ten minutes short of the two hour mark, and that is far too long for a film like this. I have no idea why Porno Holocaust is as long as it is; if they'd just snipped one minute out of every sex scene, the film would have been under ninety minutes, and that would have made it much more tolerable! The zombie takes what seems like an eternity to appear (it's quite a long time before there's a sex break long enough for them to actually travel to the island in the first place), and when it does finally appear, it's a huge disappointment! I realise that this is low budget B-movie trash, but D'Amato surely could have tried a bit harder and come up with something better than this! I'm not even going to bother mentioning the acting, atmosphere etc, there's no point. Porno Holocaust is basically just your average dull porn flick with a slight sprinkling of horror, and I can't recommend it!"
train_docs = pd.read_csv('imdb_train.csv').head(100)
test_docs = pd.read_csv('imdb_test.csv').head(100)

# run the class
bm = IMDB_Query_Class(test_docs)
bm.set_k_b(1.2, 0.75)
bm.fit(train_docs)
bm.score(q_doc)
bm.get_top_ten()


Unnamed: 0,review,score
0,One of the most sublime of American masterpiec...,114.160877
1,When Jim Wynorski first announced he would be ...,113.272473
2,What I hoped for or even expected was the well...,112.067772
3,Less than 10 minutes into this film I wanted i...,104.637512
4,I do not think I am alone when I say that 2005...,103.602482
5,Im a true fan of the original Cracker series a...,102.098806
6,I saw this when it first came to video my litt...,100.930495
7,The school nerd Marty Simon Scuddamore is sexu...,98.410959
8,I always believed that a film thats plot is ce...,98.365826
9,The Other Boleyn Girl not to be confused with...,94.301303
