## Utils

In [2]:
import numpy as np

def remove_html_tags(text):
    result = []
    inside_tag = False
    for char in text:
        if char == '<':
            inside_tag = True
        elif char == '>':
            inside_tag = False
        elif not inside_tag:
            result.append(char)
    return ''.join(result)


def remove_https_links(text):
    words = text.split()
    filtered_words = []
    for word in words:
        if not word.startswith('https://'):
            filtered_words.append(word)
    return ' '.join(filtered_words)


def preprocess_text(text):
    text = remove_https_links(text)
    text = remove_html_tags(text)
    text = ''.join([char if char.isalnum() or char.isspace()
                   else ' ' for char in text])
    text = text.lower()
    text = ' '.join(text.split())
    return text


def cosine_similarity(tfidf_matrix_1, tfidf_matrix_2):
    dot_product = np.dot(tfidf_matrix_1, tfidf_matrix_2.T)
    norm_1 = np.linalg.norm(tfidf_matrix_1, axis=1, keepdims=True)
    norm_2 = np.linalg.norm(tfidf_matrix_2, axis=1, keepdims=True)
    norm_1[norm_1 == 0] = 1e-9
    norm_2[norm_2 == 0] = 1e-9
    similarity = dot_product / (norm_1 * norm_2.T)
    return similarity


## Model

### BM25

In [9]:
import math

class BM25:
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=1):
        self.k1 = k1
        self.b = b
        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self.compute_df(corpus)
        self.compute_idf(nd)

    def compute_df(self, corpus):
        nd = {}
        num_doc = 0
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)

            for word, freq in frequencies.items():
                nd[word] = nd.get(word, 0) + 1

            self.corpus_size += 1

        self.avgdl = num_doc / self.corpus_size
        return nd

    def compute_idf(self, nd):
        for word, freq in nd.items():
            self.idf[word] = math.log(
                (self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score


### Vector Model(TF-IDF)

In [4]:
class TFIDFModel:
    def __init__(self):
        self.vocab_index = {}
        self.idf_values = None

    def build_vocabulary(self, tokenized_texts):
        vocab = set([token for tokens in tokenized_texts for token in tokens])
        self.vocab_index = {word: i for i, word in enumerate(vocab)}

    def compute_tf(self, tokenized_texts):
        tf_matrix = np.zeros(
            (len(tokenized_texts), len(self.vocab_index)), dtype=int)
        for i, tokens in enumerate(tokenized_texts):
            for token in tokens:
                if token in self.vocab_index:
                    tf_matrix[i, self.vocab_index[token]] += 1
        doc_lengths = np.array([len(tokens) for tokens in tokenized_texts])
        return tf_matrix / doc_lengths[:, None]

    def compute_idf(self, tf_matrix):
        N = len(tf_matrix)
        df_count = np.sum(tf_matrix > 0, axis=0)
        self.idf_values = np.log(N / (1 + df_count))

    def fit_transform(self, tokenized_corpus, tokenized_queries):
        self.build_vocabulary(tokenized_corpus)

        doc_tf_matrix = self.compute_tf(tokenized_corpus)
        query_tf_matrix = self.compute_tf(tokenized_queries)

        self.compute_idf(doc_tf_matrix)
        doc_tfidf_matrix = doc_tf_matrix * self.idf_values
        query_tfidf_matrix = query_tf_matrix * self.idf_values

        return doc_tfidf_matrix, query_tfidf_matrix


## Test for train_question csv

### BM25

In [5]:
import pandas as pd
df_docs = pd.read_csv("documents_data.csv")
df_train = pd.read_csv("train_question.csv")

corpus = list(df_docs['Document_HTML'])
query = list(df_train['Question'])

processed_corpus = [preprocess_text(doc) for doc in corpus]

tokenized_corpus = [doc.split(" ") for doc in processed_corpus]

bm25 = BM25(tokenized_corpus)
hit = 0
for i, q in enumerate(query):
    tokenized_query = q.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    top_n_indices = np.argsort(doc_scores)[::-1][:3]
    print(f"Query {i} top 3: {top_n_indices}")

    if i in top_n_indices:
        hit += 1
print('Recall@3 accuracy : ', hit/len(query)*100)


Query 0 top 3: [198   0 543]
Query 1 top 3: [  1 460 632]
Query 2 top 3: [  2 674 388]
Query 3 top 3: [148 541 779]
Query 4 top 3: [  4   9 324]
Query 5 top 3: [889 942  53]
Query 6 top 3: [  6 328 786]
Query 7 top 3: [  7 423 898]
Query 8 top 3: [573   8 200]
Query 9 top 3: [  9 623   4]
Query 10 top 3: [611 674 116]
Query 11 top 3: [ 11 342 854]
Query 12 top 3: [ 12 963 228]
Query 13 top 3: [ 13 342 540]
Query 14 top 3: [ 14 838 759]
Query 15 top 3: [ 15 868 838]
Query 16 top 3: [ 16 975 623]
Query 17 top 3: [618  17 800]
Query 18 top 3: [ 18 714 679]
Query 19 top 3: [ 19 699 595]
Query 20 top 3: [ 20 845 485]
Query 21 top 3: [ 21  53 433]
Query 22 top 3: [ 22 463 802]
Query 23 top 3: [ 23 831 879]
Query 24 top 3: [ 24  50 265]
Query 25 top 3: [ 25 565 280]
Query 26 top 3: [ 26 969 627]
Query 27 top 3: [ 27 292  72]
Query 28 top 3: [ 28 954   4]
Query 29 top 3: [786  29 646]
Query 30 top 3: [ 30 998   0]
Query 31 top 3: [905  31 175]
Query 32 top 3: [ 32 438 393]
Query 33 top 3: [927

### Vector model(TF-IDF)

In [6]:
df_docs = pd.read_csv("documents_data.csv")
df_train = pd.read_csv("train_question.csv")

corpus = list(df_docs['Document_HTML'])
query = list(df_train['Question'])

processed_corpus = [preprocess_text(doc) for doc in corpus]

tokenized_corpus = [doc.split(" ") for doc in processed_corpus]

processed_query = [preprocess_text(q) for q in query]
tokenized_queries = [q.split(" ") for q in processed_query]

tfidf = TFIDFModel()

doc_tfidf_matrix, query_tfidf_matrix = tfidf.fit_transform(
    tokenized_corpus, tokenized_queries)

cosine_sim_matrix = cosine_similarity(query_tfidf_matrix, doc_tfidf_matrix)

top_n = 3
top_doc_indices = np.argsort(cosine_sim_matrix, axis=1)[:, -top_n:][:, ::-1]

hit = 0
for i, indices in enumerate(top_doc_indices):
    print(f"Query {i} top 3 : {indices.tolist()}")
    if i in indices:
        hit += 1
print('Recall@3 accuracy : ', hit/len(top_doc_indices)*100)


Query 0 top 3 : [453, 0, 543]
Query 1 top 3 : [1, 460, 456]
Query 2 top 3 : [689, 674, 2]
Query 3 top 3 : [148, 541, 779]
Query 4 top 3 : [4, 809, 9]
Query 5 top 3 : [5, 583, 163]
Query 6 top 3 : [328, 786, 6]
Query 7 top 3 : [7, 898, 276]
Query 8 top 3 : [8, 573, 200]
Query 9 top 3 : [9, 337, 368]
Query 10 top 3 : [437, 10, 487]
Query 11 top 3 : [11, 965, 848]
Query 12 top 3 : [76, 306, 12]
Query 13 top 3 : [13, 342, 415]
Query 14 top 3 : [14, 712, 398]
Query 15 top 3 : [15, 585, 738]
Query 16 top 3 : [16, 937, 300]
Query 17 top 3 : [618, 17, 419]
Query 18 top 3 : [18, 545, 398]
Query 19 top 3 : [78, 904, 19]
Query 20 top 3 : [20, 942, 485]
Query 21 top 3 : [21, 138, 433]
Query 22 top 3 : [22, 106, 436]
Query 23 top 3 : [23, 340, 869]
Query 24 top 3 : [24, 869, 772]
Query 25 top 3 : [25, 565, 560]
Query 26 top 3 : [26, 969, 23]
Query 27 top 3 : [861, 292, 27]
Query 28 top 3 : [28, 281, 569]
Query 29 top 3 : [29, 786, 131]
Query 30 top 3 : [30, 998, 263]
Query 31 top 3 : [905, 31, 152]

## Inference for test_question.csv

### BM25

In [7]:
df_docs = pd.read_csv("documents_data.csv")
df_test = pd.read_csv("test_question.csv")

corpus = list(df_docs['Document_HTML'])
query = list(df_test['Question'])

processed_corpus = [preprocess_text(doc) for doc in corpus]

tokenized_corpus = [doc.split(" ") for doc in processed_corpus]

bm25 = BM25(tokenized_corpus)
df_result = pd.DataFrame(columns=['index', 'answer'])

for i, q in enumerate(query):
    tokenized_query = q.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    top_n_indices = np.argsort(doc_scores)[::-1][:3]
    answers = " ".join(str(idx+1) for idx in top_n_indices.tolist())
    row = {'index': i+1, 'answer': answers}
    df_result = pd.concat([df_result, pd.DataFrame([row])], ignore_index=True)
csv_filename = "output_BM25.csv"
df_result.to_csv(csv_filename, index=False)
print(f"output csv file -> {csv_filename}")


output csv file -> output_BM25.csv


### Vector model(TF-IDF)

In [8]:
df_docs = pd.read_csv("documents_data.csv")
df_test = pd.read_csv("test_question.csv")

corpus = list(df_docs['Document_HTML'])
query = list(df_test['Question'])

processed_corpus = [preprocess_text(doc) for doc in corpus]
tokenized_corpus = [doc.split(" ") for doc in processed_corpus]

processed_query = [preprocess_text(q) for q in query]
tokenized_queries = [q.split(" ") for q in processed_query]

tfidf = TFIDFModel()

doc_tfidf_matrix, query_tfidf_matrix = tfidf.fit_transform(
    tokenized_corpus, tokenized_queries)

cosine_sim_matrix = cosine_similarity(query_tfidf_matrix, doc_tfidf_matrix)

top_n = 3
top_doc_indices = np.argsort(cosine_sim_matrix, axis=1)[:, -top_n:][:, ::-1]

df_result = pd.DataFrame(columns=['index', 'answer'])
for i, indices in enumerate(top_doc_indices):
    answer = " ".join(str(idx+1) for idx in indices.tolist())
    row = {'index': i+1, 'answer': answer}
    df_result = pd.concat(
        [df_result, pd.DataFrame([row])], ignore_index=True)
csv_filename = "output_tfidf.csv"
df_result.to_csv(csv_filename, index=False)
print(f"output csv file -> {csv_filename}")


output csv file -> output_tfidf.csv
