# Step 1 - Import Libraries

Import the required libraries here. You could use additional libraries to help with your implementation.

In [1]:
import re
import os
import copy
import math
import random
import string
import pathlib
import itertools

import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer

from cs589.assignment1.utils.common import save_pickle_file, load_pickle_file, load_text_file

base_path = pathlib.Path("cs589/assignment1/dataset/")
tqdm.pandas()

In [2]:
def split_text(text):
    return text.split()


def load_qids(lang="java"):
    return [qid.strip(string.whitespace) for qid in load_text_file(base_path / pathlib.Path(f"{lang}/{lang}_test_qid.txt"))]


def load_qid_dataframe(lang="java"):
    qid_dataframe = pd.read_csv(base_path / pathlib.Path(f"{lang}/{lang}_cosidf.txt"), 
                                sep="\t", 
                                usecols=["qid1", "qid2", "label"],
                                dtype={"qid1": str, "qid2": str, "label": int})
    return qid_dataframe


def load_corpus(lang="java", verbose=False):
    lines = load_text_file(base_path / pathlib.Path(f"{lang}/{lang}_qid2all.txt"))

    record_list = list()
    for line in tqdm(lines, disable=not verbose):
        record_list.append(
            {name: text.strip(string.whitespace) for name, text in zip(["qid", "title", "question", "answer"], line.split("\t"))}
        )
            
    corpus_dataframe = pd.DataFrame(record_list)

    return corpus_dataframe

In [3]:
# take a look at the corpus
pd.set_option("display.max_columns", 10)

java_corpus_dataframe = load_corpus(lang="java", verbose=True)
print(java_corpus_dataframe.head())

100%|██████████████████████████████████████████████████████████████████████| 159263/159263 [00:00<00:00, 321690.98it/s]


        qid                                     title  \
0  31424546   eclipse mars starts exit code using jdk   
1  31457289  efficient method updating observablelist   
2  16777228                          set title jtable   
3  27262998                  multiple websockets java   
4  46137348      find runtime error nzec java program   

                                            question  \
0  plan moving eclipse mars recently installed bi...   
1  setup mysql database data makeshift server bui...   
2  newbie java wanted set table header jtable tak...   
3  deprecated ok opening connection specific port...   
4  find runtime error nzec java program program r...   

                                              answer  
0  jdk bit download windows x version point vm mi...  
1  need would work keep list instance serverlist ...  
2  define variable containing column names must i...  
3  trying achieve multiple function listen server...  
4  try test code input like probably shall re

# Step 2 - Data Preprocessing

The following cell computes the term frequency (TF) for each word in each component in each StackOverflow question (indexed by the question ID qid).

In [4]:
def get_corpus_tf_dict(corpus_dataframe):
    """ Input: corpus_dataframe, e.g.,    
    
         qid         title                 question          answer
 0  31424546   eclipse mars   eclipse moving eclipse    jdk download   
                                            
        Output: corpus_tf_dict, the term frequency for each word in each component of each question, e.g., 
        {'31424546': {'title': {'eclipse': 1, 'mars': 1},
                      'question': {'moving': 1, 'eclipse': 2},
                      'answer': {'jdk': 1, 'download': 1}}}
    """
    cnt_dataframe = copy.deepcopy(corpus_dataframe)
    for c in ["title", "question", "answer"]:
        cnt_dataframe[c] = cnt_dataframe[c].progress_apply(lambda x: Counter(split_text(x)))

    corpus_tf_dict = cnt_dataframe.set_index("qid").to_dict("index")
    
    return corpus_tf_dict

The following cell computes the document length (dl) of each component in each StackOverflow question (indexed by the question ID qid).

In [5]:
def get_corpus_dl_dict(corpus_dataframe):
    """ Input: corpus_dataframe, e.g.,    
         qid         title                 question          answer
0  31424546   eclipse mars   eclipse moving eclipse    jdk download  

        Output: corpus_dl_dict, the document length for each component from each question, e.g., 
        {'31424546': {'title': 2,
                      'question': 3,
                      'answer': 2}}
    """
    length_dataframe = copy.deepcopy(corpus_dataframe)
    for c in ["title", "question", "answer"]:
        length_dataframe[c] = length_dataframe[c].progress_apply(lambda x: len(split_text(x)))

    corpus_dl_dict = length_dataframe.set_index("qid").to_dict("index")
    
    return corpus_dl_dict

The following cell computes the document frequency (DF) of each word in each StackOverflow question (indexed by the question ID qid). The definition of document frequency is how many document a word appears in, not to be confused with the word's frequency in the entire corpus. For example, the df of "eclipse" below is 2 instead of 3.

In [6]:
def get_corpus_df_dict(corpus_dataframe):
    """ Input: corpus_dataframe, e.g.,    
         qid          title                 question          answer
 0  31424546   eclipse mars   eclipse moving eclipse    jdk download  

        Output: corpus_df_dict, the document length for each component from each question, e.g., 
        {'eclipse': 2, "mars": 1, "moving": 1, "jdk": 1, "download": 1}
    """
    vectorizer = CountVectorizer(binary=True)

    X = vectorizer.fit_transform(corpus_dataframe.title.tolist() + \
                                 corpus_dataframe.question.tolist() + \
                                 corpus_dataframe.answer.tolist())
    corpus_df_dict = {token: doc_freq for token, doc_freq in \
                      zip(vectorizer.get_feature_names(), np.ravel(X.sum(axis=0)))}
 
    return corpus_df_dict

## Saving the Data Preprocessing Result

After computing the TF, DF and dl, cache each of them in a pickle file to be loaded later:

In [7]:
pkl_path = pathlib.Path("pkl/")
if not pkl_path.exists(): pkl_path.mkdir()

def save_preprocessing_results(lang):
    print(f"Processing {lang}...")
        
    lang_pkl_path = pkl_path / lang
    if not lang_pkl_path.exists(): os.mkdir(lang_pkl_path)

    # load corpus and convert corpus to various required data
    corpus_dataframe = load_corpus(lang=lang, verbose=True)

    # obtain the dictionary for the term frequency for each word in each component of each question
    corpus_tf_dict = get_corpus_tf_dict(corpus_dataframe)

    # saving the term frequency dictionary
    save_pickle_file(corpus_tf_dict, f"pkl/{lang}/corpus_tf_dict.pkl")

    # obtain the dictionary for the document length for each component in each question 
    corpus_dl_dict = get_corpus_dl_dict(corpus_dataframe)
    
    # save the document length dictionary
    save_pickle_file(corpus_dl_dict, f"pkl/{lang}/corpus_dl_dict.pkl")

    # obtain the dictionary for the document frequency for each word in the corpus
    corpus_df_dict = get_corpus_df_dict(corpus_dataframe)

    # remove rare words
    corpus_df_dict = {k: v for k, v in corpus_df_dict.items() if v >= 20}

    # save the document frequency dictionary
    save_pickle_file(corpus_df_dict, f"pkl/{lang}/corpus_df_dict.pkl")

    return corpus_tf_dict, corpus_dl_dict, corpus_df_dict

Run the data processing pipeline for the 3 languages:

In [8]:
for lang in ["python", "java", "javascript"]:
     save_preprocessing_results(lang)

Processing python...


100%|██████████████████████████████████████████████████████████████████████| 128500/128500 [00:00<00:00, 365270.08it/s]
100%|██████████████████████████████████████████████████████████████████████| 128500/128500 [00:00<00:00, 222908.43it/s]
100%|███████████████████████████████████████████████████████████████████████| 128500/128500 [00:01<00:00, 73901.05it/s]
100%|███████████████████████████████████████████████████████████████████████| 128500/128500 [00:02<00:00, 63383.70it/s]
100%|██████████████████████████████████████████████████████████████████████| 128500/128500 [00:00<00:00, 450684.02it/s]
100%|██████████████████████████████████████████████████████████████████████| 128500/128500 [00:00<00:00, 190371.39it/s]
100%|██████████████████████████████████████████████████████████████████████| 128500/128500 [00:00<00:00, 181987.40it/s]


Processing java...


100%|██████████████████████████████████████████████████████████████████████| 159263/159263 [00:00<00:00, 283188.72it/s]
100%|██████████████████████████████████████████████████████████████████████| 159263/159263 [00:00<00:00, 216299.54it/s]
100%|███████████████████████████████████████████████████████████████████████| 159263/159263 [00:02<00:00, 61040.11it/s]
100%|███████████████████████████████████████████████████████████████████████| 159263/159263 [00:03<00:00, 51873.43it/s]
100%|██████████████████████████████████████████████████████████████████████| 159263/159263 [00:00<00:00, 508015.29it/s]
100%|██████████████████████████████████████████████████████████████████████| 159263/159263 [00:00<00:00, 200748.07it/s]
100%|██████████████████████████████████████████████████████████████████████| 159263/159263 [00:00<00:00, 173633.46it/s]


Processing javascript...


100%|██████████████████████████████████████████████████████████████████████| 174015/174015 [00:00<00:00, 292241.78it/s]
100%|██████████████████████████████████████████████████████████████████████| 174015/174015 [00:00<00:00, 191971.86it/s]
100%|███████████████████████████████████████████████████████████████████████| 174015/174015 [00:02<00:00, 70274.49it/s]
100%|███████████████████████████████████████████████████████████████████████| 174015/174015 [00:03<00:00, 57171.79it/s]
100%|██████████████████████████████████████████████████████████████████████| 174015/174015 [00:00<00:00, 503102.77it/s]
100%|██████████████████████████████████████████████████████████████████████| 174015/174015 [00:00<00:00, 197627.73it/s]
100%|██████████████████████████████████████████████████████████████████████| 174015/174015 [00:00<00:00, 180327.87it/s]


Create the folder result to store results for Question 4 - 6 if it does not exist.

In [9]:
result_path = pathlib.Path("result")
if not result_path.exists(): result_path.mkdir()

# Step 3 - Implement the TF-IDF and BM25 Algorithms

## Question 1 (30 pts)

Compute the cosine similarity given dictionaries of word count, query_dict and candidate_dict. When working with term frequencies, they are extracted from corpus_tf_dict using corpus_tf_dict[qid][component] syntax).

In [10]:
def compute_cosine_similarity(query_tf_dict, 
                              candidate_tf_dict):
    """ Input: query_tf_dict: a dict of word and its term frequency in query document, e.g.
               {"i": 1, "love": 1, "python": 1}
               candidate_tf_dict: a dict of word and its term frequency in the candidate document, e.g.
               {"i": 1, "like": 1, "c++": 1}
        Output: score: cosine similary between query and candidate documents
                0.33333333333333337
                
    """

    score = 0
    #############################################START HERE#############################################
    # Question 1 (30 pts)
    
    # Converting dictionary values into numpy arrays
    # according to dictionary keys i.e. terms
    query_tf_dict_values = list(query_tf_dict.keys())
    arr1 = np.array(query_tf_dict_values)
    
    candidate_tf_dict_values = list(candidate_tf_dict.keys())
    arr2 = np.array(candidate_tf_dict_values)
    
    # Count the number of occurances
    c1 = Counter(arr1)
    c2 = Counter(arr2)
    
    # all items
    all_items = set(c1.keys()).union(set(c2.keys()))
    
    x = [c1[k] for k in all_items]
    y = [c2[k] for k in all_items]
    
    # calculate cosine similarity
    # dot product of the two arrays divided by
    # the multiplication of the norm of individual arrays
    score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
    
    # the score calculated gives a runtime warning of
    # invalid value encountered in double_scalars
    
    # this happens because there are mathematical operations that
    # involve small or large multiplication
    # however, this doesn't affect the output
    # kindly check the output Q4.txt


    ##############################################END HERE##############################################
    return score

Test your compute_cosine_similarity implementation on the Python corpus when retrieving candidate's title using query's title.

In [11]:
lang = "python"

corpus_tf_dict = load_pickle_file(f"pkl/{lang}/corpus_tf_dict.pkl")
qid_dataframe = load_qid_dataframe(f"{lang}")

result_dict = dict()
for qid1, qid2 in list(qid_dataframe[["qid1", "qid2"]].to_records(index=False)):
    result_dict[(qid1, qid2)] = compute_cosine_similarity(corpus_tf_dict[qid1]["title"],
                                                          corpus_tf_dict[qid2]["title"])


result_filename = pathlib.Path("result/Q4.txt")
if result_filename.exists(): os.remove(result_filename)

with open(result_filename, "a") as fp:
    fp.write("qid1\tqid2\tscore\n")
    for (qid1, qid2), score in result_dict.items():
        fp.write(f"{qid1}\t{qid2}\t{score}\n")

  score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))


## Question 2 (30 pts)

Compute the TF-IDF score of each word in document_tf_dict and store it in the document_word_tfidf_dict.

For the total number of documents N, as our LinkSO dataset is scraped from the StackOverflow website, it is a small sample of the entire pool of posts, and the exact number of posts is constantly changing (see real-time statistics here for all topics). For the sake of this assignment, we could set the total number of posts to a constant, for example, N = 10 ** 6, as an approximation.

Notice the example provided as docstring is used to help you understand the input and output data structures. You are not expected to reproduce the numbers exactly.

In [12]:
def compute_document_tfidf(document_tf_dict, 
                           corpus_df_dict):
    """ Input: document_tf_dict: a dict of word and its term frequency in document
               {"i": 1, "love": 1, "python": 1}
               corpus_df_dict: a dict of word and its document frequencey in the entire corpus
               {"i": 2, "you": 1, "we": 3, "love": 1, "like": 1, "hate": 2, "python": 5, "c++": 3}
        Output: document_word_tfidf_dict: a dict of word and its TF-IDF score in the document
               {'i': 13.592366256649782, 'love': 14.103192380416024, 'python': 12.803907396283263}
    """

    document_word_tfidf_dict = dict()
    #############################################START HERE#############################################
    # Question 2 (30 pts)
    # copy dictionary keys from tf_dict to tfidf_dict
    document_word_tfidf_dict = dict.fromkeys(document_tf_dict)
    
    # Corpus length
    # N = len(corpus_tf_dict)
    N = 10 ** 6 # updated N as advised by Professor
    
    for key, value in document_word_tfidf_dict.items():
        # caluclate tf = count of t in d / no. of words in d
        tf = document_tf_dict[key] / sum(document_tf_dict.values())
        
        # calculate idf = log10(corpus length/(document frequency + 1))
        # handles words in document but not in corpus
        # using try and except
        
        # there might be some words in document that might not be in 
        # the corpus. we can mitigate with using try and except
        
        # use math.log10(N/(corpus_df_dict[key]+1)) when word in corpus
        # if not, use the simplified form i.e. math.log10(N)
        
        # log base 10 used as all the formulations in the slide and
        # other sources from the internet used log base 10
        # also since N = 10^6, it is better to used log base 10
        
        try:
            idf = math.log10(N/(corpus_df_dict[key]+1))
        except:
            idf = math.log10(N)
            
        # calculate tf-idf
        document_word_tfidf_dict[key] = tf * idf
        


    ##############################################END HERE##############################################
    
    return document_word_tfidf_dict

Test your compute_document_tfidf implementation on the title component of the Java corpus.

In [13]:
lang = "java"

corpus_tf_dict = load_pickle_file(f"pkl/{lang}/corpus_tf_dict.pkl")
corpus_df_dict = load_pickle_file(f"pkl/{lang}/corpus_df_dict.pkl")
qid_dataframe = load_qid_dataframe(f"{lang}")

result_dict = dict()
for qid1 in qid_dataframe.qid1.tolist():
    result_dict[qid1] = compute_document_tfidf(corpus_tf_dict[qid1]["title"],
                                               corpus_df_dict)

result_filename = pathlib.Path("result/Q5.txt")
if result_filename.exists(): os.remove(result_filename)

with open(result_filename, "a") as fp:
    fp.write("qid1\ttoken\ttfidf\n")
    for qid1, d in result_dict.items():
        for token, score in d.items():
            fp.write(f"{qid1}\t{token}\t{score}\n")

## Question 3 (40 pts)

Compute the BM25 score between query_tf_dict and candidate_tf_dict. N = 10 ** 6 following Question 2.

Notice the example provided as docstring is used to help you understand the input and output data structures. You are not expected to reproduce the numbers exactly.

In [14]:
def compute_document_bm25(query_tf_dict, 
                          candidate_tf_dict, 
                          corpus_df_dict,
                          candidate_length,
                          avgdl):
    """ Input: query_tf_dict: a dict of word and its term frequency in query document
               {"i": 1, "love": 1, "python": 1}     
               candidate_tf_dict:a dict of word and its term frequency in candidate document
               {"i": 1, "like": 1, "c++": 1}
               corpus_df_dict: a dict of word and its document frequencey in the entire corpus
               {"i": 2, "you": 1, "we": 3, "love": 1, "like": 1, "hate": 2, "python": 5, "c++": 3}
               candidate_length: number of words in candidate document
               3
               avgdl: average document length in the entire corpus
               4
       Output: score: BM25 score between query and candidate
               15.816571644101565
    """


    # hyperparameters for BM25 algorithm
    k1, b = 3, 0.75

    score = 0
    #############################################START HERE#############################################
    # Question 3 (40 pts)
    # N = len(corpus_tf_dict)
    N = 10 ** 6
    
    # calculate idf for each word in corpus
    # store the result in a dict named idf
    idf = {}
    
    for term, freq in corpus_df_dict.items():
        # calculate idf
        idf[term] = math.log10(1 + (N - freq + 0.5) / (freq + 0.5))
    
    for term in query_tf_dict:
        if term not in candidate_tf_dict:
            continue
        frequency = candidate_tf_dict[term]
        
        # similar to q2 try and except however in this we check something different
        # if term not in candidate document, the frequency will be 0
        # making the numerator 0
        try:
            numerator = idf[term] * frequency * (k1 + 1)
            denominator = frequency + k1 * (1 - b + b * candidate_length / avgdl)
        except:
            numerator = 0
            denominator = frequency + k1 * (1 - b + b * candidate_length / avgdl)
        
        score += (numerator / denominator)
    
    
    ##############################################END HERE##############################################

    return score


Test your compute_document_bm25 implementation on the title component of the JavaScript corpus.

In [15]:
lang = "javascript"

corpus_tf_dict = load_pickle_file(f"pkl/{lang}/corpus_tf_dict.pkl")
corpus_df_dict = load_pickle_file(f"pkl/{lang}/corpus_df_dict.pkl")
corpus_dl_dict = load_pickle_file(f"pkl/{lang}/corpus_dl_dict.pkl")

qid_dataframe = load_qid_dataframe(f"{lang}")

corpus_dataframe = load_corpus(lang=lang, verbose=True)
avgdl = corpus_dataframe["title"].apply(lambda x: len(split_text(x))).sum() / len(corpus_dataframe)

result_dict = dict()
for qid1, qid2 in list(qid_dataframe[["qid1", "qid2"]].to_records(index=False)):
    result_dict[(qid1, qid2)] = compute_document_bm25(corpus_tf_dict[qid1]["title"],
                                                      corpus_tf_dict[qid2]["title"],
                                                      corpus_df_dict,
                                                      corpus_dl_dict[qid2]["title"],
                                                      avgdl)


result_filename = pathlib.Path("result/Q6.txt")
if result_filename.exists(): os.remove(result_filename)

with open(result_filename, "a") as fp:
    fp.write("qid1\tqid2\tscore\n")
    for (qid1, qid2), score in result_dict.items():
        fp.write(f"{qid1}\t{qid2}\t{score}\n")

100%|██████████████████████████████████████████████████████████████████████| 174015/174015 [00:00<00:00, 351844.14it/s]


## Running Your Ranking Algorithms

The function run_retrieval_algorithm puts your implementations (compute_cosine_similarity, compute_document_tfidf, and compute_document_bm25) together and apply them to the entire dataset. Even though the code has been provided, it is recommended to read it to get a sense of how the retrieval pipeline works.

In [16]:
base_path = pathlib.Path("cs589/assignment1/dataset/")

def run_retrieval_algorithm(lang, algo, component, qid1s=None):
    corpus_tf_dict = load_pickle_file(f"pkl/{lang}/corpus_tf_dict.pkl")
    corpus_dl_dict = load_pickle_file(f"pkl/{lang}/corpus_dl_dict.pkl")
    corpus_df_dict = load_pickle_file(f"pkl/{lang}/corpus_df_dict.pkl")

    corpus_dataframe = load_corpus(lang=lang, verbose=False)
    available_ids = corpus_dataframe.qid.unique()
    avgdl = corpus_dataframe[component].apply(lambda x: len(split_text(x))).sum() / len(corpus_dataframe)

    qid1s = qid1s if qid1s != None else load_qids(lang=lang)
    qid1_dataframe = load_qid_dataframe(lang=lang)
    
    result_folder = pathlib.Path("result/")
    if not result_folder.exists(): result_folder.mkdir()

    result_filename = pathlib.Path(f"result/{lang}_{algo}_{component}.txt")

    # remove existing result file
    if result_filename.exists():
        os.remove(result_filename)

    # write header
    with open(result_filename, "a") as fp:
        fp.write("qid1\tqid2\tscore\tlabel\n")
    
    for qid1 in tqdm(qid1s):
        if qid1 not in available_ids: continue

        cond1 = qid1_dataframe.qid1 == qid1
        cond2 = qid1_dataframe.label == 1

        qid2s = qid1_dataframe[cond1].qid2.tolist()
        qid2s_linked = qid1_dataframe[cond1 & cond2].qid2.tolist()

        qid1_tf_dict = corpus_tf_dict[qid1]["title"]
        query_result = dict()

        # only for BM25
        max_bm25 = -1
        for qid2 in qid2s:
            if qid2 not in available_ids: continue

            qid2_tf_dict = corpus_tf_dict[qid2][component]

            # tfidf
            if algo == "tfidf":
                score = compute_cosine_similarity(compute_document_tfidf(qid1_tf_dict, corpus_df_dict),
                                                  compute_document_tfidf(qid2_tf_dict, corpus_df_dict))
            
            # bm25
            if algo == "bm25":
                candidate_length = corpus_dl_dict[qid2][component]
                score = compute_document_bm25(qid1_tf_dict, 
                                              qid2_tf_dict, 
                                              corpus_df_dict,
                                              candidate_length,
                                              avgdl)
                
                max_bm25 = max(score, max_bm25)
            
            query_result[qid2] = score
        
        # adjust BM25 score
        if (algo == "bm25") and (max_bm25 != 0):
            query_result = {qid: score / max_bm25 for qid, score in query_result.items()}
        
        qid2s_sorted = sorted(query_result, key=query_result.get, reverse=True)

        with open(result_filename, "a") as fp:
            for qid2 in qid2s_sorted:
                label = 1 if qid2 in qid2s_linked else 0
                score = query_result[qid2]
                
                fp.write(f"{qid1}\t{qid2}\t{score}\t{label}\n")

Run the retrieval algorithms and save the ranking results for each language and each retrieval algorithms:

In [17]:
langs = ["python", "java", "javascript"]
algos = ["bm25", "tfidf"]
components = ["title", "question", "answer"]

for lang, algo, component in itertools.product(langs, algos, components):
    print(f"Running {algo} on {lang}'s {component}...")
    run_retrieval_algorithm(lang, algo, component)

Running bm25 on python's title...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [30:50<00:00,  1.85s/it]


Running bm25 on python's question...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [30:33<00:00,  1.83s/it]


Running bm25 on python's answer...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [30:30<00:00,  1.83s/it]


Running tfidf on python's title...


  score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:42<00:00,  4.50it/s]


Running tfidf on python's question...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:07<00:00,  4.04it/s]


Running tfidf on python's answer...


  score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:48<00:00,  4.37it/s]


Running bm25 on java's title...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [36:10<00:00,  2.17s/it]


Running bm25 on java's question...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [36:05<00:00,  2.17s/it]


Running bm25 on java's answer...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [36:11<00:00,  2.17s/it]


Running tfidf on java's title...


  score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:29<00:00,  3.71it/s]


Running tfidf on java's question...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:38<00:00,  3.59it/s]


Running tfidf on java's answer...


  score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:37<00:00,  3.61it/s]


Running bm25 on javascript's title...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [33:35<00:00,  2.02s/it]


Running bm25 on javascript's question...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [33:43<00:00,  2.02s/it]


Running bm25 on javascript's answer...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [33:33<00:00,  2.01s/it]


Running tfidf on javascript's title...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:55<00:00,  3.38it/s]


Running tfidf on javascript's question...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:53<00:00,  3.41it/s]


Running tfidf on javascript's answer...


  score = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [04:54<00:00,  3.39it/s]
