Import tools

In [70]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
import pickle
import os
from operator import itemgetter
from collections import Counter
nltk.download('stopwords')
stemmer = PorterStemmer()

FLAG_TOKENIZE = False

[nltk_data] Downloading package stopwords to /home/atappy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Import data

In [2]:
# Import corpus and queries as a dataframe
corpus = pd.read_json('Data/corpus.jsonl', lines=True)
queries = pd.read_json('Data/queries.jsonl', lines=True)

In [50]:
test_corpus_id = [1396701, 1396704, 1396705, 1396707, 1396708, 1453630, 1605506, 
                  1652605, 1772930, 1772932, 1815965, 1907207, 1972028, 2025485, 
                  2077310, 210379, 210384, 2210591, 2210592, 2210593, 2210595, 
                  2210597, 2232789, 2393704, 2401465, 252142, 2588143, 2627023, 
                  275724, 275726, 275728, 275732, 2872589, 2923493, 2923494, 2923496, 
                  2923497, 2923498, 2923501, 2969050, 3143377, 3203819, 3305804, 3357360, 
                  3385968, 3385971, 3450898, 3485337, 3523599, 3523602, 3538160, 3538161, 
                  3538162, 3538163, 3538164, 3538165, 3538166, 3538167, 3538168, 3545107, 
                  3573471, 3573474, 3573477, 3573478, 3573479, 3918680, 3930148, 4044822, 
                  4245927, 4257001, 4272446, 4279298, 4279299, 4289598, 4378740, 4393631, 
                  4549698, 4566815, 4566816, 4566818, 4566819, 4566822, 4566823, 4656722, 
                  4691129, 4801563, 4834224, 4834226, 4889894, 4910037, 4927817, 4998398, 
                  5007631, 5063813, 5117497, 5359304, 5379124, 5568031, 5652031, 5702414, 
                  5774013, 5774014, 5866302, 5931262, 5931263, 5931264, 5931265, 5931266, 
                  5931267, 5931268, 5931269, 5931270, 5931271, 6142152, 6181638, 6226401, 
                  6260059, 6263225, 6351110, 6462880, 6521993, 6761344, 6761347, 6923052, 
                  6923054, 6923055, 6923056, 6944106, 7082922, 7176430, 7313730, 7313733, 
                  7327683, 7565849, 7643828, 7709320, 7742098, 7816726, 7832420, 7837084, 
                  7837086, 7861747, 7861748, 7861749, 7861750, 7861751, 7861753, 7861755, 
                  7861756, 7885529, 8002846, 8002850, 8002851, 8002852, 8002853, 8002854, 
                  8002855, 8029524, 8029527, 8029530, 8029531, 8029533, 8103063, 8103065, 
                  8103066, 8103067, 8103068, 8178996, 8178997, 8178998, 8179000, 8179001, 
                  8179003, 82105, 82106, 82107, 82108, 82109, 82110, 82111, 82113, 82114, 
                  8304084, 8402970, 8402971, 8402972, 8420020, 8455300, 8455301, 850454, 94879, 98515]

test_corpus_ranks = [0, 0, 1, 0, 2, 0, 3, 0, 0, 0, 2, 1, 2, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 2, 3, 2, 0, 3, 2, 2, 2, 3, 2, 0, 0, 0, 0, 1, 3, 2, 0, 1, 0, 0, 3, 2, 0, 2, 0, 0, 0, 2, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 3, 0, 0, 0, 2, 0, 1, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 2, 2, 0, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 2, 1, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 1, 0, 2, 2, 0, 0, 2, 2, 2, 3, 1, 2, 3, 2, 2, 2, 2, 2, 0, 3, 2, 2, 1, 2, 3, 1, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 3, 0, 3, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0]

test_corpus_text = corpus[corpus['_id'].isin(test_corpus_id)]
print(test_corpus_text)

test_query_id = 915593

test_query_text = queries[queries['_id']==test_query_id]
print(test_query_text['text'])

             _id                                               text
54646    7861751  The kind of down-to-the-degree control that on...
59650      82110  When to use: These are definitely the best opt...
82850    8304084  5. Cooking the eggs too long. Overcooked poach...
180252   3203819  1 Broil for 1 or 2 minutes until sauce is cook...
212412   3385968  Rather than cooking in boiling water, boiled e...
...          ...                                                ...
1471081  3538161  Fruit cooked sous vide can be enhanced by the ...
1471083  3538162  For the best quality sous vide cooking bag, we...
1471084  3538163  Sous-Vide Glazed Carrots Â». I'm the first to ...
1471090  3538167  How to sous vide watermelon. How to cook fruit...
1471190  2588143  Here are five ways to sous vide food in a cool...

[192 rows x 2 columns]
502964    what types of food can you cook sous vide
Name: text, dtype: object


In [74]:
from rank_bm25 import BM25Okapi
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer data if you haven't already

from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # Download the stopwords list if you haven't already

stop_words = set(stopwords.words('english'))

test_documents = [word_tokenize(document) for document in test_corpus_text['text']]
test_documents = [list(filter(lambda x: x.lower() not in stop_words and len(x) > 1, doc)) for doc in test_documents]
test_documents = list(map(lambda x : list(map(lambda y : stemmer.stem(y.lower()), x)), test_documents))
bm25 = BM25Okapi(test_documents)
query = [word_tokenize(document) for document in test_query_text['text']]
query = [list(filter(lambda x: x.lower() not in stop_words and len(x) > 1, doc)) for doc in query]
query = list(map(lambda x : list(map(lambda y : stemmer.stem(y.lower()), x)), query))[0]
scores = bm25.get_scores(query)
print(query)

['type', 'food', 'cook', 'sou', 'vide']


[nltk_data] Downloading package punkt to /home/atappy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/atappy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
sorted_corpus_ids = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
sorted_corpus_ids = [(test_corpus_text.iloc[x[0]]['_id'], test_corpus_ranks[x[0]]) for x in sorted_corpus_ids]
print(sorted_corpus_ids)

[(8179003, 2), (8178998, 3), (3538160, 0), (6923052, 0), (3538162, 0), (3538164, 1), (4566819, 1), (82107, 0), (82111, 2), (4566816, 2), (3523599, 2), (4566823, 0), (82113, 2), (1772930, 3), (1396701, 2), (82105, 2), (7837086, 0), (3538166, 0), (8402970, 0), (2210591, 0), (4566815, 3), (3523602, 2), (2210595, 3), (3357360, 0), (3538167, 0), (82109, 0), (8178996, 2), (2210597, 0), (8455301, 0), (8002852, 0), (3143377, 2), (2923497, 3), (8002855, 1), (82114, 0), (1396707, 2), (8103065, 2), (4834226, 2), (8103067, 3), (8103068, 2), (275732, 0), (7861756, 0), (3485337, 3), (5931266, 2), (3538165, 3), (5774013, 0), (5931265, 2), (7837084, 0), (8029533, 0), (1396704, 3), (7885529, 2), (4910037, 0), (82106, 0), (3385971, 2), (7816726, 2), (5931264, 2), (82110, 0), (8402971, 0), (2923501, 1), (275728, 0), (2588143, 0), (7861749, 3), (8103063, 2), (5866302, 0), (6923054, 0), (5931270, 2), (5931271, 0), (5931269, 0), (4566822, 2), (82108, 1), (1396705, 1), (3538168, 0), (8179000, 2), (8402972, 0