In [1]:
#importing dependencies
import numpy as np
import os
import pickle as pkl
from urllib.request import urlopen
from bs4 import BeautifulSoup
import math
from math import log2
import utils

from base_classes.load_train_data import load_train_data
from base_classes.query import Query
from base_classes.document import Document

In [2]:
base_dir_name = os.getcwd()
data_dir_name = "project_data"
data_dir = os.path.join(base_dir_name, data_dir_name)

# Show the regular, non-hidden files that live directly in the data directory.
for item in os.listdir(data_dir):
    is_hidden = item.startswith('.')
    if is_hidden or not os.path.isfile(os.path.join(data_dir, item)):
        continue
    print(item)

pa3.rel.train
BSBI.dict
docs.dict
pa3.rel.dev
pa3.signal.dev
terms.dict
pa3.signal.train


# The new plumbing

Given a Query object query, query_dict[query] maps to a dictionary url_dict.
url_dict maps urls to Document objects.
Documents will store the headers, title, body_content (both as lists of word_ids, which can be mapped to strings)
as well as relevance to the query.

Note! Do not construct Query objects directly from words. Use the query mapping to get a Query object from words, and to get the words back from a Query object.

First load up the objects from the web scraping, as they define body_content and an initial vocabulary.

In [3]:
# These were built by the web scraping process.
# NOTE: pickle.load can execute arbitrary code - only load pickle files produced by this project.
def _load_pickle(path):
    """Load a single pickled object from `path`, closing the file afterwards."""
    with open(path, 'rb') as pickle_file:
        return pkl.load(pickle_file)

vocab_id_list = _load_pickle('vocab_id_list.p')
vocab_dict = _load_pickle('vocab_dict.p')
vocab_frequency = _load_pickle('vocab_frequency.p')
docId_to_content = _load_pickle('doc_id_content.p')
doc_id_list = _load_pickle("doc_id_list.p")
doc_dict = _load_pickle("doc_dict.p")

# Signal file -> query_dict (built by the project's load_train_data helper).
file_name = os.path.join(data_dir, "pa3.signal.train")
query_dict = load_train_data(file_name)


In [4]:
# use to map from word_ids (integers) to words (strings) and vice versa 
class WordIDMap:
    """Two-way vocabulary between words (strings) and integer word ids.

    The id of a word is its position in `vocab_string_list`; `vocab_dict`
    holds the reverse mapping from word to id.
    """

    def __init__(self, vocab_string_list=None, vocab_dict=None):
        """Start empty, or adopt the given list and dict when both are non-empty.

        The given containers are stored as-is (not copied), so later additions
        are visible through the caller's references as well.
        """
        adopt_given = bool(vocab_string_list and vocab_dict)
        self.vocab_string_list = vocab_string_list if adopt_given else []
        self.vocab_dict = vocab_dict if adopt_given else {}

    def add_string(self, string):
        """Register `string` under the next free id; known strings are left alone."""
        if string in self.vocab_dict:
            return
        self.vocab_dict[string] = len(self.vocab_string_list)
        self.vocab_string_list.append(string)

    def get_string(self, ID):
        """Return the word stored under `ID` (asserts that ID is below the vocabulary size)."""
        assert ID < len(self.vocab_string_list)
        return self.vocab_string_list[ID]

    def get_id(self, string):
        """Return the id of `string`, adding it to the vocabulary first if needed.

        Auto-adding means query/title words never have to be registered explicitly.
        """
        self.add_string(string)  # no-op for words that are already known
        return self.vocab_dict[string]

    def __len__(self):
        """Number of distinct words in the vocabulary."""
        return len(self.vocab_string_list)

In [5]:
# instantiate the WordIDMap with the words scraped from the stanford domain
# (files are opened with `with` so the handles are closed right after loading;
# pickle.load can execute arbitrary code, so only load files produced by this project)
with open('vocab_id_list.p', 'rb') as pickle_file:
    vocab_id_list = pkl.load(pickle_file)
with open('vocab_dict.p', 'rb') as pickle_file:
    vocab_dict = pkl.load(pickle_file)
word_map = WordIDMap(vocab_id_list, vocab_dict)

# vocabulary size before any query/title words get added through get_id
before_length = len(word_map)
print(word_map.get_string(0))


stanford


Read the relevance scores from the train set, and add them to the documents within the query_dict.

In [6]:
def get_rel_scores(query_dict, filename):
    """Copy the relevance labels of a relevance file onto the documents in query_dict.

    The file is read line by line. A line starting with "q" opens a new query
    (its text is whatever follows the last ":"). Every other non-blank line is
    a url line: after the first ":" come the url and its relevance, separated
    by single spaces. Negative relevances are clamped to 0.

    Args:
        query_dict: mapping Query -> {url: document}. Matched documents get
            their `relevance` attribute set to a float.
        filename: path of the relevance file.

    Returns:
        List of (query, url) pairs that could not be matched. Once one url of
        a query fails to match (or the query itself is not in query_dict), the
        remaining urls of that query are reported too instead of being assigned.

    Raises:
        ValueError: if a url line appears before any query line, or a url line
            contains no ":" / a non-numeric relevance.
    """
    urls_missing = []
    query = None
    url_dict = None
    # True while the current query is known and all of its urls so far matched.
    query_in_dict = False
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on the missing ":"

            if line.startswith("q"):
                query = Query(line.split(":")[-1].strip())
                # .get avoids a KeyError for queries that are absent from query_dict
                url_dict = query_dict.get(query)
                query_in_dict = url_dict is not None
                continue

            if query is None:
                raise ValueError("url line found before any query line: {!r}".format(line))

            tokens = line[line.index(":")+1:].strip().split(" ")
            url = tokens[0]
            rel = float(tokens[1])
            if rel < 0:
                rel = 0.0

            if query_in_dict and url in url_dict:
                url_dict[url].relevance = rel
            else:
                # we know this query is wrong, so wait til a new query comes along
                query_in_dict = False
                urls_missing.append((query, url))

    return urls_missing

def get_relevance_dict(query_dict, query):
    """Return {url: relevance} for every document stored under `query` in query_dict."""
    relevance_by_url = {}
    for url, document in query_dict[query].items():
        relevance_by_url[url] = document.relevance
    return relevance_by_url
             
relevance_filenames = os.path.join(data_dir, 'pa3.rel.train')
urls_missing = get_rel_scores(query_dict, relevance_filenames)

# After merging the labels, every document must carry a relevance value.
for url_dict in query_dict.values():
    for doc in url_dict.values():
        assert doc.relevance is not None


In [7]:
# Inspect the (query, url) pairs from the relevance file that get_rel_scores could not match.
urls_missing

[(cardinal nights, 'http://events.stanford.edu/events/453/45363/'),
 (cardinal nights, 'https://alcohol.stanford.edu/cardinal-nights'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/cardinal-nights-comedic-guest'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/cardinal-nights-goosebumps-screening'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/cardinal-nights-midnight-premiere-hunger-games-mockingjay-part-1'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/cardinal-nights-mini-grant-casino-night'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/cardinal-nights-pick-your-own-movie-night-0'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/meet-staff-cardinal-nights'),
 (cardinal nights,
  'https://alcohol.stanford.edu/events/outdoor-ed-and-cardinal-nights-presents-dive-movie'),
 (cardinal nights, 'https://events.stanford.edu/events/618/61821/'),
 (arrillaga gym hours,
  'http://ortho.stanford.edu/lacob/sports-medicine-

In [8]:
# convert the title/headers to ids instead of words
def convert_to_ids(list_of_words):
    """Return the word ids of `list_of_words`, looked up in the global word_map.

    word_map.get_id adds words it has not seen yet, so this may grow the vocabulary.
    """
    word_ids = []
    for word in list_of_words:
        word_ids.append(word_map.get_id(word))
    return word_ids

# Replace title/header words by word ids and attach the stored body content to each document.
# NOTE(review): this rewrites document.title/headers in place, so running the cell a
# second time would pass ids (not words) to word_map.get_id - restart the kernel before re-running.
for query, url_dict in query_dict.items():
    for url, document in url_dict.items():
        if document.title:
            document.title = convert_to_ids(document.title)

        if document.headers:
            document.headers = list(map(convert_to_ids, document.headers))

        doc_id = doc_dict[url]
        if doc_id not in docId_to_content:
            # no content stored for this document
            document.body_content = None
            continue
        # stored entries are pairs; only the second element is kept as the body
        _, document.body_content = docId_to_content[doc_id]

In [9]:
# print a few example relevances
# Print the relevance dicts of the first five queries (fewer if query_dict is smaller).
num_dicts = 0
for query in list(query_dict)[:5]:
    print(get_relevance_dict(query_dict, query))
    print()
    num_dicts += 1

{'http://events.stanford.edu/2014/February/18/': 0.0, 'http://events.stanford.edu/2014/February/6/': 0.0, 'http://events.stanford.edu/2014/March/13/': 0.0, 'http://events.stanford.edu/2014/March/3/': 0.0, 'http://med.stanford.edu/content/dam/sm/hip/documents/FreeFitnessWeek.pdf': 0.0, 'http://web.stanford.edu/group/masters/pool.html': 1.0, 'https://alumni.stanford.edu/get/page/perks/PoolAndGyms': 1.5, 'https://cardinalrec.stanford.edu/facilities/aoerc/': 2.0, 'https://explorecourses.stanford.edu/search?view=catalog&filter-coursestatus-Active=on&page=0&catalog=&q=PE+128%3A+Swimming%3A+Beginning+I&collapse=': 0.5, 'https://glo.stanford.edu/events/stanford-rec-open-house': 0.5}

{'http://alumni.stanford.edu/get/page/membership/benefits/creditcard': 2.0, 'http://alumni.stanford.edu/get/page/membership/benefits/libraries': 2.0, 'http://alumni.stanford.edu/get/page/membership/students': 2.0, 'https://alumni-esc.stanford.edu/get/page/membership/faq-general': 2.0, 'https://alumni.stanford.edu/

# Embeddings


In [10]:
'''having generated query content and doc content, lets try ranking by cosine similarity between query and document 
embedding
'''
# (The bare string above is a no-op expression used as a block comment.)
#iteration 1: ignore words not there in the embedding

#lookup function
# glove_lookup is used further down as a mapping word -> vector (membership test,
# indexing and .values()); presumably utils.glove2dict builds it from the
# 100-dimensional GloVe 6B file - TODO confirm against utils.
GLOVE_HOME = os.path.join('data', 'glove.6B')
glove_lookup = utils.glove2dict(os.path.join(GLOVE_HOME, 'glove.6B.100d.txt'))

In [11]:
def make_glove_embedding(words):
    """Embed a sequence of words as the mean of their vectors in glove_lookup.

    Words missing from glove_lookup are ignored. When no word is known, a zero
    vector with the dimensionality of the lookup's vectors is returned.
    Every item of `words` must be a str (checked with an assertion).
    """
    for word in words:
        assert isinstance(word, str), (type(word), word)

    known_vectors = [glove_lookup[w] for w in words if w in glove_lookup]
    allvecs = np.array(known_vectors)

    if len(allvecs) > 0:
        return np.mean(allvecs, axis=0)

    # nothing to average: fall back to an all-zero vector of the right size
    dim = len(next(iter(glove_lookup.values())))
    return np.zeros(dim)

def query_and_document_embeddings(query, doc_content_type='title'):
    """Embed a query and every document stored for it in the global query_dict.

    Args:
        query: key of query_dict; it is also passed directly to
            make_glove_embedding, so iterating it must yield word strings.
        doc_content_type: which document text is embedded -
            'title'        title words;
            'headers'      words of all headers;
            'th'           title words twice, followed by all header words;
            'body_hits'    each key of document.body_hits repeated once per hit;
            'body_content' words of the body content.
            Every mode other than 'title' falls back to the title words when
            the requested field is empty/missing.

    Returns:
        (query_embedding, document_embeddings) where document_embeddings is a
        list of (url, embedding) pairs in query_dict[query] order.

    Raises:
        ValueError: for an unknown doc_content_type (raised while processing
            the first document).
    """
    def ids_to_words(word_ids):
        # word ids -> strings through the global word_map
        return [word_map.get_string(word_id) for word_id in word_ids]

    def header_words(document):
        # all header words, flattened in header order
        return [word for header in document.headers for word in ids_to_words(header)]

    def get_doc_words(document):
        title_words = ids_to_words(document.title)

        if doc_content_type == 'title':
            return title_words

        if doc_content_type == 'headers':
            return header_words(document) if document.headers else title_words

        if doc_content_type == 'th':  # use title and header combo
            if not document.headers:
                return title_words
            return 2 * title_words + header_words(document)

        if doc_content_type == 'body_hits':
            if not document.body_hits:
                return title_words
            return [
                query_word
                for query_word, hits in document.body_hits.items()
                for _ in range(len(hits))
            ]

        if doc_content_type == 'body_content':
            if not document.body_content:
                return title_words
            return ids_to_words(document.body_content)

        raise ValueError("Invalid doc content: {}".format(doc_content_type))

    query_embedding = make_glove_embedding(query)

    words_per_url = [
        (url, get_doc_words(document)) for url, document in query_dict[query].items()
    ]
    document_embeddings = [
        (url, make_glove_embedding(doc_words)) for url, doc_words in words_per_url
    ]
    return query_embedding, document_embeddings

def cosine_similarity(doc_embedding, query_embedding):
    """Dot product of the L2-normalised document embedding with the query embedding.

    Only the document is normalised, not the query: within one query the query
    norm is a common factor, so it does not change the ranking of documents.
    A zero document vector is left as is, which yields a score of 0.

    The input arrays are not modified (the normalised document vector is a new array).

    Args:
        doc_embedding: 1-D array-like document vector.
        query_embedding: 1-D array-like query vector of the same length.

    Returns:
        The similarity score as a scalar.
    """
    doc_vector = np.asarray(doc_embedding)
    norm = np.sqrt(np.sum(np.square(doc_vector)))
    if norm > 0:
        # out-of-place division: `/=` would overwrite the caller's array and
        # fails outright for integer arrays
        doc_vector = doc_vector / norm
    return np.dot(doc_vector, query_embedding)


In [12]:
def DCG(ranked_docs, relevance_dict):
    '''Discounted cumulative gain of a ranking, using ground-truth relevance labels.

    The gain of the document at rank r (1-based) is its relevance divided by
    log2(r); rank 1 is left undiscounted (divided by 1).
    Input --
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output --
        DCG [float]'''
    docs = list(ranked_docs)

    # position 0 is rank 1 (no discount); position p > 0 is rank p + 1
    discount_factor = [1 if position == 0 else log2(position + 1) for position in range(len(docs))]
    relevance_scores = [relevance_dict[doc] for doc in docs]

    discounted_gains = np.array(relevance_scores) / np.array(discount_factor)
    return sum(discounted_gains)
            
def NDCG(ranked_docs, relevance_dict):
    '''Normalised DCG of a ranking: DCG(ranked_docs) / DCG(ideal ordering).

    The ideal ordering sorts every doc in relevance_dict by decreasing relevance.
    Input --
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output --
        NDCG [float]. 1.0 is returned when the ideal DCG is 0 (all docs rated 0,
        or no docs at all), since every ordering is then equally good.'''
    # sorting the keys (instead of unpacking zip(*items)) also works for an empty dict
    ideal_ordering = sorted(relevance_dict, key=relevance_dict.get, reverse=True)
    DCG_oracle = DCG(ideal_ordering, relevance_dict)
    DCG_case = DCG(ranked_docs, relevance_dict)

    # avoid 0/0: with an ideal DCG of 0 the ranking cannot be improved upon
    if DCG_oracle == 0:
        return 1.0

    return DCG_case/DCG_oracle

#sanity check: compare DCG/NDCG against hand-computed values
ranked_docs = [0,1,2,3,4]
relevance_dict = {0: 2, 1: 3, 2: 0, 3: 0, 4: 1}
DCG_score = 2 + 3/log2(2) + 1/log2(5)
Ideal_score = 3 + 2/log2(2) + 1/log2(3)
NDCG_score = DCG_score/Ideal_score
# math.isclose instead of ==: the check should not depend on the exact order
# in which the floating point terms are summed
assert math.isclose(DCG(ranked_docs, relevance_dict), DCG_score), "DCG error"
assert math.isclose(NDCG(ranked_docs, relevance_dict), NDCG_score), "NDCG error"

In [13]:
def average_precision_helper(relevance_list):
    """Mean of precision@k over every rank k of a binary relevance list.

    Args:
        relevance_list: sequence of 0/1 flags in rank order (first = top rank).

    Returns:
        sum over k of (number of relevant items in the top k) / k, divided by
        the list length; 0.0 for an empty list.

    NOTE(review): textbook average precision only sums precision@k at the
    ranks of relevant items and divides by the number of relevant items; this
    helper averages over all ranks - confirm that this is intended.
    """
    if len(relevance_list) == 0:
        return 0.0  # nothing retrieved: avoid dividing by zero

    precision = 0.0
    relevant_so_far = 0.0
    for rank, val in enumerate(relevance_list, start=1):
        relevant_so_far += val
        precision += relevant_so_far / rank
    return precision / len(relevance_list)

def average_precision(ranked_doc_list, query_relevance_dict):
    """Binarise the relevance of the ranked docs (relevant iff >= 1.0) and score the ranking.

    The 0/1 list, in rank order, is handed to average_precision_helper.
    """
    binary_relevance = []
    for doc in ranked_doc_list:
        binary_relevance.append(int(query_relevance_dict[doc] >= 1.0))
    return average_precision_helper(binary_relevance)

# Sanity check: average_precision_helper against a hand-computed value
# (precision@k summed over all 7 ranks, divided by 7). On failure the
# assertion message shows the value that was actually computed.
expected = (1 + 1 + 2/3 + 2/4 + 3/5 + 3/6 + 4/7)/ 7
actual = average_precision_helper([1, 1, 0, 0, 1, 0, 1])
assert expected == actual, actual


In [14]:
def run_metrics(content_type='title', scoring_func=cosine_similarity):
    """Rank the documents of every query in the global query_dict and average the metrics.

    For each query the documents are embedded from `content_type`, scored
    against the query embedding with `scoring_func(doc_emb, query_emb)` and
    sorted by decreasing score.

    Args:
        content_type: document text to embed (see query_and_document_embeddings).
        scoring_func: callable returning a score for (doc_embedding, query_embedding).

    Returns:
        {'NDCG': mean NDCG over all queries, 'MAP': mean average precision over all queries}
    """
    ndcg_per_query = []
    ap_per_query = []

    for query in query_dict:
        query_relevance_dict = get_relevance_dict(query_dict, query)
        query_embedding, document_embeddings = query_and_document_embeddings(query, content_type)

        scored_urls = [(url, scoring_func(doc_emb, query_embedding)) for url, doc_emb in document_embeddings]
        scored_urls.sort(key=lambda pair: pair[1], reverse=True)  # best score first
        ranked_urls, _ = zip(*scored_urls)
        ranked_urls = list(ranked_urls)

        ndcg_per_query.append(NDCG(ranked_urls, query_relevance_dict))
        ap_per_query.append(average_precision(ranked_urls, query_relevance_dict))

    n_queries = len(query_dict)
    return {'NDCG': sum(ndcg_per_query) / n_queries,
            'MAP': sum(ap_per_query) / n_queries}


In [15]:
def random_similarity(doc_embedding, query_embedding):
    """Baseline scorer: ignore both embeddings and return a uniform random score in [0, 1)."""
    score = np.random.uniform(low=0.0, high=1.0)
    return score

In [16]:
# Baseline: rank each query's documents by a random score (the embeddings are ignored).
run_metrics(scoring_func=random_similarity)

{'NDCG': 0.80535469840465, 'MAP': 0.7341629020255825}

In [17]:
# Embed documents from their header words (title words when a document has no headers).
run_metrics(content_type='headers')

{'NDCG': 0.7998466704959435, 'MAP': 0.7260548526042633}

In [18]:
# Embed documents from their title words only.
run_metrics(content_type='title')

{'NDCG': 0.8058287842182414, 'MAP': 0.7282281456941095}

In [19]:
# 'th': the title words are counted twice and the header words are appended.
run_metrics(content_type='th') # use combo of title and headers as in old code version

{'NDCG': 0.8033349213913299, 'MAP': 0.7292918295655904}

In [20]:
# Embed documents from the keys of document.body_hits, each repeated once per hit
# (title words when a document has no body hits).
run_metrics(content_type='body_hits')

{'NDCG': 0.8064515680022327, 'MAP': 0.7278642805097923}

In [21]:
# Embed documents from their body content (title words when no body content is available).
run_metrics(content_type='body_content')

{'NDCG': 0.842848133595507, 'MAP': 0.7634415501773494}

### Analyzing data

In [50]:
'''
BSBI.dict gives the:
posting_dict - 
        [dict mapping termID to (start position in the index file, 
                                number of postings in the list, 
                                length in bytes of posting list)] 
and termID - [list]
'''
# The pickle is expected to hold a 2-tuple, unpacked into (posting_dict, termsID).
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
# NOTE(review): neither name is used again in the cells shown here - confirm whether this cell is still needed.
with open(os.path.join(data_dir, "BSBI.dict"), "rb") as f:
    posting_dict, termsID = pkl.load(f)

In [51]:
# Parse pa3.signal.train into plain id-based lookup tables.
# NOTE(review): this rebinds `query_dict` and `doc_dict`. Earlier cells bind
# `query_dict` to the result of load_train_data and `doc_dict` to the contents
# of doc_dict.p, so cells that expect those objects must not run after this
# one without reloading them.
print("Printing signal and relevance train files...")

print("\nSignal File")
query_dict = {} #maps queries to query id (Assuming distinct queries)
doc_dict = {}  #maps docs to doc id
query_doc_dict = {} #maps query ids to list of doc ids

query_id_list = [] #list
doc_id_list = []

doc_list_for_query = []

query_repetitions = {} #dict mapping queries to number of repetitions
query_counter = 0
doc_repetitions = 0 # never updated in this cell

with open(os.path.join(data_dir, "pa3.signal.train"), "r", encoding='utf8') as f:
    last_query_id = 0 
    for line in f:
        line_list = line.split()
        if line_list[0] == 'query:':
            query_counter += 1
            # a new query starts: store the docs collected for the previous one
            if query_counter >= 2:
                query_doc_dict[last_query_id] = doc_list_for_query
                
            query = " ".join(line_list[1:])
            
            # query text seen before: count the repetition and make the key
            # unique by appending " _<repetition number>"
            if query_dict.get(query, None) != None:
                query_repetitions[query] = query_repetitions.get(query, 0) + 1
                query = query + " _" + str(query_repetitions[query])
            
            # the query id is the position of the (possibly suffixed) query in query_id_list
            query_id_list.append(query)
            query_dict[query] = len(query_id_list) - 1
            
            #query_doc_dict[last_query_id] = doc_list_for_query
            last_query_id = len(query_id_list) - 1 #update the last query whenever a new query starts
            doc_list_for_query = [] #reinitialize the doc list whenever a new query starts
        
        elif line_list[0] == 'url:':
            assert len(line_list) == 2, "line_list for url has more than 2 entries. Please check!"
            doc = line_list[1]
            # assign the next free id to urls seen for the first time
            if doc_dict.get(doc, None) == None:
                doc_id_list.append(doc)
                doc_id = len(doc_id_list) -1
                doc_dict[doc] = doc_id
            else:
                doc_id = doc_dict[doc]
                
            doc_list_for_query.append(doc_id)
        
        else:
            # every other kind of line is ignored here
            continue
    
    # store the docs of the final query (no later 'query:' line triggers it)
    query_doc_dict[last_query_id] = doc_list_for_query
            
# number of 'query:' lines, distinct query keys and queries with a doc list
print(query_counter)
print(len(query_dict))
print(len(query_doc_dict))
    
print("\n" + "--"*10 + "\n")

# Parse pa3.rel.train into query_doc_relevance: query id -> {doc id: relevance}.
# NOTE(review): import placed mid-notebook; consider moving it to the imports cell.
import copy
# query_repetitions is decremented below, so keep an untouched copy of the counts
query_total_repetitions = copy.deepcopy(query_repetitions)
query_doc_relevance = {}
doc_relevance_dict = {}
query_counter = 0
print("\nRelevance File")
with open(os.path.join(data_dir, "pa3.rel.train"), "r", encoding='utf8') as f:
    for line in f:
        line_list = line.split()
        if line_list[0] == 'query:':
            query_counter += 1
            query = " ".join(line_list[1:])
            # Rebuild the " _<k>" suffix given to repeated queries while parsing
            # the signal file: the first occurrence yields k == 0 (no suffix),
            # each later occurrence yields the next k.
            if query_repetitions.get(query, None) != None:
                query_repetition_number = query_total_repetitions[query] - query_repetitions[query]
                query_repetitions[query] -= 1
                if query_repetition_number != 0:
                    query = query + " _" + str(query_repetition_number)
            
            # a new query starts: store the relevances collected for the previous one
            if query_counter >= 2:
                assert query_doc_relevance.get(last_query_id, None) == None, "Query already existed in the relevance dict"
                query_doc_relevance[last_query_id] = doc_relevance_dict
            
            # raises KeyError if the (suffixed) query was not created from the signal file
            last_query_id = query_dict[query]
            doc_relevance_dict = {}
            
        elif line_list[0] == "url:":
            doc = line_list[1]
            # raises KeyError for urls that did not occur in the signal file
            docID = doc_dict[doc]
            
            # the relevance is the last whitespace-separated token of the line
            doc_relevance_dict[docID] = float(line_list[-1].strip())
    
    # store the relevances of the final query
    query_doc_relevance[last_query_id] = doc_relevance_dict

print(query_counter)
print(len(query_doc_relevance))

Printing signal and relevance train files...

Signal File
749
749
749

--------------------


Relevance File
749
749


In [54]:
# Persist the url -> doc id mapping; `with` guarantees the file is flushed and closed.
with open("doc_dict.p", "wb") as pickle_file:
    pkl.dump(doc_dict, pickle_file)

In [55]:
# Look up everything that is stored for the first query, as a spot check.
sample_query = query_id_list[0]
sample_query_id = query_dict[sample_query]
retrieved_docIds = query_doc_dict[sample_query_id]
retrieved_relevance = query_doc_relevance[sample_query_id]
retrieved_docs = [doc_id_list[docId] for docId in retrieved_docIds]

print("Samples", end = "\n\n" + "."*10 + "\n")

# every entry is followed by a blank line and a dashed rule
dash_rule = "\n\n" + "-"*10 + "\n"
sample_report = [
    ("Sample Query: ", sample_query),
    ("Sample Query Id: ", sample_query_id),
    ("Query Repetitions", query_total_repetitions),
    ("Manipulated query repetitions: ", query_repetitions),
    ("Retrieved doc Ids: ", retrieved_docIds),
    ("Retrieved doc relevance: ", retrieved_relevance),
    ("Retrieved docs: ", retrieved_docs),
]
for label, value in sample_report:
    print(label, value, end = dash_rule)

Samples

..........
Sample Query:  stanford aoerc pool hours

----------
Sample Query Id:  0

----------
Query Repetitions {'facility hours': 1, 'stanford dining hours': 1, 'stanford bookstore': 1, 'aoerc pool hours': 1, 'arrillaga gym hours': 1, 'lakeside dining hours': 1, 'marguerite schedule': 1, 'computer science': 1, 'commencement schedule': 1, 'parking permit': 1, 'bookstore': 1, 'green library hours': 1, 'memorial church': 1, 'dining hall hours': 1, 'cardinal nights': 1, 'swimming pool hours': 1, 'bechtel international center': 1, 'stanford visitor parking': 1}

----------
Manipulated query repetitions:  {'facility hours': -1, 'stanford dining hours': -1, 'stanford bookstore': -1, 'aoerc pool hours': -1, 'arrillaga gym hours': -1, 'lakeside dining hours': -1, 'marguerite schedule': -1, 'computer science': -1, 'commencement schedule': -1, 'parking permit': -1, 'bookstore': -1, 'green library hours': -1, 'memorial church': -1, 'dining hall hours': -1, 'cardinal nights': -1, 'swimm

## Data Distribution Analysis

In [56]:
#number of unique queries
n_queries = len(query_dict)
# every counted repetition is one query entry that is not unique
n_unique_queries = n_queries - sum(query_total_repetitions.values())


print("Query Analysis\n")
print("Total queries: ", n_queries)
print("Number of unique queries: ", n_unique_queries)
print("% of unique queries: ", (n_unique_queries/n_queries)*100)
print("\n" + "-"*10 + "\n")

print("Retrieved doc Analysis\n")
# NOTE(review): doc_list_for_query is the doc list of the last query parsed from
# the signal file; using its length for all queries assumes that every query
# retrieves the same number of docs - confirm.
docs_per_query = len(doc_list_for_query)
total_docs = docs_per_query * n_queries
print("Number of unique docs: ", len(doc_id_list))
print("Docs per query: ", docs_per_query)
print("% of unique docs: ", (len(doc_id_list)/total_docs)*100)
print("\n" + "-"*10 + "\n")

print("Label Analysis\n")
# collect the distinct relevance labels; unlike unpacking zip(*items) this also
# works for queries that have no labelled docs
label_set = set()
for relavance_dict in query_doc_relevance.values():
    label_set.update(relavance_dict.values())
print("Set of labels: ", label_set)

Query Analysis

Total queries:  749
Number of unique queries:  731
% of unique queries:  97.59679572763686

----------

Retrieved doc Analysis

Number of unique docs:  5256
Docs per query:  10
% of unique docs:  70.173564753004

----------

Label Analysis

Set of labels:  {0.0, 1.0, 2.0, 1.5, 0.5, 3.0, 2.5, 2.3, 1.7, 1.3, 2.7, 2.2, 2.8, 0.3, 0.7, 1.8}


### Metric - NDCG 

can also incorporate Precision, MAP, etc. after binary conversion with decay rates

## Random ordering accuracy

For every query, arrange the docs in random order and check the NDCG value

In [58]:
#randomly shuffles the docs
# A dedicated, seeded generator makes the baseline reproducible, and permuting a
# copy keeps the doc lists stored in query_doc_dict in their original order.
rng = np.random.RandomState(0)
ndcg_sum = 0.0
for queryID, doc_list in query_doc_dict.items():
    shuffled_docs = rng.permutation(doc_list).tolist()
    ndcg_sum += NDCG(shuffled_docs, query_doc_relevance[queryID])
print(ndcg_sum)
print(len(query_id_list))
print(ndcg_sum/len(query_id_list))

594.1691182892581
749
0.7932832019883286


## Comparing query and document embeddings

Document embeddings are obtained from the given title or header information without any weight normalization. Loop through the files and collect the document words from the title and headers (one idea is to give more weight to the title than to the headers). Look up each word in the GloVe embedding. If a query word does not exist in the embedding, choose and fix a random combination of words for it (maybe a combination of the words "university" and "around", because the corpus relates to Stanford); ignore missing words otherwise. Finally, compute the cosine similarity, rank the documents, and compute the NDCG score.

#### Other ideas:
1. Treat upper case and start of line word different than end of line word, etc
2. Can add word correction, etc
3. How scraping documents and adding more words to each document affects performance
4. Modeling item-item dependency by seq2slate architecture
5. Creating embedding for words in the query but not in the embedding vocab as a distinct combination for 
6. Training word2vec on this and then trying different ideas with the center and context matrices obtained
7. DESM type ideas with the embeddings of words in the document weighted by the similarity of words (W_out * q_emb)
8. Treating re-ranking task as an NLI task where document entails query
9. regressing score for each query-doc pair using nlp inspired regression by predicting score through RNN for instance
10. experimenting with listwise and pairwise approaches

In [119]:
# now expand the entries in query dict to include the body text (encoded as integers,
# lookup using vocab_id_list/vocab_dict)

lacking_body = []  # doc ids without stored content (may contain repeats)
for query, query_entry in query_dict.items():
    for url, url_entry in query_entry.items():
        doc_id = doc_dict[url]
        if doc_id not in docId_to_content:
            url_entry.body_content = None
            lacking_body.append(doc_id)
            continue
        # stored entries are pairs; only the second element is the body
        _, body_content = docId_to_content[doc_id]
        url_entry.body_content = body_content


In [120]:
# Number of distinct doc ids for which no body content is stored.
len(set(lacking_body))

723

In [104]:
#checking vocab etc
total_words = 0
for repeated_word in vocab_frequency:
    total_words += vocab_frequency[repeated_word]

print("Total number of words", total_words)
print("Number of unique words: ", len(vocab_id_list))
print("% of uniqueness: {:.2f} %".format((len(vocab_id_list)/total_words)*100))

Total number of words 5390418
Number of unique words:  92671
% of uniqueness: 1.72 %


In [105]:
#sanity check for a vocab_word
# Draw a random id below 100 and check that vocab_dict maps the word stored at
# that position of vocab_id_list back to the same id.
sample_number = np.random.choice(100, 1)[0]
vocab_word = vocab_id_list[sample_number]
vocabID = vocab_dict[vocab_word]
print(sample_number)
print(vocab_word)
assert vocabID == sample_number,  "Vocab dictionary error"
print("\n" + "--"*10 + "\n")

#sanity check on the query content and doc content
# NOTE(review): `queryID_to_content` is not defined in any cell shown here; on a
# fresh kernel this line would raise NameError unless another cell defines it.
# NOTE(review): this prints a complete docId_to_content entry, which can be very long.
print("Sample of query content and doc content")
print("Query content:\n", queryID_to_content[0])
print("\nDoc sample:\n", docId_to_content[0])

98
5

--------------------

Sample of query content and doc content
Query content:
 [0, 1890, 2, 866]

Doc sample:
 ([0, 207, 614, 615], [179, 51, 616, 180, 0, 58, 0, 207, 617, 386, 185, 379, 386, 237, 618, 619, 620, 338, 456, 572, 621, 384, 622, 623, 181, 624, 625, 626, 627, 264, 331, 628, 338, 629, 579, 514, 630, 631, 4, 632, 122, 633, 634, 635, 636, 637, 435, 638, 639, 640, 123, 641, 642, 58, 207, 338, 643, 272, 138, 644, 645, 646, 647, 648, 138, 649, 650, 138, 651, 652, 377, 653, 654, 655, 656, 657, 658, 659, 660, 365, 661, 662, 663, 338, 353, 386, 185, 379, 386, 237, 614, 615, 664, 665, 122, 633, 24, 1321, 288, 123, 777, 1322, 58, 76, 1323, 614, 669, 76, 24, 1324, 1325, 12, 632, 1326, 1327, 259, 1328, 185, 24, 90, 386, 614, 669, 1329, 1330, 324, 12, 632, 1326, 1327, 259, 1328, 185, 24, 90, 386, 614, 669, 1331, 31, 24, 1332, 507, 12, 632, 199, 51, 1333, 24, 1334, 614, 669, 711, 733, 734, 514, 507, 579, 31, 24, 667, 1335, 45, 192, 1336, 614, 669, 1004, 370, 1005, 671, 222, 672, 785,

In [39]:
# Persist the doc id -> url list; `with` guarantees the file is flushed and closed.
with open("doc_id_list.p", "wb") as pickle_file:
    pkl.dump(doc_id_list, pickle_file)

# Using collected body from web urls


## Code to collect body from web urls and build doc content

In [None]:
%%time
doc_id_content = {} #dict maps from doc id to the contents in the doc. The content is saved as a list of vocab_ids

vocab_dict = {} #mapping from vocab term to id 
vocab_id_list = [] #list where id maps to the vocab term (0 indexed)
vocab_frequency = {} #number of times each vocab term appears in the vocab of documents

failed_docs = [] #(url, error) pairs for the pages that could not be fetched or parsed


for doc in doc_id_list:
    try:
        with urlopen(doc) as page:
            soup = BeautifulSoup(page, 'html.parser')
            body = soup.find('body')
            title = soup.find('title')
            body_content = []
            title_content = []
            content_list = [] #list containing all word IDs. To be attached to a doc

            if body is not None:
                body_content = body.text.split()

            if title is not None:
                title_content = title.text.split()

            # body words first, then title words; split() already removed all
            # surrounding whitespace from each word
            all_content = body_content + title_content
            for word in all_content:
                # keep purely alphanumeric tokens only
                if not word.isalnum():
                    continue

                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                # unseen word: its id is its position in vocab_id_list
                if word not in vocab_dict:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                content_list.append(word_id)
            doc_id_content[doc_dict[doc]] = content_list
    except Exception as err:
        # Best effort: a page that cannot be fetched/parsed gets empty content.
        # `Exception` (instead of a bare except) lets KeyboardInterrupt stop the
        # crawl, and the failure is recorded instead of being silently dropped.
        failed_docs.append((doc, repr(err)))
        doc_id_content[doc_dict[doc]] = []

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
