In [1]:
#importing dependencies
import numpy as np
import os
import pickle as pkl
from urllib.request import urlopen
from bs4 import BeautifulSoup
import math
from math import log2
import utils

In [2]:
base_dir_name = os.getcwd()
data_dir_name = "project_data"
data_dir = os.path.join(base_dir_name, data_dir_name)

for item in os.listdir(data_dir):
    if not item.startswith('.') and os.path.isfile(os.path.join(data_dir, item)):
        print(item)

pa3.rel.train
BSBI.dict
docs.dict
pa3.rel.dev
pa3.signal.dev
terms.dict
pa3.signal.train


### Analyzing data

In [3]:
print("Printing signal and relevance train files...")

print("\nSignal File")
query_dict = {} #maps queries to query id (Assuming distinct queries)
doc_dict = {}  #maps docs to doc id
query_doc_dict = {} #maps query ids to list of doc ids

query_id_list = [] #list
doc_id_list = []

doc_list_for_query = []

query_repetitions = {} #dict mapping queries to number of repetitions
query_counter = 0
doc_repetitions = 0

with open(os.path.join(data_dir, "pa3.signal.train"), "r", encoding='utf8') as f:
    last_query_id = 0 
    for line in f:
        line_list = line.split()
        if line_list[0] == 'query:':
            query_counter += 1
            if query_counter >= 2:
                query_doc_dict[last_query_id] = doc_list_for_query
                
            query = " ".join(line_list[1:])
            
            if query_dict.get(query, None) != None:
                query_repetitions[query] = query_repetitions.get(query, 0) + 1
                query = query + " _" + str(query_repetitions[query])
            
            query_id_list.append(query)
            query_dict[query] = len(query_id_list) - 1
            
            #query_doc_dict[last_query_id] = doc_list_for_query
            last_query_id = len(query_id_list) - 1 #update the last query whenever a new query starts
            doc_list_for_query = [] #reinitialize the doc list whenever a new query starts
        
        elif line_list[0] == 'url:':
            assert len(line_list) == 2, "line_list for url has more than 2 entries. Please check!"
            doc = line_list[1]
            if doc_dict.get(doc, None) == None:
                doc_id_list.append(doc)
                doc_id = len(doc_id_list) -1
                doc_dict[doc] = doc_id
            else:
                doc_id = doc_dict[doc]
                
            doc_list_for_query.append(doc_id)
        
        else:
            continue
    
    query_doc_dict[last_query_id] = doc_list_for_query
            
print(query_counter)
print(len(query_dict))
print(len(query_doc_dict))
    
print("\n" + "--"*10 + "\n")

import copy
query_total_repetitions = copy.deepcopy(query_repetitions)
query_doc_relevance = {}
doc_relevance_dict = {}
query_counter = 0
print("\nRelevance File")
with open(os.path.join(data_dir, "pa3.rel.train"), "r", encoding='utf8') as f:
    for line in f:
        line_list = line.split()
        if line_list[0] == 'query:':
            query_counter += 1
            query = " ".join(line_list[1:])
            if query_repetitions.get(query, None) != None:
                query_repetition_number = query_total_repetitions[query] - query_repetitions[query]
                query_repetitions[query] -= 1
                if query_repetition_number != 0:
                    query = query + " _" + str(query_repetition_number)
            
            if query_counter >= 2:
                assert query_doc_relevance.get(last_query_id, None) == None, "Query already existed in the relevance dict"
                query_doc_relevance[last_query_id] = doc_relevance_dict
            
            last_query_id = query_dict[query]
            doc_relevance_dict = {}
            
        elif line_list[0] == "url:":
            doc = line_list[1]
            docID = doc_dict[doc]
            
            doc_relevance_dict[docID] = float(line_list[-1].strip())
    
    query_doc_relevance[last_query_id] = doc_relevance_dict

print(query_counter)
print(len(query_doc_relevance))

Printing signal and relevance train files...

Signal File
749
749
749

--------------------


Relevance File
749
749


In [4]:
sample_query = query_id_list[0]
sample_query_id = query_dict[sample_query]
retrieved_docIds = query_doc_dict[sample_query_id]
retrieved_relevance = query_doc_relevance[sample_query_id]
retrieved_docs = [doc_id_list[docId] for docId in retrieved_docIds]

print("Samples", end = "\n\n" + "."*10 + "\n")
print("Sample Query: ", sample_query, end = "\n\n" + "-"*10 + "\n")
print("Sample Query Id: ", sample_query_id, end = "\n\n" + "-"*10 + "\n")
print("Query Repetitions", query_total_repetitions, end = "\n\n" + "-"*10 + "\n")
print("Manipulated query repetitions: ", query_repetitions, end = "\n\n" + "-"*10 + "\n")
print("Retrieved doc Ids: ", retrieved_docIds, end = "\n\n" + "-"*10 + "\n")
print("Retrieved doc relevance: ", retrieved_relevance, end = "\n\n" + "-"*10 + "\n")
print("Retrieved docs: ", retrieved_docs, end = "\n\n" + "-"*10 + "\n")

Samples

..........
Sample Query:  stanford aoerc pool hours

----------
Sample Query Id:  0

----------
Query Repetitions {'facility hours': 1, 'stanford dining hours': 1, 'stanford bookstore': 1, 'aoerc pool hours': 1, 'arrillaga gym hours': 1, 'lakeside dining hours': 1, 'marguerite schedule': 1, 'computer science': 1, 'commencement schedule': 1, 'parking permit': 1, 'bookstore': 1, 'green library hours': 1, 'memorial church': 1, 'dining hall hours': 1, 'cardinal nights': 1, 'swimming pool hours': 1, 'bechtel international center': 1, 'stanford visitor parking': 1}

----------
Manipulated query repetitions:  {'facility hours': -1, 'stanford dining hours': -1, 'stanford bookstore': -1, 'aoerc pool hours': -1, 'arrillaga gym hours': -1, 'lakeside dining hours': -1, 'marguerite schedule': -1, 'computer science': -1, 'commencement schedule': -1, 'parking permit': -1, 'bookstore': -1, 'green library hours': -1, 'memorial church': -1, 'dining hall hours': -1, 'cardinal nights': -1, 'swimm

## Data Distribution Analysis

In [5]:
#number of unique queries
n_queries = len(query_dict)
n_unique_queries = len(query_dict)
for repeated_query in query_total_repetitions:
    n_unique_queries -= query_total_repetitions[repeated_query]


print("Query Analysis\n")
print("Total queries: ", n_queries)
print("Number of unique queries: ", n_unique_queries)
print("% of unique queries: ", (n_unique_queries/n_queries)*100)
print("\n" + "-"*10 + "\n")

print("Retrieved doc Analysis\n")
total_docs = len(doc_list_for_query) * n_queries
print("Number of unique docs: ", len(doc_id_list))
print("Docs per query: ", len(doc_list_for_query))
print("% of unique docs: ", (len(doc_id_list)/(len(doc_list_for_query) * n_queries))*100)
print("\n" + "-"*10 + "\n")

print("Label Analysis\n")
label_set = []
for _, relavance_dict in query_doc_relevance.items():
    _, labels = zip(*tuple(relavance_dict.items()))
    labels = list(labels)
    label_set += labels
label_set = set(label_set)
print("Set of labels: ", label_set)

Query Analysis

Total queries:  749
Number of unique queries:  731
% of unique queries:  97.59679572763686

----------

Retrieved doc Analysis

Number of unique docs:  5256
Docs per query:  10
% of unique docs:  70.173564753004

----------

Label Analysis

Set of labels:  {0.0, 1.0, 2.0, 1.5, 0.5, 3.0, 2.5, 2.3, 1.7, 1.3, 2.7, 2.2, 2.8, 0.3, 0.7, 1.8}


### Metric - NDCG 

can also incorporate Precision, MAP, etc. after binary conversion with decay rates

In [6]:
def DCG(ranked_docs, relevance_dict):
    '''This function takes an ordered/ranked document list with the ground truth relevance labels from the 
    relevance_dict and returns a DCG score for the retrieval/ranking.
    Input -- 
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output -- 
        DCG [float]'''
    
    discount_factor = []
    relevance_scores = []
    for i, doc in enumerate(ranked_docs):
        rank = i+1
        if rank == 1:
            discount_factor.append(1)
        else:
            discount_factor.append(log2(rank))
        relevance_scores.append(relevance_dict[doc])
    
    return sum(np.array(relevance_scores)/np.array(discount_factor))
            
def NDCG(ranked_docs, relevance_dict):
    '''This function takes an ordered/ranked document list with the ground truth relevance labels from the 
    relevance_dict and returns a NDCG score for the ranking. 
    Input -- 
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output -- 
        NDCG [float]'''
    ideal_ordering, _ = zip(*sorted(relevance_dict.items(), key = lambda x: (-x[1])))
    ideal_ordering = list(ideal_ordering)
    DCG_oracle = DCG(ideal_ordering, relevance_dict)
    DCG_case = DCG(ranked_docs, relevance_dict)
    
    #return 0 if DCG_ideal is 0 (happens when all the retrieved docs are rated 0)
    if DCG_oracle == 0:
        return 0.0

    return DCG_case/DCG_oracle

#sanity check
ranked_docs = [0,1,2,3,4]
relevance_dict = {0: 2, 1: 3, 2: 0, 3: 0, 4: 1}
DCG_score = 2 + 3/log2(2) + 1/log2(5)
Ideal_score = 3 + 2/log2(2) + 1/log2(3)
NDCG_score = DCG_score/Ideal_score
assert DCG(ranked_docs, relevance_dict) == DCG_score, "DCG error"
assert NDCG(ranked_docs, relevance_dict) == NDCG_score, "NDCG error"

## Random ordering accuracy

For every query, arrange the docs in random order and check the NDCG value

In [7]:
#randomly shuffles the docs
ndcg_sum = 0.0
for queryID, doc_list in query_doc_dict.items():
    np.random.shuffle(doc_list)
    ndcg_sum += NDCG(doc_list, query_doc_relevance[queryID])
print(ndcg_sum)
print(len(query_id_list))
print(ndcg_sum/len(query_id_list))

600.3326726098369
749
0.8015122464750827


## Comparing query and document embeddings

Document embeddings are obtained from the given title or header information without any weight normalization. Loop through the files and collect doc words by looking in the title and header (one idea can be to give more weight to title than to header). Lookup for each word in the glove embedding. Choose and fix a random combination of word if a word in query does not exist (maybe a combination from the words university and around because the corpus relates to stanford). Ignore otherwise. Finally, find cosine similarity and rank and compute NDCG score. 

#### Other ideas:
1. Treat upper case and start of line word different than end of line word, etc
2. Can add word correction, etc
3. How scraping documents and adding more words to document effect performance
4. Modeling item-item dependency by seq2slate architecture
5. Creating embedding for words in the query but not in the embedding vocab as a distinct combination for 
6. Training word2vec on this and then trying different ideas with the center and context matrices obtained
7. DESM type ideas with the embeddings of words in the document weighted by the similarity of words (W_out * q_emb)
8. Treating re-ranking task as an NLI task where document entails query
9. regressing score for each query-doc pair using nlp inspired regression by predicting score through RNN for instance
10. experimenting with listwise and pairwise approaches

In [8]:
#building word corpus for each document
doc_counter = 0
last_doc_content = []
docId_to_content = {} #dict maps from doc id to the contents in the doc. The content is saved as a list of vocab_ids
queryID_to_content = {}

#vocab includes words from both, query and docs
vocab_dict = {} #mapping from vocab term to id 
vocab_id_list = [] #list where id maps to the vocab term (0 indexed)
vocab_frequency = {} #number of times each vocab term appears in the vocab of documents (included query words)

with open(os.path.join(data_dir, "pa3.signal.train"), "r", encoding='utf8') as f:
    for line in f:
        line_list = line.split()
        
        if line_list[0] == 'query:':
            query = " ".join(line_list[1:])
            queryID = query_dict[query]
            word_id_list = []
            for word in line_list[1:]:
                word = word.strip().lower()
                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                if vocab_dict.get(word, None) == None:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                word_id_list.append(word_id)
            
            queryID_to_content[queryID] = word_id_list 
            
        elif line_list[0] == 'url:':
            doc_counter += 1
            
            #printing processing
            if doc_counter % 50 == 0:
                print("Processing doc number - ", doc_counter)
                print("% complete - {:.2f} %".format((doc_counter/total_docs)*100))
                print("\n" + "--"*10 + "\n")
                
            doc = line_list[1]
            docID = doc_dict[doc]

            if doc_counter >= 2:
                docId_to_content[last_docID] = last_doc_content
                
            last_doc_content = []
            last_docID = docID
                
        elif line_list[0] == 'title:':
            for word in line_list[1:]:
                word = word.strip().lower()
                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                if vocab_dict.get(word, None) == None:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                last_doc_content += [word_id, word_id] #adding each word twice for title
                
        elif line_list[0] == 'header:':
            for word in line_list[1:]:
                word = word.strip().lower()
                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                if vocab_dict.get(word, None) == None:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                last_doc_content += [word_id] #adding each word twice for title
        
        else:
            continue
    
    docId_to_content[last_docID] = last_doc_content

Processing doc number -  50
% complete - 0.67 %

--------------------

Processing doc number -  100
% complete - 1.34 %

--------------------

Processing doc number -  150
% complete - 2.00 %

--------------------

Processing doc number -  200
% complete - 2.67 %

--------------------

Processing doc number -  250
% complete - 3.34 %

--------------------

Processing doc number -  300
% complete - 4.01 %

--------------------

Processing doc number -  350
% complete - 4.67 %

--------------------

Processing doc number -  400
% complete - 5.34 %

--------------------

Processing doc number -  450
% complete - 6.01 %

--------------------

Processing doc number -  500
% complete - 6.68 %

--------------------

Processing doc number -  550
% complete - 7.34 %

--------------------

Processing doc number -  600
% complete - 8.01 %

--------------------

Processing doc number -  650
% complete - 8.68 %

--------------------

Processing doc number -  700
% complete - 9.35 %

---------------

In [9]:
#checking vocab etc
total_words = 0
for repeated_word in vocab_frequency:
    total_words += vocab_frequency[repeated_word]

print("Total number of words", total_words)
print("Number of unique words: ", len(vocab_id_list))
print("% of uniqueness: {:.2f} %".format((len(vocab_id_list)/total_words)*100))

Total number of words 127082
Number of unique words:  9328
% of uniqueness: 7.34 %


In [10]:
#sanity check for a vocab_word
sample_number = np.random.choice(100, 1)[0]
vocab_word = vocab_id_list[sample_number]
vocabID = vocab_dict[vocab_word]
print(sample_number)
print(vocab_word)
assert vocabID == sample_number,  "Vocab dictionary error"
print("\n" + "--"*10 + "\n")

#sanity check on the query content and doc content
print("Sample of query content and doc content")
print("Query content:\n", queryID_to_content[0])
print("\nDoc sample:\n", docId_to_content[0])

61
tim

--------------------

Sample of query content and doc content
Query content:
 [0, 1, 2, 3]

Doc sample:
 [4, 4, 5, 5, 0, 0, 6, 6, 7, 7, 8, 8, 9, 9, 0, 10, 11, 12, 13, 14, 5, 0, 15, 16, 17, 0, 18, 0, 19, 20, 21, 22, 23, 24, 25, 26, 3, 27, 28, 29, 30, 31, 32, 0, 33, 34, 35, 36, 37, 38]


In [11]:
'''having generated query content and doc content, lets try ranking by cosine similarity between query and document 
embedding
'''
#iteration 1: ignore words not there in the embedding

#lookup function
GLOVE_HOME = os.path.join('data', 'glove.6B')
glove_lookup = utils.glove2dict(os.path.join(GLOVE_HOME, 'glove.6B.50d.txt'))

In [12]:
#function to find the document embedding
def doc_embedding(docID):
    '''#Ignores words not in the vocab
    Input: docID
    Output: doc embedding'''
    
    allvecs = np.array([glove_lookup[vocab_id_list[wordID]] for wordID in docId_to_content[docID] \
                        if vocab_id_list[wordID] in glove_lookup]) 
    
    if len(allvecs) == 0:
        dim = len(next(iter(glove_lookup.values())))
        feats = np.zeros(dim)    
    else:       
        feats = np.mean(allvecs, axis=0) 
    
    return feats

#function to find the query embedding
def query_embedding(queryID):
    '''#Ignores words not in the vocab
    Input: queryID
    Output: query embedding'''
    
    allvecs = np.array([glove_lookup[vocab_id_list[wordID]] for wordID in queryID_to_content[queryID] \
                        if vocab_id_list[wordID] in glove_lookup]) 

    if len(allvecs) == 0:
        dim = len(next(iter(glove_lookup.values())))
        feats = np.zeros(dim)    
    else:       
        feats = np.mean(allvecs, axis=0) 
    
    return feats

def cosine_similarity(doc_embedding, query_embedding):
    return np.dot(doc_embedding, query_embedding)

#sample print
#print(doc_embedding(0))
#print(query_embedding(0))


#iterate over each query and list of documents and check results
ndcg_sum = 0.0
for queryID, doc_list in query_doc_dict.items():
    #doc_list is list of doc_ids for the query given by query_id
    query_relevance_dict = query_doc_relevance[queryID]
    
    query = query_id_list[queryID]
    if query.split()[-1][0] == '_':
        query = query.split()[:-1]
        query = " ".join(query)
        queryID = query_dict[query]
    
    query_emb = query_embedding(queryID)
    scores = [(docID, cosine_similarity(doc_embedding(docID), query_emb)) for docID in doc_list]
    scores = sorted(scores, key = lambda x: -x[1])
    ranked_doc_list, _ = zip(*scores)
    ranked_doc_list = list(ranked_doc_list)
    ndcg_sum += NDCG(ranked_doc_list, query_relevance_dict)

print(ndcg_sum)
print(len(query_id_list))
print(ndcg_sum/len(query_id_list))

630.7173038033445
749
0.8420791773075361
