In [1]:
#importing dependencies
import numpy as np
import os
import pickle as pkl
import math
from math import log2
import utils
from collections import Counter, defaultdict
import copy


In [2]:
base_dir_name = os.getcwd()
data_dir_name = "project_data"
data_dir = os.path.join(base_dir_name, data_dir_name)

### Analyzing data

In [3]:
print("Printing signal and relevance train files...")

query_dict = {} #maps queries to query id (Assuming distinct queries)
doc_dict = {}  #maps docs to doc id
query_doc_dict = {} #maps query ids to list of doc ids

query_id_list = [] #list
doc_id_list = []

doc_list_for_query = []

query_repetitions = {} #dict mapping queries to number of repetitions
query_counter = 0
doc_repetitions = 0

with open(os.path.join(data_dir, "pa3.signal.train"), "r", encoding='utf8') as f:
    last_query_id = 0 
    for line in f:
        line_list = line.split()
        if line_list[0] == 'query:':
            query_counter += 1
            if query_counter >= 2:
                query_doc_dict[last_query_id] = doc_list_for_query
                
            query = " ".join(line_list[1:])
            
            if query_dict.get(query, None) != None:
                query_repetitions[query] = query_repetitions.get(query, 0) + 1
                query = query + "_" + str(query_repetitions[query])
            
            query_id_list.append(query)
            query_dict[query] = len(query_id_list) - 1
            
            last_query_id = len(query_id_list) - 1 #update the last query whenever a new query starts
            doc_list_for_query = [] #reinitialize the doc list whenever a new query starts
        
        elif line_list[0] == 'url:':
            assert len(line_list) == 2, "line_list for url has more than 2 entries. Please check!"
            doc = line_list[1]
            if doc_dict.get(doc, None) == None:
                doc_id_list.append(doc)
                doc_id = len(doc_id_list) -1
                doc_dict[doc] = doc_id
            else:
                doc_id = doc_dict[doc]
            if doc_id not in doc_list_for_query: 
                doc_list_for_query.append(doc_id)
        
        else:
            continue
    
    query_doc_dict[last_query_id] = doc_list_for_query
            
print(query_counter)
print(len(query_dict))
print(len(query_doc_dict))
    
print("\n" + "--"*10 + "\n")

query_total_repetitions = copy.deepcopy(query_repetitions)
query_doc_relevance = {}
doc_relevance_dict = {}
query_counter = 0
print("\nRelevance File")
with open(os.path.join(data_dir, "pa3.rel.train"), "r", encoding='utf8') as f:
    for line in f:
        line_list = line.split()
        if line_list[0] == 'query:':
            query_counter += 1
            query = " ".join(line_list[1:])
            if query_repetitions.get(query, None) != None:
                query_repetition_number = query_total_repetitions[query] - query_repetitions[query]
                query_repetitions[query] -= 1
                if query_repetition_number != 0:
                    query = query + "_" + str(query_repetition_number)
            
            if query_counter >= 2:
                assert query_doc_relevance.get(last_query_id, None) == None, "Query already existed in the relevance dict"
                query_doc_relevance[last_query_id] = doc_relevance_dict
            
            last_query_id = query_dict[query]
            doc_relevance_dict = {}
            
        elif line_list[0] == "url:":
            doc = line_list[1]
            docID = doc_dict[doc]
            
            doc_relevance_dict[docID] = float(line_list[-1].strip())
    
    query_doc_relevance[last_query_id] = doc_relevance_dict

print(query_counter)
print(len(query_doc_relevance))

Printing signal and relevance train files...

Signal File
749
749
749

--------------------


Relevance File
749
749


## Data Distribution Analysis

In [4]:
#number of unique queries
n_queries = len(query_dict)
n_unique_queries = len(query_dict)
for repeated_query in query_total_repetitions:
    n_unique_queries -= query_total_repetitions[repeated_query]

total_docs = len(doc_list_for_query) * n_queries
print(total_docs)

7490


## Comparing query and document embeddings

Document embeddings are obtained from the given title or header information without any weight normalization. Loop through the files and collect doc words by looking in the title and header (one idea can be to give more weight to title than to header). Lookup for each word in the glove embedding. Choose and fix a random combination of word if a word in query does not exist (maybe a combination from the words university and around because the corpus relates to stanford). Ignore otherwise. Finally, find cosine similarity and rank and compute NDCG score. 

#### Other ideas:
1. Treat upper case and start of line word different than end of line word, etc
2. Can add word correction, etc
3. How scraping documents and adding more words to document effect performance
4. Modeling item-item dependency by seq2slate architecture
5. Creating embedding for words in the query but not in the embedding vocab as a distinct combination for 
6. Training word2vec on this and then trying different ideas with the center and context matrices obtained
7. DESM type ideas with the embeddings of words in the document weighted by the similarity of words (W_out * q_emb)
8. Treating re-ranking task as an NLI task where document entails query
9. regressing score for each query-doc pair using nlp inspired regression by predicting score through RNN for instance
10. experimenting with listwise and pairwise approaches

In [5]:
#building word corpus for each document
doc_counter = 0
last_doc_content = defaultdict(list)
docId_to_content = {} #dict maps from doc id to the contents in the doc. The content is saved as a list of vocab_ids
queryID_to_content = {}

#vocab includes words from both, query and docs
vocab_dict = {} #mapping from vocab term to id 
vocab_id_list = [] #list where id maps to the vocab term (0 indexed)
vocab_frequency = {} #number of times each vocab term appears in the vocab of documents (included query words)

with open(os.path.join(data_dir, "pa3.signal.train"), "r", encoding='utf8') as f:
    for line in f:
        line_list = line.split()
        
        if line_list[0] == 'query:':
            query = " ".join(line_list[1:])
            queryID = query_dict[query]
            word_id_list = []
            for word in line_list[1:]:
                word = word.strip().lower()
                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                if vocab_dict.get(word, None) == None:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                word_id_list.append(word_id)
            
            queryID_to_content[queryID] = word_id_list 
            
        elif line_list[0] == 'url:':
            doc_counter += 1
                
            doc = line_list[1]
            docID = doc_dict[doc]

            if doc_counter >= 2:
                docId_to_content[last_docID] = last_doc_content
                
            last_doc_content = defaultdict(list)
            last_docID = docID
                
        elif line_list[0] == 'title:':
            for word in line_list[1:]:
                word = word.strip().lower()
                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                if vocab_dict.get(word, None) == None:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                last_doc_content['title'].append(word_id)
                
        elif line_list[0] == 'header:':
            for word in line_list[1:]:
                word = word.strip().lower()
                vocab_frequency[word] = vocab_frequency.get(word, 0) + 1

                if vocab_dict.get(word, None) == None:
                    vocab_id_list.append(word)
                    vocab_dict[word] = len(vocab_id_list) - 1

                word_id = vocab_dict[word]
                last_doc_content['header'].append(word_id)
        
        else:
            continue
    
    docId_to_content[last_docID] = last_doc_content

## Abstracting away data structures


In [6]:
"""
data structures we need:
    query_dict = {} #maps queries to query id (Assuming distinct queries)
    query_id_list = [] #list of queries

    doc_dict = {}  #maps urls to doc id
    doc_id_list = [] # list of urls

    query_doc_dict = {} #maps query ids to list of doc ids
    
    docId_to_content = {} #dict maps from doc id to the contents in the doc. The content is saved as a list of vocab_ids
    queryID_to_content = {}

    #vocab includes words from both, query and docs
    vocab_dict = {} #mapping from vocab term to id 
    vocab_id_list = [] #list where id maps to the vocab term (0 indexed)
    vocab_frequency = {} 
"""   

def get_query_string(query):
    if type(query) == int:
        query = query_id_list[query]
    assert type(query) == str, query
    return query

def get_query_id(query):
    if type(query) == str:
        query = query_dict[query]
    assert type(query) == int, query
    return query
 
def get_doc_url(doc):
    if type(doc) == int:
        doc = doc_id_list[doc]
    assert type(doc) == str, doc
    return doc

def get_doc_id(doc):
    if type(doc) == str:
        doc = doc_dict[doc]
    assert type(doc) == int, doc
    return doc

def ids_to_words(content):
    # convert ids to words
    return [vocab_id_list[i] if type(i) == int else i for i in content]
    #return [vocab_id_list[i] for i in content]
    
def words_to_ids(words):
    return [vocab_dict[w] if type(w) == str else w for w in words]
    #return [vocab_dict[w] for w in words]

def register_words(words):
    for word in words:
        if word not in vocab_dict:
            vocab_dict[word] = len(vocab_id_list)
            vocab_id_list.append(word)

def get_query_words(query):
    # return a list of words corresponding to the query (either string query or query_id)
    # could use queryID_to_content, but this has issues with duplicates!
    query = get_query_string(query)
    return query.split('_')[0].split(' ')
    
def get_doc_words(document, content_type):
    # given either url or doc_id
    document = get_doc_id(document)
    # all documents have a title at least
    title_content = docId_to_content[document]['title']
    if content_type == 'title':
        content = title_content
    
    elif content_type == 'header':
        if docId_to_content[document]['header']:
            content = docId_to_content[document]['header']
        else:
            content = title_content
    
    elif content_type == '2th':
        content = 2*title_content
        content += docId_to_content[document]['header']
    
    elif content_type == 'body':
        #if docId_to_content[document]['body']:
        #    content = docId_to_content[document]['body']
        #else:
        #    content = title_content
         content = docId_to_content[document]['body']
        
    else:
        raise ValueError("Invalid content type: {}".format(content_type))
    return ids_to_words(content)

def get_all_doc_words(query, content_type):
    # return tuples of (url, content) corresponding 
    query = get_query_id(query)
    documents = [get_doc_url(doc) for doc in query_doc_dict[query]]
    return [(doc, get_doc_words(doc, content_type)) for doc in documents]

def get_relevance_dict(query):
    query = get_query_id(query)
    return {get_doc_url(k): v for k, v in query_doc_relevance[query].items()}

def query_iter():
    for query in query_dict:
        yield query

def url_iter(query):
    for doc in query_doc_dict[get_query_id(query)]:
        yield get_doc_url(doc)
        
# Note: from here on out, you NEVER have to touch a datastructure, just use the functions above

## Inject the body content (by url) into docId_to_content when available

In [7]:
if not os.path.exists("web_url_to_body.p"):
    print("Making dictionary")
    web_vocab_id_list = pkl.load(open('vocab_id_list.p', 'rb'))
    web_vocab_dict = pkl.load(open('vocab_dict.p', 'rb'))
    web_docId_to_content = pkl.load(open('doc_id_content.p', 'rb'))
    web_doc_id_list = pkl.load(open("doc_id_list.p", "rb"))
    web_doc_dict = pkl.load(open("doc_dict.p", "rb"))
    web_url_to_words = {}

    for url, doc in web_doc_dict.items():
        body_content = []
        if doc in web_docId_to_content:
            _, body_content = web_docId_to_content[doc]

        body_content = [web_vocab_id_list[w] for w in body_content]
        web_url_to_words[url] = body_content
    
    pkl.dump(web_url_to_words, open("web_url_to_body.p", "wb"))

else:
    print("Loading dictionary")
    web_url_to_words = pkl.load(open("web_url_to_body.p", "rb"))

Loading dictionary


In [8]:
present, missing = 0, 0
for query in query_iter():
    for url in url_iter(query):
        if url in web_url_to_words:
            content = web_url_to_words[url]
            doc_id = get_doc_id(url)
            register_words(content)
            content = words_to_ids(content)
            docId_to_content[doc_id]['body'] = content
            present += 1
        else:
            missing += 1
present, missing

(7201, 0)

In [9]:
# some sanity checks
for query in query_iter():
    print(get_query_words(query))
    print()
    print(get_all_doc_words(query, 'title'))
    print()
    print(get_relevance_dict(query))
    print()
    for url in url_iter(query):
        print(url)
    break


['stanford', 'aoerc', 'pool', 'hours']

[('http://events.stanford.edu/2014/February/18/', ['skip', 'to', 'main', 'content', 'stanford', 'university', 'stanford', 'event', 'calendar', 'search', 'for', 'events', 'search', 'button', 'menu', 'featured', 'today', 'by', 'date', 'current', 'month', 'su', 'mo', 'tu', 'we', 'th', 'fr', 'sa', 'javascript', 'must', 'be', 'enabled', 'by', 'type', 'class', 'conference', 'symposium', 'exhibition', 'information', 'session', 'lecture', 'reading', 'talk', 'meeting', 'performance', 'oral', 'recreation', 'sport', 'religious', 'screening', 'seminar', 'social', 'tour', 'university', 'event', 'by', 'subject', 'all', 'arts', 'dance', 'drama', 'theater', 'film', 'literary', 'arts', 'music', 'visual', 'arts', 'careers', 'diversity', 'education', 'engineering', 'environment', 'sustainability', 'health', 'wellness', 'humanities', 'international', 'public', 'service', 'science', 'women', 'gender', 'by', 'organization', 'search', 'for', 'events', 'search', 'button

## Set up GloVe embedding

In [10]:
'''having generated query content and doc content, lets try ranking by cosine similarity between query and document 
embedding
'''
#iteration 1: ignore words not there in the embedding

#lookup function
glove_dim = 50
GLOVE_HOME = os.path.join('data', 'glove.6B')
glove_lookup = utils.glove2dict(os.path.join(GLOVE_HOME, 'glove.6B.{}d.txt'.format(glove_dim)))

In [11]:
def make_glove_embedding(words, combine_func=None):
    for word in words:
        assert isinstance(word, str), (type(word), word)

    all_vecs = np.array([glove_lookup[w] for w in words if w in glove_lookup]) 

    if len(all_vecs) == 0:
        feats = np.zeros(glove_dim)    
    else:       
        if combine_func:
            print(combine_func)
            feats = combine_func(all_vecs)
        else: # take the elemnetwise mean by default
            feats = np.mean(all_vecs, axis=0) 
    return feats

def query_and_document_embeddings(query, query_combine_func=None, doc_combine_func=None, content_type='title'):
    """
    query: Either query text, or id
    query_combine_func: How to combine query GloVe embeddings (default is mean)
    doc_combine_func: How to combine document GloVe embeddings (default is mean)
    doc_content_type: How to select document content. TODO: make this do something
    """
    query_words = get_query_words(query)
    query_embedding = make_glove_embedding(query_words, query_combine_func)
        
    document_embeddings = [(url, make_glove_embedding(words, doc_combine_func)) 
                               for url, words in get_all_doc_words(query, content_type)]
    return query_embedding, document_embeddings

### Metric - NDCG, MAP

can also incorporate Precision, MAP, etc. after binary conversion with decay rates

In [12]:
def DCG(ranked_docs, relevance_dict):
    '''This function takes an ordered/ranked document list with the ground truth relevance labels from the 
    relevance_dict and returns a DCG score for the retrieval/ranking.
    Input -- 
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output -- 
        DCG [float]'''
    return np.sum([(relevance_dict[doc]) / (math.log2(i+2)) \
                  for i, doc in enumerate(ranked_docs)])

def DCG_alt(ranked_docs, relevance_dict):
    return np.sum([(2**relevance_dict[doc] - 1) / (math.log2(i+2)) \
                   for i, doc in enumerate(ranked_docs)])

def NDCG(ranked_docs, relevance_dict, use_alt=False):
    '''This function takes an ordered/ranked document list with the ground truth relevance labels from the 
    relevance_dict and returns a NDCG score for the ranking. 
    Input -- 
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output -- 
        NDCG [float]'''
    assert len(ranked_docs) == len(relevance_dict)
    ideal_ordering, _ = zip(*sorted(relevance_dict.items(), key = lambda x: (-x[1])))
    ideal_ordering = list(ideal_ordering)
    
    dcg_func = DCG_alt if use_alt else DCG
    DCG_oracle = dcg_func(ideal_ordering, relevance_dict)
    DCG_case = dcg_func(ranked_docs, relevance_dict)
    assert DCG_oracle >= DCG_case
    
    #return 0 if DCG_ideal is 0 (happens when all the retrieved docs are rated 0)
    if DCG_oracle == 0:
        return 1.0

    return DCG_case/DCG_oracle

#sanity check
ranked_docs = [0,1,2,3,4]
relevance_dict = {0: 2, 1: 3, 2: 0, 3: 0, 4: 1}
DCG_score = 2 + 3/log2(3) + 1/log2(6)
Ideal_score = 3 + 2/log2(3) + 1/log2(4)
NDCG_score = DCG_score/Ideal_score
assert DCG(ranked_docs, relevance_dict) == DCG_score, "DCG error"
assert NDCG(ranked_docs, relevance_dict) == NDCG_score, "NDCG error"

In [13]:
def average_precision_helper(relevance_list):
    precision = 0.0
    relevant_so_far = 0.0
    for i, val in enumerate(relevance_list):
        relevant_so_far += val
        precision += relevant_so_far / (i+1)
    return precision / len(relevance_list)

def average_precision(ranked_doc_list, query_relevance_dict):
    relevance_list = [1 if query_relevance_dict[doc] >= 1.0 else 0 for doc in ranked_doc_list]
    return average_precision_helper(relevance_list)

expected = (1 + 1 + 2/3 + 2/4 + 3/5 + 3/6 + 4/7)/ 7
actual = average_precision_helper([1, 1, 0, 0, 1, 0, 1])
assert expected == actual, actual


In [14]:
def random_similarity(doc_embedding, query_embedding):
    return np.random.uniform()

def cosine_similarity(doc_embedding, query_embedding):
    norm = np.linalg.norm(doc_embedding)
    if norm > 0:
        doc_embedding /= norm
    return np.dot(doc_embedding, query_embedding)


In [15]:
def run_metrics(scoring_func=cosine_similarity, content_type='title'):
    ndcg_sum = 0.0
    alt_ndcg_sum = 0.0
    precision_sum = 0.0
    for query in query_iter():
        query_relevance_dict = get_relevance_dict(query)
        query_embedding, document_embeddings = query_and_document_embeddings(query, content_type=content_type)
        
        scores = [(url, scoring_func(doc_emb, query_embedding)) for url, doc_emb in document_embeddings]
        scores = sorted(scores, key = lambda x: x[1], reverse=True)
        ranked_doc_list, _ = zip(*scores)
        ranked_doc_list = list(ranked_doc_list)

        ndcg_sum += NDCG(ranked_doc_list, query_relevance_dict)
        alt_ndcg_sum += NDCG(ranked_doc_list, query_relevance_dict, use_alt=True)
        precision_sum += average_precision(ranked_doc_list, query_relevance_dict)
        
    ndcg_sum /= len(query_dict)
    alt_ndcg_sum  /= len(query_dict)
    precision_sum /= len(query_dict)
    return {'NDCG': ndcg_sum,
            'Alt_NDCG': alt_ndcg_sum,
            'MAP': precision_sum,}

## Random ordering accuracy

For every query, arrange the docs in random order and check the NDCG value

In [16]:
n = 10
sum_metric = {}
for _ in range(n):
    for m, v in run_metrics(scoring_func=random_similarity).items():
        sum_metric[m] = sum_metric.get(m, 0) + v / n
sum_metric

{'NDCG': 0.8105896211335066,
 'Alt_NDCG': 0.7436074341489898,
 'MAP': 0.7258456069417832}

In [17]:
run_metrics(scoring_func=cosine_similarity, content_type='title')

{'NDCG': 0.8647397159664021,
 'Alt_NDCG': 0.815829927971894,
 'MAP': 0.776858360277802}

In [18]:
run_metrics(scoring_func=cosine_similarity, content_type='header')

{'NDCG': 0.8552298809465102,
 'Alt_NDCG': 0.8019164999021395,
 'MAP': 0.7652650542118645}

In [19]:
run_metrics(scoring_func=cosine_similarity, content_type='2th')

{'NDCG': 0.8667825897043977,
 'Alt_NDCG': 0.8177130743614507,
 'MAP': 0.7774704992143731}

In [20]:
run_metrics(scoring_func=cosine_similarity, content_type='body')

{'NDCG': 0.84877473860534,
 'Alt_NDCG': 0.7917931581210282,
 'MAP': 0.7621328885730236}