In [82]:
#importing dependencies
import numpy as np
import os
import pickle as pkl
import math
from math import log2
import utils
from collections import Counter, defaultdict
import copy
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

In [2]:
# Resolve the data directory relative to the current working directory, so the
# notebook must be started from the folder that contains "project_data".
base_dir_name = os.getcwd()
data_dir_name = "project_data"
data_dir = os.path.join(base_dir_name, data_dir_name)

### Analyzing data

In [3]:
def load_data(signal_filename, relevance_filename, vocab_dict, vocab_id_list, vocab_frequency):
    """Parse a signal file and its relevance file into the notebook's lookup tables.

    Both files are read from the module-level ``data_dir``. Every line is split
    on whitespace and dispatched on its first token: ``query:``, ``url:``,
    ``title:`` and ``header:`` are recognised; blank lines and all other lines
    are skipped. In the relevance file the last token of a ``url:`` line is
    parsed as the float relevance of that url for the current query.

    A query text that occurs more than once in the signal file is stored under
    ``"<text>_<k>"`` for its k-th repetition (k = 1, 2, ...); the relevance file
    is expected to list the queries in the same order so the names line up.

    Args:
        signal_filename: name of the signal file inside ``data_dir``.
        relevance_filename: name of the relevance file inside ``data_dir``.
        vocab_dict: shared mapping word -> vocab id, updated in place.
        vocab_id_list: shared list vocab id -> word, updated in place.
        vocab_frequency: shared mapping word -> occurrence count over query,
            title and header words, updated in place.

    Returns:
        dict with the keys ``query_dict`` (query text -> query id),
        ``doc_dict`` (url -> doc id), ``query_doc_dict`` (query id -> list of
        doc ids), ``query_id_list`` (query id -> query text), ``doc_id_list``
        (doc id -> url), ``query_doc_relevance`` (query id -> {doc id:
        relevance}) and ``docId_to_content`` (doc id -> defaultdict(list) with
        the vocab ids of the ``'title'`` and ``'header'`` words).

    Raises:
        AssertionError: if a signal ``url:`` line does not have exactly two
            tokens, or a query id would receive two relevance dicts.
        KeyError: if the relevance file mentions a query or url that the
            signal file did not.
    """
    query_dict = {}           # query text (suffixed for repeats) -> query id
    doc_dict = {}             # url -> doc id
    query_doc_dict = {}       # query id -> doc ids in file order, without duplicates
    query_id_list = []        # query id -> query text
    doc_id_list = []          # doc id -> url
    query_doc_relevance = {}  # query id -> {doc id: relevance}
    docId_to_content = {}     # doc id -> defaultdict(list) of 'title'/'header' vocab ids
    results_dict = {
        'query_dict': query_dict,
        'doc_dict': doc_dict,
        'query_doc_dict': query_doc_dict,
        'query_id_list': query_id_list,
        'doc_id_list': doc_id_list,
        'query_doc_relevance': query_doc_relevance,
        'docId_to_content': docId_to_content,
    }

    def _register(words):
        """Lower-case each word, count it, add it to the vocab and return the vocab ids."""
        word_ids = []
        for word in words:
            word = word.strip().lower()
            vocab_frequency[word] = vocab_frequency.get(word, 0) + 1
            if word not in vocab_dict:
                vocab_id_list.append(word)
                vocab_dict[word] = len(vocab_id_list) - 1
            word_ids.append(vocab_dict[word])
        return word_ids

    signal_path = os.path.join(data_dir, signal_filename)
    relevance_path = os.path.join(data_dir, relevance_filename)

    # ---- Pass 1 (signal file): assign query/doc ids and the query -> docs map.
    query_repetitions = {}  # base query text -> number of repeats seen so far
    doc_list_for_query = []
    last_query_id = None    # None until the first 'query:' line is seen
    with open(signal_path, "r", encoding='utf8') as f:
        for line in f:
            line_list = line.split()
            if not line_list:
                continue  # blank line: nothing to dispatch on

            if line_list[0] == 'query:':
                # A new query closes the doc list of the previous one.
                if last_query_id is not None:
                    query_doc_dict[last_query_id] = doc_list_for_query

                query = " ".join(line_list[1:])
                if query in query_dict:
                    # Repeated text: store it under a numbered alias.
                    query_repetitions[query] = query_repetitions.get(query, 0) + 1
                    query = query + "_" + str(query_repetitions[query])

                query_id_list.append(query)
                last_query_id = len(query_id_list) - 1
                query_dict[query] = last_query_id
                doc_list_for_query = []

            elif line_list[0] == 'url:':
                assert len(line_list) == 2, "line_list for url has more than 2 entries. Please check!"
                doc = line_list[1]
                if doc not in doc_dict:
                    doc_id_list.append(doc)
                    doc_dict[doc] = len(doc_id_list) - 1
                doc_id = doc_dict[doc]
                if doc_id not in doc_list_for_query:
                    doc_list_for_query.append(doc_id)

    if last_query_id is not None:
        query_doc_dict[last_query_id] = doc_list_for_query

    # ---- Pass 2 (relevance file): relevance of every doc for every query.
    # query_repetitions is counted down while reading, so keep the totals to
    # recover which repetition (0 = first occurrence) the current line is.
    query_total_repetitions = dict(query_repetitions)
    doc_relevance_dict = {}
    last_query_id = None
    with open(relevance_path, "r", encoding='utf8') as f:
        for line in f:
            line_list = line.split()
            if not line_list:
                continue

            if line_list[0] == 'query:':
                query = " ".join(line_list[1:])
                if query in query_repetitions:
                    query_repetition_number = query_total_repetitions[query] - query_repetitions[query]
                    query_repetitions[query] -= 1
                    if query_repetition_number != 0:
                        query = query + "_" + str(query_repetition_number)

                if last_query_id is not None:
                    assert last_query_id not in query_doc_relevance, "Query already existed in the relevance dict"
                    query_doc_relevance[last_query_id] = doc_relevance_dict

                last_query_id = query_dict[query]
                doc_relevance_dict = {}

            elif line_list[0] == "url:":
                docID = doc_dict[line_list[1]]
                doc_relevance_dict[docID] = float(line_list[-1].strip())

    if last_query_id is not None:
        query_doc_relevance[last_query_id] = doc_relevance_dict

    # ---- Pass 3 (signal file again): vocabulary and per-document word ids.
    last_docID = None  # None until the first 'url:' line is seen
    last_doc_content = defaultdict(list)
    with open(signal_path, "r", encoding='utf8') as f:
        for line in f:
            line_list = line.split()
            if not line_list:
                continue
            tag = line_list[0]

            if tag == 'query:':
                # Query words only feed the vocabulary and its frequencies.
                _register(line_list[1:])

            elif tag == 'url:':
                # A new url closes the content of the previous document; if a
                # url occurs several times, its last occurrence wins.
                if last_docID is not None:
                    docId_to_content[last_docID] = last_doc_content
                last_doc_content = defaultdict(list)
                last_docID = doc_dict[line_list[1]]

            elif tag in ('title:', 'header:'):
                word_ids = _register(line_list[1:])
                if word_ids:  # do not create an empty field entry
                    last_doc_content[tag[:-1]].extend(word_ids)

    if last_docID is not None:
        docId_to_content[last_docID] = last_doc_content
    return results_dict

In [4]:
# The vocabulary is shared by the train and dev splits and covers both query
# and document words; load_data() fills these three containers in place.
vocab_dict = {}       # mapping from vocab term to id
vocab_id_list = []    # list where id maps to the vocab term (0 indexed)
vocab_frequency = {}  # number of times each vocab term appears (query words included)

train_dict = load_data("pa3.signal.train", "pa3.rel.train", vocab_dict, vocab_id_list, vocab_frequency)
dev_dict = load_data("pa3.signal.dev", "pa3.rel.dev", vocab_dict, vocab_id_list, vocab_frequency)

# Persist the dev url <-> id tables. The `with` blocks close the files, so the
# pickles are completely written before anything tries to read them back.
with open('dev_doc_id_list.p', 'wb') as pkl_file:
    pkl.dump(dev_dict['doc_id_list'], pkl_file)
with open('dev_doc_dict.p', 'wb') as pkl_file:
    pkl.dump(dev_dict['doc_dict'], pkl_file)


## Comparing query and document embeddings

Document embeddings are built from the given title or header text without any weight normalization. We loop through the files and collect each document's words from its title and header (one idea is to give more weight to the title than to the header), then look up each word in the GloVe embedding. If a query word does not exist in the embedding, choose and fix a random combination of word vectors for it (maybe a combination of the words "university" and "around", because the corpus relates to Stanford); ignore missing words otherwise. Finally, compute the cosine similarity, rank the documents, and compute the NDCG score.

#### Other ideas:
1. Treat upper case and start of line word different than end of line word, etc
2. Can add word correction, etc
3. How scraping documents and adding more words to each document affects performance
4. Modeling item-item dependency by seq2slate architecture
5. Creating an embedding for each query word that is missing from the embedding vocab, as a distinct (fixed) combination of existing word vectors
6. Training word2vec on this and then trying different ideas with the center and context matrices obtained
7. DESM type ideas with the embeddings of words in the document weighted by the similarity of words (W_out * q_emb)
8. Treating re-ranking task as an NLI task where document entails query
9. regressing score for each query-doc pair using nlp inspired regression by predicting score through RNN for instance
10. experimenting with listwise and pairwise approaches

## Abstracting away data structures


In [5]:
"""
data structures we need:
    query_dict = {} #maps queries to query id (Assuming distinct queries)
    query_id_list = [] #list of queries

    doc_dict = {}  #maps urls to doc id
    doc_id_list = [] # list of urls

    query_doc_dict = {} #maps query ids to list of doc ids
    
    docId_to_content = {} #dict maps from doc id to the contents in the doc. The content is saved as a list of vocab_ids

    #vocab includes words from both, query and docs
    vocab_dict = {} #mapping from vocab term to id 
    vocab_id_list = [] #list where id maps to the vocab term (0 indexed)
    vocab_frequency = {} 
"""   
# dataset_dict is an argument to most of these, 
# as we need to know if we are dealing with train set, dev set
def get_query_string(dataset_dict, query):
    """Return the query text for `query`, which is either a query id (int) or already the text."""
    text = dataset_dict['query_id_list'][query] if type(query) == int else query
    assert type(text) == str, text
    return text

def get_query_id(dataset_dict, query):
    """Return the query id for `query`, which is either the query text (str) or already the id."""
    query_id = dataset_dict['query_dict'][query] if type(query) == str else query
    assert type(query_id) == int, query_id
    return query_id
 
def get_doc_url(dataset_dict, doc):
    """Return the url for `doc`, which is either a doc id (int) or already the url."""
    url = dataset_dict['doc_id_list'][doc] if type(doc) == int else doc
    assert type(url) == str, url
    return url

def get_doc_id(dataset_dict, doc):
    """Return the doc id for `doc`, which is either the url (str) or already the id."""
    doc_id = dataset_dict['doc_dict'][doc] if type(doc) == str else doc
    assert type(doc_id) == int, doc_id
    return doc_id

# this should be common to everything across train/dev
def ids_to_words(content):
    """Map vocab ids to their words via the global `vocab_id_list`; non-int items pass through unchanged."""
    words = []
    for item in content:
        words.append(vocab_id_list[item] if type(item) == int else item)
    return words

# this should be common to everything across train/dev
def words_to_ids(words):
    """Map words to their vocab ids via the global `vocab_dict`; non-str items pass through unchanged."""
    ids = []
    for item in words:
        ids.append(vocab_dict[item] if type(item) == str else item)
    return ids

# this should be common to everything across train/dev
def register_words(words):
    """Add every word not yet in the global vocabulary, giving it the next free id."""
    for term in words:
        if term in vocab_dict:
            continue
        vocab_id_list.append(term)
        vocab_dict[term] = len(vocab_id_list) - 1

def get_query_words(dataset_dict, query):
    """Return the words of a query, given either its text or its query id.

    load_data() stores a repeated query under "<text>_<k>". Only that
    repetition suffix is removed here: the part after the last underscore must
    be all digits and the part before it must itself be a known query. Queries
    that genuinely contain an underscore are therefore returned unchanged,
    instead of being cut at their first underscore.

    Args:
        dataset_dict: dictionary returned by load_data().
        query: query text (str) or query id (int).

    Returns:
        list of str: the query text split on single spaces.
    """
    query = get_query_string(dataset_dict, query)
    base, separator, suffix = query.rpartition('_')
    if separator and suffix.isdigit() and base in dataset_dict['query_dict']:
        query = base
    return query.split(' ')
    
def get_doc_words(dataset_dict, document, content_type):
    """Return the words of one document (url or doc id) for the requested content type.

    content_type is one of:
        'title'  - the title words;
        'header' - the header words, or the title words when the header is empty;
        'body'   - the body words, or the title words when the body is empty;
        '2th'    - the title words twice, followed by the header words.
    Any other value raises ValueError.
    """
    doc_id = get_doc_id(dataset_dict, document)
    fields = dataset_dict['docId_to_content'][doc_id]
    title_ids = fields['title']

    if content_type == 'title':
        selected = title_ids
    elif content_type in ('header', 'body'):
        # An empty field falls back to the title.
        selected = fields[content_type] or title_ids
    elif content_type == '2th':
        # Concatenation builds a new list, so the stored title is untouched.
        selected = title_ids + title_ids + fields['header']
    else:
        raise ValueError("Invalid content type: {}".format(content_type))

    return ids_to_words(selected)

def get_all_doc_words(dataset_dict, query, content_type):
    """Return a list of (url, words) tuples, one per document retrieved for the query."""
    query_id = get_query_id(dataset_dict, query)
    doc_ids = dataset_dict['query_doc_dict'][query_id]
    urls = [get_doc_url(dataset_dict, doc_id) for doc_id in doc_ids]

    url_word_pairs = []
    for url in urls:
        url_word_pairs.append((url, get_doc_words(dataset_dict, url, content_type)))
    return url_word_pairs

def get_relevance_dict(dataset_dict, query):
    """Return {url: relevance} for the documents of one query (query text or query id)."""
    query_id = get_query_id(dataset_dict, query)
    relevance_by_doc_id = dataset_dict['query_doc_relevance'][query_id]

    relevance_by_url = {}
    for doc_id, relevance in relevance_by_doc_id.items():
        relevance_by_url[get_doc_url(dataset_dict, doc_id)] = relevance
    return relevance_by_url

def query_iter(dataset_dict):
    """Yield every query text of the dataset, in the insertion order of its 'query_dict'."""
    yield from dataset_dict['query_dict']

def url_iter(dataset_dict, query):
    """Yield the url of every document retrieved for the query (query text or query id)."""
    docs_by_query = dataset_dict['query_doc_dict']
    query_id = get_query_id(dataset_dict, query)
    yield from (get_doc_url(dataset_dict, doc_id) for doc_id in docs_by_query[query_id])
        
# Note: from here on out, you NEVER have to touch a datastructure, just use the functions above

## Inject the body content (by url) into docId_to_content when available

In [6]:
def make_url_to_body(s="train"):
    """Build, or load from its on-disk cache, the mapping url -> list of body words.

    The cache is "<s>_web_url_to_body.p" in the current working directory.
    When it is missing, the mapping is built from "<s>_vocab_id_list.p",
    "<s>_doc_id_content.p" and "<s>_doc_dict.p" and then written to the cache.
    Urls whose doc id has no stored content map to an empty list.

    Args:
        s: prefix of the pickle file names (the notebook uses 'train' and 'dev').

    Returns:
        dict mapping each url to the list of words of its body.

    NOTE(review): pickle.load can execute arbitrary code; only use pickles
    produced by this project.
    """
    def _load_pickle(path):
        """Load one pickle and close the file again."""
        with open(path, "rb") as pkl_file:
            return pkl.load(pkl_file)

    name = "{}_web_url_to_body.p".format(s)
    if os.path.exists(name):
        print("Loading dictionary")
        return _load_pickle(name)

    print("Making dictionary")
    web_vocab_id_list = _load_pickle('{}_vocab_id_list.p'.format(s))
    web_docId_to_content = _load_pickle('{}_doc_id_content.p'.format(s))
    web_doc_dict = _load_pickle("{}_doc_dict.p".format(s))

    web_url_to_words = {}
    for url, doc in web_doc_dict.items():
        body_content = []
        if doc in web_docId_to_content:
            # Each stored entry is a pair; its second element holds the body word ids.
            _, body_content = web_docId_to_content[doc]
        web_url_to_words[url] = [web_vocab_id_list[w] for w in body_content]

    with open(name, "wb") as pkl_file:
        pkl.dump(web_url_to_words, pkl_file)
    return web_url_to_words

# Build (or load from the on-disk cache) the url -> body words table of each split.
train_web_url_to_words = make_url_to_body('train')
dev_web_url_to_words = make_url_to_body('dev')

Loading dictionary
Loading dictionary


In [7]:
def fill_in(dataset_dict, web_url_to_words):
    """Store the body words of every retrieved url under its document's 'body' field.

    Body words are first registered in the global vocabulary and then saved as
    vocab ids. A url listed under several queries is processed (and counted)
    once per query.

    Returns:
        (present, missing): how many (query, url) pairs had / did not have an
        entry in `web_url_to_words`.
    """
    found, absent = 0, 0
    for query in query_iter(dataset_dict):
        for url in url_iter(dataset_dict, query):
            if url not in web_url_to_words:
                absent += 1
                continue
            body_words = web_url_to_words[url]
            doc_id = get_doc_id(dataset_dict, url)
            register_words(body_words)
            dataset_dict['docId_to_content'][doc_id]['body'] = words_to_ids(body_words)
            found += 1
    return found, absent

# Attach the body text to both splits; each call prints its (present, missing) counts.
print(fill_in(train_dict, train_web_url_to_words))
print(fill_in(dev_dict, dev_web_url_to_words))


(7201, 0)
(1187, 0)


In [8]:
# Sanity check on the train split: for the first query only (note the `break`),
# print its words, the title words of its documents, its relevance labels and
# its urls.
for query in query_iter(train_dict):
    print(get_query_words(train_dict, query))
    print()
    print(get_all_doc_words(train_dict, query, 'title'))
    print()
    print(get_relevance_dict(train_dict, query))
    print()
    for url in url_iter(train_dict, query):
        print(url)
    break


['stanford', 'aoerc', 'pool', 'hours']

[('http://events.stanford.edu/2014/February/18/', ['events', 'at', 'stanford', 'tuesday', 'february', '18', '2014']), ('http://events.stanford.edu/2014/February/6/', ['events', 'at', 'stanford', 'thursday', 'february', '6', '2014']), ('http://events.stanford.edu/2014/March/13/', ['events', 'at', 'stanford', 'thursday', 'march', '13', '2014']), ('http://events.stanford.edu/2014/March/3/', ['events', 'at', 'stanford', 'monday', 'march', '3', '2014']), ('http://med.stanford.edu/content/dam/sm/hip/documents/FreeFitnessWeek.pdf', ['ffw', 'spring', '2017', 'schedule']), ('http://web.stanford.edu/group/masters/pool.html', ['stanford', 'masters', 'swimming', 'pool', '&', 'parking', 'information']), ('https://alumni.stanford.edu/get/page/perks/PoolAndGyms', ['pool', '&', 'gyms']), ('https://cardinalrec.stanford.edu/facilities/aoerc/', []), ('https://explorecourses.stanford.edu/search?view=catalog&filter-coursestatus-Active=on&page=0&catalog=&q=PE+128%3A+S

In [9]:
# Same sanity check on the dev split: only the first query is printed.
for query in query_iter(dev_dict):
    print(get_query_words(dev_dict, query))
    print()
    print(get_all_doc_words(dev_dict, query, 'title'))
    print()
    print(get_relevance_dict(dev_dict, query))
    print()
    for url in url_iter(dev_dict, query):
        print(url)
    break

['lost', 'axess', 'password', 'help']

[('https://accounts.stanford.edu/', ['stanford', 'accounts']), ('https://accounts.stanford.edu/resetpw', ['stanford', 'accounts', 'reset', 'password', 'step', '1', 'of', '4']), ('https://uit.stanford.edu/announcements/security', ['announcements', 'university', 'it']), ('https://uit.stanford.edu/service/webauth/twostep', ['two', 'step', 'authentication', 'university', 'it']), ('https://uit.stanford.edu/service/webauth/twostep/bypass_code', ['how', 'to', 'generate', 'a', 'bypass', 'code', 'for', 'a', 'lost', 'or', 'forgotten', 'two', 'step', 'authentication', 'device', 'university', 'it']), ('https://uit.stanford.edu/service/webauth/twostep/printed_list', ['how', 'to', 'use', 'a', 'printed', 'list', 'for', 'two', 'step', 'authentication', 'university', 'it']), ('https://uit.stanford.edu/service/webauth/twostep/push', ['how', 'to', 'authenticate', 'with', 'a', 'duo', 'push', 'notification', 'for', 'two', 'step', 'authentication', 'university', 'it'])

## Set up GloVe embedding

In [14]:
'''having generated query content and doc content, lets try ranking by cosine similarity between query and document 
embedding
'''
#iteration 1: ignore words not there in the embedding

#lookup function
# The GloVe file is expected at <cwd>/data/glove.6B/glove.6B.100d.txt.
# NOTE(review): glove_lookup is used below as a word -> vector mapping;
# confirm that this is what utils.glove2dict returns.
GLOVE_HOME = os.path.join(base_dir_name, os.path.join('data', 'glove.6B'))
glove_lookup = utils.glove2dict(os.path.join(GLOVE_HOME, 'glove.6B.100d.txt'))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ayushgupta/Desktop/CS224U/GitRepo/cs224u/data/glove.6B/glove.6B.100d.txt'

In [36]:
def make_glove_embedding(words, combine_func=None):
    """Combine the GloVe vectors of `words` into a single feature vector.

    Words missing from the global `glove_lookup` are ignored. When none of the
    words is known (or `words` is empty) a zero vector is returned; its length
    is taken from the vectors stored in `glove_lookup`, so no separate
    dimension constant is needed.

    Args:
        words: list of str.
        combine_func: optional callable that receives the 2-D array of known
            word vectors (one row per word); defaults to the element-wise mean.

    Returns:
        The combined vector (whatever `combine_func` returns, or the mean).

    Raises:
        AssertionError: if an element of `words` is not a str.
        ValueError: if a zero vector is needed but `glove_lookup` is empty.
    """
    for word in words:
        assert isinstance(word, str), (type(word), word)

    known_vecs = [glove_lookup[w] for w in words if w in glove_lookup]
    if not known_vecs:
        # assumes every stored vector has the same length - TODO confirm
        sample = next(iter(glove_lookup.values()), None)
        if sample is None:
            raise ValueError("glove_lookup is empty; cannot infer the embedding dimension")
        return np.zeros(len(sample))

    all_vecs = np.array(known_vecs)
    if combine_func:
        return combine_func(all_vecs)
    # take the elementwise mean by default
    return np.mean(all_vecs, axis=0)

def query_and_document_embeddings(dataset_dict, query, query_combine_func=None, doc_combine_func=None, content_type='title'):
    """
    Embed a query and all of its retrieved documents with GloVe.

    query: Either query text, or id
    query_combine_func: How to combine query GloVe embeddings (default is mean)
    doc_combine_func: How to combine document GloVe embeddings (default is mean)
    content_type: Which document content to embed; passed on to get_all_doc_words

    Returns (query_embedding, [(url, document_embedding), ...]).
    """
    query_embedding = make_glove_embedding(get_query_words(dataset_dict, query), query_combine_func)

    document_embeddings = []
    for url, words in get_all_doc_words(dataset_dict, query, content_type):
        document_embeddings.append((url, make_glove_embedding(words, doc_combine_func)))
    return query_embedding, document_embeddings

### Metric - NDCG, MAP

can also incorporate Precision, MAP, etc. after binary conversion with decay rates

In [12]:
def DCG(ranked_docs, relevance_dict):
    '''Discounted cumulative gain of a ranking, using the raw relevance as the gain.
    Input --
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
    Output --
        DCG [float]: sum over 0-indexed positions i of relevance / log2(i + 2)'''
    discounted_gains = []
    for position, doc in enumerate(ranked_docs):
        discounted_gains.append(relevance_dict[doc] / math.log2(position + 2))
    return np.sum(discounted_gains)

def DCG_alt(ranked_docs, relevance_dict):
    '''DCG variant with an exponential gain: sum over 0-indexed positions i of (2**relevance - 1) / log2(i + 2).'''
    discounted_gains = []
    for position, doc in enumerate(ranked_docs):
        gain = 2**relevance_dict[doc] - 1
        discounted_gains.append(gain / math.log2(position + 2))
    return np.sum(discounted_gains)

def NDCG(ranked_docs, relevance_dict, use_alt=False):
    '''Normalised DCG of a ranking: its DCG divided by the DCG of the ideal ordering.
    Input --
        ranked_docs = list of doc IDs ordered by rank. First element in the list is the highest ranked
        relevance_dict = dict with keys as the doc_IDs and relevance score as the element
        use_alt = use DCG_alt (exponential gain) instead of DCG
    Output --
        NDCG [float]. By convention 1.0 is returned when there is nothing to
        rank or when the ideal DCG is 0 (every retrieved doc is rated 0),
        because every ordering is then equally good.'''
    assert len(ranked_docs) == len(relevance_dict)
    if not relevance_dict:
        # Nothing to rank (the length check above guarantees ranked_docs is empty too).
        return 1.0

    # Ideal ordering: highest relevance first; ties keep their dict order.
    ideal_ordering = sorted(relevance_dict, key=lambda doc: -relevance_dict[doc])

    dcg_func = DCG_alt if use_alt else DCG
    DCG_oracle = dcg_func(ideal_ordering, relevance_dict)
    DCG_case = dcg_func(ranked_docs, relevance_dict)
    assert DCG_oracle >= DCG_case

    # return 1.0 if the ideal DCG is 0 (happens when all the retrieved docs are rated 0)
    if DCG_oracle == 0:
        return 1.0

    return DCG_case/DCG_oracle

# Sanity check of DCG/NDCG against a hand-computed example.
ranked_docs = [0,1,2,3,4]
relevance_dict = {0: 2, 1: 3, 2: 0, 3: 0, 4: 1}
DCG_score = 2 + 3/log2(3) + 1/log2(6)
Ideal_score = 3 + 2/log2(3) + 1/log2(4)
NDCG_score = DCG_score/Ideal_score
# Compare with a tolerance: the functions may add the terms in a different
# order than the expressions above, so exact float equality is fragile.
assert math.isclose(DCG(ranked_docs, relevance_dict), DCG_score), "DCG error"
assert math.isclose(NDCG(ranked_docs, relevance_dict), NDCG_score), "NDCG error"

In [13]:
def average_precision_helper(relevance_list):
    """Return the mean of precision@k over every rank k of a binary relevance list.

    Note that precision@k is averaged over ALL ranks, not only over the ranks
    that hold a relevant document as in the textbook definition of average
    precision; the metrics reported in this notebook use this variant.

    Args:
        relevance_list: relevance flags (1 = relevant, 0 = not) in rank order.

    Returns:
        float: the averaged precision, or 0.0 for an empty list.
    """
    if len(relevance_list) == 0:
        return 0.0  # nothing was ranked; avoid dividing by zero

    precision = 0.0
    relevant_so_far = 0.0
    for rank, val in enumerate(relevance_list, start=1):
        relevant_so_far += val
        precision += relevant_so_far / rank  # precision@rank
    return precision / len(relevance_list)

def average_precision(ranked_doc_list, query_relevance_dict):
    """Binarise the relevance of each ranked doc (relevant when >= 1.0) and score the ranking."""
    binary_relevance = []
    for doc in ranked_doc_list:
        binary_relevance.append(int(query_relevance_dict[doc] >= 1.0))
    return average_precision_helper(binary_relevance)

# Sanity check of average_precision_helper against a hand-computed value;
# compared with a tolerance because exact float equality is fragile.
expected = (1 + 1 + 2/3 + 2/4 + 3/5 + 3/6 + 4/7)/ 7
actual = average_precision_helper([1, 1, 0, 0, 1, 0, 1])
assert math.isclose(expected, actual), actual


In [14]:
def random_similarity(doc_embedding, query_embedding):
    """Baseline scorer: ignore both embeddings and return a uniform random score in [0, 1)."""
    return np.random.uniform(0.0, 1.0)

def cosine_similarity(doc_embedding, query_embedding):
    """Return the cosine of the angle between a document and a query embedding.

    Neither input is modified (the previous in-place division changed the
    caller's document array and failed on integer arrays). The dot product is
    divided by both norms; the query norm is the same for every document of a
    query, so rankings within a query are unchanged up to rounding.

    Args:
        doc_embedding: 1-D numeric array.
        query_embedding: 1-D numeric array of the same length.

    Returns:
        float: the cosine similarity, or 0.0 when either vector has zero norm.
    """
    doc_norm = np.linalg.norm(doc_embedding)
    query_norm = np.linalg.norm(query_embedding)
    if doc_norm == 0 or query_norm == 0:
        return 0.0
    return np.dot(doc_embedding, query_embedding) / (doc_norm * query_norm)


In [15]:
def run_metrics(dataset_dict, scoring_func=cosine_similarity, content_type='title'):
    """Rank the documents of every query with `scoring_func` and average the metrics.

    dataset_dict: the dictionary returned by load_data
    scoring_func: called as scoring_func(doc_embedding, query_embedding); higher scores rank first
    content_type: which document content to embed (see get_doc_words)

    Returns a dict with the mean 'NDCG', 'Alt_NDCG' and 'MAP' over all queries.
    """
    totals = {'NDCG': 0.0, 'Alt_NDCG': 0.0, 'MAP': 0.0}
    num_queries = 0
    for query in query_iter(dataset_dict):
        num_queries += 1
        relevance_by_url = get_relevance_dict(dataset_dict, query)
        query_vec, doc_vecs = query_and_document_embeddings(dataset_dict, query, content_type=content_type)

        scored = [(url, scoring_func(doc_vec, query_vec)) for url, doc_vec in doc_vecs]
        # Stable sort: documents with equal scores keep their retrieval order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ranked_urls, _ = zip(*scored)
        ranked_urls = list(ranked_urls)

        totals['NDCG'] += NDCG(ranked_urls, relevance_by_url)
        totals['Alt_NDCG'] += NDCG(ranked_urls, relevance_by_url, use_alt=True)
        totals['MAP'] += average_precision(ranked_urls, relevance_by_url)

    return {name: total / num_queries for name, total in totals.items()}

## Random ordering accuracy

For every query, arrange the docs in random order and check the NDCG value

In [16]:
# Average every metric over n independent random rankings of the train split.
# No random seed is set in this notebook, so the numbers vary between runs.
n = 10
sum_metric = {}
for _ in range(n):
    for m, v in run_metrics(train_dict, scoring_func=random_similarity).items():
        sum_metric[m] = sum_metric.get(m, 0) + v / n
print("Random on train")
sum_metric

Random on train


{'NDCG': 0.8098059423920467,
 'Alt_NDCG': 0.7415840332472398,
 'MAP': 0.726820962089034}

In [17]:
# Same random-ranking baseline on the dev split, averaged over n runs (unseeded).
n = 10
sum_metric = {}
for _ in range(n):
    for m, v in run_metrics(dev_dict, scoring_func=random_similarity).items():
        sum_metric[m] = sum_metric.get(m, 0) + v / n
print("Random on dev")
sum_metric

Random on dev


{'NDCG': 0.8109561178018261,
 'Alt_NDCG': 0.7390676338486832,
 'MAP': 0.7204487311949871}

# Some non-random methods


In [18]:
# Cosine-similarity ranking using only the title words (train split).
print("Title on train")
run_metrics(train_dict, scoring_func=cosine_similarity, content_type='title')

Title on train


{'NDCG': 0.8651824860113435,
 'Alt_NDCG': 0.8161245513899243,
 'MAP': 0.7771249238087184}

In [19]:
# Cosine-similarity ranking using only the title words (dev split).
print("Title on dev")
run_metrics(dev_dict, scoring_func=cosine_similarity, content_type='title')

Title on dev


{'NDCG': 0.8610811796421569,
 'Alt_NDCG': 0.8111389290875022,
 'MAP': 0.7623313263477439}

In [20]:
# Cosine-similarity ranking using header words, falling back to the title when
# a document has no header (train split).
print("Header on train")
run_metrics(train_dict, scoring_func=cosine_similarity, content_type='header')

Header on train


{'NDCG': 0.8568503022704564,
 'Alt_NDCG': 0.8042551360163497,
 'MAP': 0.7661671476556811}

In [21]:
# Cosine-similarity ranking using header words, falling back to the title when
# a document has no header (dev split).
print("Header on dev")
run_metrics(dev_dict, scoring_func=cosine_similarity, content_type='header')

Header on dev


{'NDCG': 0.8595995880298966,
 'Alt_NDCG': 0.8048907651704151,
 'MAP': 0.7704660869724235}

In [22]:
# Cosine-similarity ranking with the title words counted twice plus the header
# words (train split).
print("2*Title+header on train")
run_metrics(train_dict, scoring_func=cosine_similarity, content_type='2th')

2*Title+header on train


{'NDCG': 0.8685425184105349,
 'Alt_NDCG': 0.8196390620542968,
 'MAP': 0.778713179048508}

In [23]:
# Cosine-similarity ranking with the title words counted twice plus the header
# words (dev split).
print("2*Title+header on dev")
run_metrics(dev_dict, scoring_func=cosine_similarity, content_type='2th')

2*Title+header on dev


{'NDCG': 0.8702515039354958,
 'Alt_NDCG': 0.8221430739588372,
 'MAP': 0.776288219588911}

In [24]:
# Cosine-similarity ranking using body words, falling back to the title when
# a document has no body (train split).
print("Body on train")
run_metrics(train_dict, scoring_func=cosine_similarity, content_type='body')

Body on train


{'NDCG': 0.8538332875849136,
 'Alt_NDCG': 0.7968008388330325,
 'MAP': 0.7690100740266438}

In [25]:
# Cosine-similarity ranking using body words, falling back to the title when
# a document has no body (dev split).
print("Body on dev")
run_metrics(dev_dict, scoring_func=cosine_similarity, content_type='body')

Body on dev


{'NDCG': 0.8706960262890675,
 'Alt_NDCG': 0.8121585630178958,
 'MAP': 0.7961876798210322}

# Let's do Machine Learning

In [89]:
def glove_concat_featurizer_base(dataset_dict, query, url, content_type):
    """
        Featurize a (query, url) pair as the concatenation of two vectors: the
        mean GloVe vector of the query words followed by the mean GloVe vector
        of the document words selected by content_type.
    """
    def mean_vector(vectors):
        return np.mean(vectors, axis=0)

    query_vec = make_glove_embedding(get_query_words(dataset_dict, query), mean_vector)
    doc_vec = make_glove_embedding(get_doc_words(dataset_dict, url, content_type), mean_vector)
    return np.concatenate([query_vec, doc_vec])

def glove_concat_featurizer_title(dataset_dict, query, url):
    """Featurize a (query, url) pair using the document's title words."""
    return glove_concat_featurizer_base(dataset_dict, query, url, content_type='title')

def glove_concat_featurizer_header(dataset_dict, query, url):
    """Featurize a (query, url) pair using the document's header words."""
    return glove_concat_featurizer_base(dataset_dict, query, url, content_type='header')

def glove_concat_featurizer_body(dataset_dict, query, url):
    """Featurize a (query, url) pair using the document's body words."""
    return glove_concat_featurizer_base(dataset_dict, query, url, content_type='body')

In [90]:
def make_regression_dataset(dataset_dict, featurizer):
    """
        dataset_dict: returned by load_data
        featurizer: function called as featurizer(dataset_dict, query, url) that returns a feature vector

        Returns (X, y) as numpy arrays with one row of X and one relevance
        label in y per (query, url) pair of the dataset.
    """
    features, targets = [], []
    for query in query_iter(dataset_dict):
        relevance_by_url = get_relevance_dict(dataset_dict, query)
        for url in url_iter(dataset_dict, query):
            features.append(featurizer(dataset_dict, query, url))
            targets.append(relevance_by_url[url])
    return np.array(features), np.array(targets)

In [105]:
def run_metrics_ml(dataset_dict, ml_model, featurizer):
    """
        Rank the documents of every query by the model's predicted relevance
        and average the metrics over all queries.

        dataset_dict: the dictionary returned by load_data
        ml_model: a model whose predict(X) returns one score per row of X
        featurizer: called as featurizer(dataset_dict, query, url) to build
            the row for each (query, document) pair

        Returns a dict with the mean 'NDCG', 'Alt_NDCG' and 'MAP'.
    """
    # TODO: shouldnt have to use the featurizer here, need to improve this API
    totals = {'NDCG': 0.0, 'Alt_NDCG': 0.0, 'MAP': 0.0}
    num_queries = 0
    for query in query_iter(dataset_dict):
        num_queries += 1
        relevance_by_url = get_relevance_dict(dataset_dict, query)

        urls, vectors = [], []
        for url in url_iter(dataset_dict, query):
            urls.append(url)
            vectors.append(featurizer(dataset_dict, query, url))
        predictions = ml_model.predict(vectors)

        scored = [(url, predictions[i]) for i, url in enumerate(urls)]
        # Stable sort: documents with equal predictions keep their retrieval order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ranked_urls, _ = zip(*scored)
        ranked_urls = list(ranked_urls)

        totals['NDCG'] += NDCG(ranked_urls, relevance_by_url)
        totals['Alt_NDCG'] += NDCG(ranked_urls, relevance_by_url, use_alt=True)
        totals['MAP'] += average_precision(ranked_urls, relevance_by_url)

    return {name: total / num_queries for name, total in totals.items()}

In [92]:
# Pre-compute one (X, y) training set per content type so the models below can reuse them.
training_dataset_title = make_regression_dataset(train_dict, glove_concat_featurizer_title)
training_dataset_header = make_regression_dataset(train_dict, glove_concat_featurizer_header)
training_dataset_body = make_regression_dataset(train_dict, glove_concat_featurizer_body)

In [93]:
class LinearReg:
    """Wraps sklearn's LinearRegression behind the train/predict interface used by run_metrics_ml."""

    def __init__(self):
        self.model = LinearRegression()

    def train(self, dataset):
        """Fit on an (X, y) pair and return the model's score on that same data."""
        features, targets = dataset
        self.model = self.model.fit(features, targets)
        return self.model.score(features, targets)

    def predict(self, X):
        """Return the model's predictions for the rows of X."""
        return self.model.predict(X)

In [94]:
# Train on the train split (title features), then report ranking metrics on dev.
print("Linear Regression with title GloVe embeddings")
lin_reg = LinearReg()
lin_reg.train(training_dataset_title)
run_metrics_ml(dev_dict, lin_reg, glove_concat_featurizer_title)

Linear Regression with title GloVe embeddings


{'NDCG': 0.8542825508514621,
 'Alt_NDCG': 0.7973529372338085,
 'MAP': 0.7603503995684292}

In [95]:
# Train on the train split (header features), then report ranking metrics on dev.
print("Linear Regression with header GloVe embeddings")
lin_reg = LinearReg()
lin_reg.train(training_dataset_header)
run_metrics_ml(dev_dict, lin_reg, glove_concat_featurizer_header)

Linear Regression with header GloVe embeddings


{'NDCG': 0.8417527545599219,
 'Alt_NDCG': 0.7796069586768716,
 'MAP': 0.7566035997732421}

In [104]:
# Train on the train split (body features), then report ranking metrics on dev.
print("Linear Regression with body GloVe embeddings")
lin_reg = LinearReg()
lin_reg.train(training_dataset_body)
run_metrics_ml(dev_dict, lin_reg, glove_concat_featurizer_body)

Linear Regression with body GloVe embeddings


{'NDCG': 0.8258499182816925,
 'Alt_NDCG': 0.7567527560828344,
 'MAP': 0.7363864125399265}

In [97]:
class NeuralNetReg:
    """Wraps sklearn's MLPRegressor behind the train/predict interface used by run_metrics_ml."""

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded unchanged to MLPRegressor.
        self.model = MLPRegressor(**kwargs)

    def train(self, dataset):
        """Fit on an (X, y) pair and return the model's score on that same data."""
        features, targets = dataset
        self.model = self.model.fit(features, targets)
        return self.model.score(features, targets)

    def predict(self, X):
        """Return the model's predictions for the rows of X."""
        return self.model.predict(X)

In [99]:
# Train a (100, 50) hidden-layer MLP on title features, then report metrics on dev.
# No random_state is passed, so results can differ between runs.
print("Two layer NN with title GloVe embeddings")
nn_reg = NeuralNetReg(hidden_layer_sizes=(100,50), activation='relu', solver='adam')
nn_reg.train(training_dataset_title)
run_metrics_ml(dev_dict, nn_reg, glove_concat_featurizer_title)

Two layer NN with title GloVe embeddings


{'NDCG': 0.8616609708302518,
 'Alt_NDCG': 0.8080275458279103,
 'MAP': 0.7605624131372979}

In [100]:
# Train a (100, 50) hidden-layer MLP on header features, then report metrics on dev.
# No random_state is passed, so results can differ between runs.
print("Two layer NN with header GloVe embeddings")
nn_reg = NeuralNetReg(hidden_layer_sizes=(100,50), activation='relu', solver='adam')
nn_reg.train(training_dataset_header)
run_metrics_ml(dev_dict, nn_reg, glove_concat_featurizer_header)

Two layer NN with header GloVe embeddings


{'NDCG': 0.8497872195391367,
 'Alt_NDCG': 0.7924080698498698,
 'MAP': 0.7639138809645724}

In [103]:
# Train a (100, 50) hidden-layer MLP on body features, then report metrics on dev.
# No random_state is passed, so results can differ between runs.
print("Two layer NN with body GloVe embeddings")
nn_reg = NeuralNetReg(hidden_layer_sizes=(100,50), activation='relu', solver='adam')
nn_reg.train(training_dataset_body)
run_metrics_ml(dev_dict, nn_reg, glove_concat_featurizer_body)

Two layer NN with body GloVe embeddings


{'NDCG': 0.8536416604245739,
 'Alt_NDCG': 0.7954594654851734,
 'MAP': 0.7637178684197692}