In [1]:
import pandas as pd
import pickle as pkl
# POT needs to be installed for the following code to work
# !pip install POT

# AOL query log, already enriched with per-query frequency and token columns
df = pd.read_csv(
    'data/user-ct-test-collection-02-with-query-frequencies-and-tokens.txt',
    sep='\t',
)

# Precomputed lookup tables; the first column of each file becomes the index
frequencies = pd.read_csv('data/query-frequencies-precomputed.txt', sep='\t', index_col=0)
tokens = pd.read_csv('data/query-tokens-precomputed.txt', sep='\t', index_col=0)
idf_scores = pd.read_csv('data/idf-scores-precomputed.txt', sep='\t', index_col=0)

# Dataset statistics (query length is measured in characters)
print('Number of rows:', len(df))
print('Number of unique users:', df['AnonID'].nunique())
print('Number of unique queries:', df['Query'].nunique())
print('Query frequencies range:  [{}, {}]'.format(df['QueryFrequency'].min(), df['QueryFrequency'].max()))
print('Average query length:', df['Query'].map(len).mean())

# Sample: every logged row of user 479 (displayed as the cell output)
df.loc[df['AnonID'] == 479]

Number of rows: 3614377
Number of unique users: 65999
Number of unique queries: 1244495
Query frequencies range:  [1, 98554]
Average query length: 17.424875158291457


Unnamed: 0,AnonID,Query,QueryTime,ItemRank,ClickURL,QueryFrequency,QueryTokens
0,479,family guy,2006-03-01 16:01:20,,,191,"['family', 'guy']"
1,479,also sprach zarathustra,2006-03-02 14:48:55,,,1,"['also', 'sprach', 'zarathustra']"
2,479,family guy movie references,2006-03-03 22:37:46,1.0,http://www.familyguyfiles.com,1,"['family', 'guy', 'movie', 'references']"
3,479,top grossing movies of all time,2006-03-03 22:42:42,1.0,http://movieweb.com,2,"['top', 'grossing', 'movies', 'of', 'all', 'ti..."
4,479,top grossing movies of all time,2006-03-03 22:42:42,2.0,http://www.imdb.com,2,"['top', 'grossing', 'movies', 'of', 'all', 'ti..."
...,...,...,...,...,...,...,...
118,479,nip tuck,2006-05-28 00:44:58,4.0,http://www.niptuck.com,24,"['nip', 'tuck']"
119,479,nip tuck season 4,2006-05-28 00:47:05,,,4,"['nip', 'tuck', 'season', '4']"
120,479,nip tuck season 3 dvd,2006-05-28 00:47:48,7.0,http://en.wikipedia.org,3,"['nip', 'tuck', 'season', '3', 'dvd']"
121,479,nip tuck season 3 dvd,2006-05-28 00:47:48,9.0,http://www.dvdtimes.co.uk,3,"['nip', 'tuck', 'season', '3', 'dvd']"


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tqdm import tqdm  # Progress bar for training
from gensim.models import KeyedVectors
import gensim.downloader as api

import ast

import numpy as np

queries = df['Query']


def _as_token_list(raw_tokens):
    """Return one 'QueryTokens' cell as a list of tokens.

    After the CSV round-trip the column holds the text form "['a', 'b']";
    iterating that string would feed single characters to word2vec.
    Cells that are neither a list nor a string (e.g. missing values) become [].
    """
    if isinstance(raw_tokens, list):
        return raw_tokens
    if isinstance(raw_tokens, str):
        return ast.literal_eval(raw_tokens)
    return []


# One token list per logged query
query_tokens = df['QueryTokens'].apply(_as_token_list)

# api.load() returns plain KeyedVectors by default, and those cannot be trained
# (no build_vocab / train / corpus_count). Ask only for the path of the
# downloaded vectors and load them into a trainable model below.
google_vectors_path = api.load('word2vec-google-news-300', return_path=True)

# Trainable model on the AOL vocabulary, with the same dimensionality as the Google vectors
aol_model = Word2Vec(vector_size=300, window=5, min_count=1, workers=4)
aol_model.build_vocab(tqdm(query_tokens, desc='Building word2vec vocabulary'))

# Initialise every AOL word that also exists in the Google model with its
# pre-trained vector. lockf=1.0 leaves those vectors free to keep training;
# the per-word lock array has to be allocated before the intersection.
aol_model.wv.vectors_lockf = np.ones(len(aol_model.wv), dtype=np.float32)
aol_model.wv.intersect_word2vec_format(google_vectors_path, binary=True, lockf=1.0)

# Continue training on the AOL dataset
aol_model.train(tqdm(query_tokens, desc='Training word2vec model'),
                total_examples=aol_model.corpus_count,
                epochs=30)

# Later cells call google_model.key_to_index / google_model.similarity,
# which live on the KeyedVectors, so expose (and save) just the vectors.
google_model = aol_model.wv

# Save the model
google_model.save('models/GoogleNews_on_AOL.model')

In [None]:
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np

# Query auto-completion function
def query_completion(query, completion_list, session_queries, alpha=0.6):
    """
    Re-rank candidate completions by blending popularity with session similarity.

    Frequency score: the candidate's 'Frequency' value in the global
    ``frequencies`` table, or 0 when the candidate has no entry.

    Similarity score: for every earlier query of the session, the mean over all
    (candidate word, session word) pairs of their word2vec similarity taken
    from the global ``google_model`` (0 when either word is out of vocabulary).
    A pair is weighted by both words' IDF when both appear in the global
    ``idf_scores`` table. The per-session-query means are summed.

    NOTE(review): the frequency score is a raw count while the similarity score
    is an averaged similarity, so the two are on different scales when blended
    with ``alpha`` -- confirm whether a normalisation step is intended.

    :param query: the prefix being completed (not used for scoring; kept so callers stay unchanged)
    :param completion_list: candidate completions (sequence of strings)
    :param session_queries: earlier queries of the same session (sequence of strings, may be empty)
    :param alpha: weight of the similarity score; the frequency score gets ``1 - alpha``
    :return: list of the candidates sorted by descending combined score
             (equal scores fall back to descending candidate string)
    """
    # Frequency score (of each candidate query)
    frequency_score = [
        frequencies.loc[candidate, 'Frequency'] if candidate in frequencies.index else 0
        for candidate in completion_list
    ]

    # Session queries do not depend on the candidate, so tokenize them only once
    session_tokens = [word_tokenize(session_query) for session_query in session_queries]

    idf_cache = {}

    def _idf(token):
        """Return the IDF weight of ``token``, or None when it has no entry."""
        if token not in idf_cache:
            if token in idf_scores.index:  # hashed lookup instead of scanning the whole index
                # .loc with a list always yields a Series; keep the first match
                idf_cache[token] = idf_scores.loc[[token], 'IDF'].iloc[0]
            else:
                idf_cache[token] = None
        return idf_cache[token]

    # Semantic similarity score
    similarity_score = []
    for candidate_query in completion_list:  # Possible completions
        # Without session context there is nothing to compare the candidate with
        c_tokens = word_tokenize(candidate_query) if session_tokens else []
        candidate_score = 0

        for x_tokens in session_tokens:  # User session queries
            # No word pairs means nothing to average (and would divide by zero)
            if not c_tokens or not x_tokens:
                continue

            score = 0
            for c in c_tokens:
                for x in x_tokens:
                    if c in google_model.key_to_index and x in google_model.key_to_index:
                        similarity = google_model.similarity(c, x)
                    else:
                        similarity = 0  # word is not in the vocabulary

                    # tf-idf weighting, only when both words have an IDF entry
                    c_idf, x_idf = _idf(c), _idf(x)
                    if c_idf is not None and x_idf is not None:
                        score += similarity * c_idf * x_idf
                    else:
                        score += similarity

            # Average over all combinations of words
            candidate_score += score / (len(c_tokens) * len(x_tokens))

        similarity_score.append(candidate_score)

    # Combined score
    combined_score = [
        alpha * similarity + (1 - alpha) * frequency
        for similarity, frequency in zip(similarity_score, frequency_score)
    ]

    # Re-rank completion list (best score first)
    re_ranked_list = [candidate for _, candidate in sorted(zip(combined_score, completion_list), reverse=True)]

    return re_ranked_list

In [None]:
# Smoke test of the re-ranking function on the prefix 'car'
query = 'car'

# Session context: distinct queries of user 479 that contain 'car'
session_queries = df.loc[(df['AnonID'] == 479) & df['Query'].str.contains('car'), 'Query'].unique()
# Candidates: every distinct logged query that starts with the prefix
completion_list = df.loc[df['Query'].str.startswith(query), 'Query'].unique()
print(session_queries)

print("Query auto-completion for 'car':")
print(query_completion(query, completion_list, session_queries))

In [8]:
# MRR (Mean Reciprocal Rank) evaluation
def RR(ranked_completions, ground_truth):
    """
    Reciprocal Rank (RR) of the correct completion for one query.

    :param ranked_completions: suggested completions, best suggestion first
    :param ground_truth: the completion the user actually issued
    :return: 1 / rank of the first suggestion equal to ``ground_truth``
             (rank counted from 1), or 0.0 when it is not suggested at all
    """
    rank = 1
    for suggestion in ranked_completions:
        if suggestion == ground_truth:
            return 1.0 / rank
        rank += 1
    return 0.0

def MRR(completion_lists, ground_truths):
    """
    Mean Reciprocal Rank (MRR) over a set of evaluated queries.

    :param completion_lists: one ranked completion list per evaluated query
    :param ground_truths: the correct completion for each query, aligned by
                          position with ``completion_lists``
    :return: the mean of the per-query RR scores, or 0.0 when there is nothing
             to evaluate (e.g. every session was skipped) instead of dividing by zero
    """
    if len(completion_lists) == 0:
        return 0.0

    total_score = sum(
        (RR(completion_list, ground_truths[i]) for i, completion_list in enumerate(completion_lists)),
        0.0,
    )
    return total_score / len(completion_lists)

In [24]:
# Train and test datasets (arrays of query sessions, sorted by time increasing)
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('data/train.pkl', 'rb') as train_file:
    train = pkl.load(train_file)

with open('data/test.pkl', 'rb') as test_file:
    test = pkl.load(test_file)

# Flatten the train sessions into a single-column frame of queries
train = pd.DataFrame(
    [query for session in train for query in session],
    columns=['Query'],
)

print("Train dataset size: ", len(train), "queries")
print("Test dataset size: ", len(test), "sessions and", sum(len(session) for session in test), "queries")
print("First 10 sessions in test dataset: ")
print(test[:10])
# How many test sessions there are of each length (displayed as the cell output)
print("Session lengths in test dataset: ")
pd.Series([len(session) for session in test]).value_counts()

Train dataset size:  2709614 queries
Test dataset size:  262547 sessions and 904763 queries
First 10 sessions in test dataset: 
[['-', '-', '-', '-'], ['myspace.com'], ['pet sitter in newburyport ma', 'pet sitter in newburyport ma'], ['undefined'], ['shakira lyrics'], ['ebay', 'social security'], ['glutes', 'glutes', 'glutes', 'glutes', 'glutes', 'adultfriendfinder'], ['sandals vacations'], ['www.delta.com'], ['costco']]
Session lengths in test dataset: 


1      113402
2       51078
3       28311
4       17455
5       11957
        ...  
122         1
138         1
183         1
162         1
150         1
Length: 141, dtype: int64

In [37]:
def get_completions(sessions, prefix_length, train_df, alpha=0.6):
    """
    Build ranked completion lists for the last query of every session.

    For each session the last query is the ground truth. Its first
    ``prefix_length`` tokens (joined with spaces) form the prefix, the distinct
    training queries starting with that prefix followed by a space are the
    candidates, and ``query_completion`` re-ranks them using the earlier
    queries of the session as context.

    Empty sessions and sessions whose last query has fewer than
    ``prefix_length`` tokens are skipped.

    NOTE(review): a ground truth with exactly ``prefix_length`` tokens is kept,
    but it can never be among the candidates because those must continue past
    the prefix -- confirm whether such queries should be skipped as well.

    :param sessions: iterable of sessions, each a sequence of query strings
    :param prefix_length: number of leading tokens that make up the prefix
    :param train_df: DataFrame with a 'Query' column providing the candidates
    :param alpha: similarity weight forwarded to ``query_completion``
    :return: (completion_lists, ground_truths), aligned by position
    """
    completion_lists = []
    ground_truths = []

    # Distinct training queries in first-seen order (the same order .unique() yields),
    # computed once instead of re-scanning the full frame for every session.
    candidate_pool = train_df['Query'].drop_duplicates()
    # Candidates depend only on the prefix, so each prefix is looked up once.
    completions_by_prefix = {}

    for session in tqdm(sessions, desc='Generating completions'):
        # An empty session has no query to complete
        if len(session) == 0:
            continue

        # Last query in the session is the query we want to complete
        query_ground_truth = session[-1]
        query_tokenized = word_tokenize(query_ground_truth)

        # Skip if the query is shorter than the prefix length
        if len(query_tokenized) < prefix_length:
            continue

        # Queries preceding the last query in the session (empty for one-query sessions)
        previous_queries = session[:-1]

        # Get prefix (part of the query that needs to be completed)
        query_prefix = ' '.join(query_tokenized[:prefix_length])

        # Get completions; the trailing space avoids matching a longer first word
        completions = completions_by_prefix.get(query_prefix)
        if completions is None:
            matches = candidate_pool.str.startswith(query_prefix + " ", na=False)
            completions = candidate_pool[matches].unique()
            completions_by_prefix[query_prefix] = completions

        # Re-rank completions
        ranked_completions = query_completion(query_prefix, completions, previous_queries, alpha)

        completion_lists.append(ranked_completions)
        ground_truths.append(query_ground_truth)

    return completion_lists, ground_truths

def evaluate_test_set(test, prefix_length, alpha=0.6):
    """
    Compute the MRR of the re-ranked completions for a set of test sessions.

    Candidates are drawn from the notebook-level ``train`` frame.

    :param test: sessions to evaluate, each a sequence of query strings
    :param prefix_length: number of leading tokens used as the prefix
    :param alpha: similarity weight forwarded to the ranking function
    :return: the MRR score over the evaluated sessions
    """
    ranked_lists, expected_queries = get_completions(test, prefix_length, train, alpha)
    return MRR(ranked_lists, expected_queries)

In [None]:
# Evaluate the test set with 4-word prefixes
prefix_length = 4  # query will be completed based on its first 4 words
mrr = evaluate_test_set(test, prefix_length=prefix_length)
print("MRR for prefix length", prefix_length, ":", mrr)

In [None]:
# Evaluate the test set with 3-word prefixes
prefix_length = 3  # query will be completed based on its first 3 words
mrr = evaluate_test_set(test, prefix_length=prefix_length)
print("MRR for prefix length", prefix_length, ":", mrr)

In [None]:
# Evaluate the test set with 2-word prefixes
prefix_length = 2  # query will be completed based on its first 2 words
mrr = evaluate_test_set(test, prefix_length=prefix_length)
print("MRR for prefix length", prefix_length, ":", mrr)

In [None]:
# Evaluate the test set with 1-word prefixes
prefix_length = 1  # query will be completed based on its first word only (VERY SLOW)
mrr = evaluate_test_set(test, prefix_length=prefix_length)
print("MRR for prefix length", prefix_length, ":", mrr)