In [57]:
import pandas as pd
import pickle as pkl
# POT needs to be installed for the following code to work
# !pip install POT

df = pd.read_csv('data/user-ct-test-collection-02-with-query-frequencies-and-tokens.txt', sep='\t')

frequencies = pd.read_csv('data/query-frequencies-precomputed.txt', sep='\t', index_col=0)
tokens = pd.read_csv('data/query-tokens-precomputed.txt', sep='\t', index_col=0)
idf_scores = pd.read_csv('data/idf-scores-precomputed.txt', sep='\t', index_col=0)

# Dataset statistics
print('Number of rows:', df.shape[0])
print('Number of unique users:', df['AnonID'].nunique())
print('Number of unique queries:', df['Query'].nunique())
print('Query frequencies range:  [{}, {}]'.format(df['QueryFrequency'].min(), df['QueryFrequency'].max()))
print('Average query length:', df['Query'].apply(len).mean())

# User 479 queries:
df[df['AnonID'] == 479]

Number of rows: 3614377
Number of unique users: 65999
Number of unique queries: 1244495
Query frequencies range:  [1, 98554]
Average query length: 17.424875158291457


Unnamed: 0,AnonID,Query,QueryTime,ItemRank,ClickURL,QueryFrequency,QueryTokens
0,479,family guy,2006-03-01 16:01:20,,,191,"['family', 'guy']"
1,479,also sprach zarathustra,2006-03-02 14:48:55,,,1,"['also', 'sprach', 'zarathustra']"
2,479,family guy movie references,2006-03-03 22:37:46,1.0,http://www.familyguyfiles.com,1,"['family', 'guy', 'movie', 'references']"
3,479,top grossing movies of all time,2006-03-03 22:42:42,1.0,http://movieweb.com,2,"['top', 'grossing', 'movies', 'of', 'all', 'ti..."
4,479,top grossing movies of all time,2006-03-03 22:42:42,2.0,http://www.imdb.com,2,"['top', 'grossing', 'movies', 'of', 'all', 'ti..."
...,...,...,...,...,...,...,...
118,479,nip tuck,2006-05-28 00:44:58,4.0,http://www.niptuck.com,24,"['nip', 'tuck']"
119,479,nip tuck season 4,2006-05-28 00:47:05,,,4,"['nip', 'tuck', 'season', '4']"
120,479,nip tuck season 3 dvd,2006-05-28 00:47:48,7.0,http://en.wikipedia.org,3,"['nip', 'tuck', 'season', '3', 'dvd']"
121,479,nip tuck season 3 dvd,2006-05-28 00:47:48,9.0,http://www.dvdtimes.co.uk,3,"['nip', 'tuck', 'season', '3', 'dvd']"


In [2]:
# Most frequent users and their query counts
print("Top 10 users by query count")
print(df['AnonID'].value_counts().head(10))

print("-" * 50)

# Most frequent queries
print("Top 10 queries by frequency")
print(df['Query'].value_counts().head(10))

Top 10 users by query count
3318459    6925
205414     4663
422471     4198
2426641    4106
3717968    3429
3134676    3199
1611540    2715
901695     2651
2037028    2520
2067984    2294
Name: AnonID, dtype: int64
--------------------------------------------------
Top 10 queries by frequency
-                 98554
google            32396
yahoo             13344
ebay              12949
yahoo.com          8733
mapquest           8680
google.com         8139
myspace            7653
myspace.com        7099
www.google.com     4255
Name: Query, dtype: int64


In [3]:
# Query completion, based on the most frequent queries starting with the given query
def query_completion(query):
    starts_with_query = df['Query'].str.startswith(query)

    return df[starts_with_query].sort_values('QueryFrequency', ascending=False)['Query'].unique()[:10]

print("10 top completions for 'weather':")
pd.DataFrame(query_completion('weather'), columns=[''])

10 top completions for 'weather':


Unnamed: 0,Unnamed: 1
0,weather
1,weather.com
2,weather channel
3,weatherchannel.com
4,weather forecast
5,weather bug
6,weatherbug
7,weather in lattimore
8,weather channel.com
9,weather facts


### Query Auto-Completion Based on Word2vec Semantic Similarity
Considers semantic similarity between the candidate queries and their previous queries submitted in the same session, on the basis of word2vec method.


In [9]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tqdm import tqdm  # Progress bar for training

# Tokenize queries (with tqdm progress bar)
queries = df['Query']
query_tokens = df['QueryTokens']

# Initialize word2vec model
AOL_model = Word2Vec(vector_size=200,   # Dimension of the word vectors
                 window=5,          # Context window size
                 min_count=1,       # Minimum word frequency
                 workers=4,         # Number of parallel workers
                 sg=1)              # Skip-gram model

# Train word2vec model
# AOL_model.build_vocab(tqdm(query_tokens, desc='Building word2vec vocabulary'))
#
# AOL_model.train(tqdm(query_tokens, desc='Training word2vec model'),
#             total_examples=AOL_model.corpus_count,
#             epochs=30)
#
# # Save word2vec model
# AOL_model.save('word2vec_AOL.model')

In [16]:
from gensim.models import KeyedVectors

# Load Google's pre-trained Word2Vec model
google_model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)

# Load the trained AOL word2vec model
AOL_model = Word2Vec.load('models/word2vec_AOL.model')

# Can also use:
# import gensim.downloader as api
# model = api.load('word2vec-google-news-300')

In [None]:
# Input: AOL dataset: Q; Completion List: L; Current Session Queries List: C;
# Output: Frequency Score List: F; Semantic Similarity Score List: S; Re-ranked List: R;
# Pseudocode:
# for each a  L do
# ScoreF (a) = count(a in Q); F.append(ScoreF (a))
# end for
# model.load(Google News.model)
# for each a  L do
# for each b  C do
# ScoreS(a)+ = model.similarity(b, a)
# end for
# S.append(ScoreS(a));
# end for
# ScoreFS(a) = ScoreS(a) + (1 - )ScoreF (a)
# dic ={ a  L : ScoreFS(a)}
# R = L.sorted(dic; key = dic.getitem; reserve = True)
# return Re-ranked List: R;

In [51]:
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np


# Query auto-completion function
def query_completion(query, completion_list, session_queries, alpha=0.5):
    N = len(session_queries)
    gamma = beta = 1/(N+1)
    omega = 0.5

    # Frequency score (of each candidate query)
    frequency_score = [frequencies.loc[a, 'Frequency'] if a in frequencies.index else 0 for a in completion_list]

    # Semantic similarity score
    # Use average similarity score on words (average of all word pairs)
    similarity_score = []
    for candidate_query in tqdm(completion_list, desc='Calculating semantic similarity score'):
        score = 0
        for session_query in session_queries:
            google_model_score = 0
            AOL_model_score = 0

            c_tokens = word_tokenize(candidate_query)
            x_tokens = word_tokenize(session_query)

            # All word pairs
            for c in c_tokens:
                for x in x_tokens:
                    if c in google_model.key_to_index and x in google_model.key_to_index:
                        google_model_similarity = google_model.similarity(c, x)
                    else:
                        google_model_similarity = 0
                    if c in AOL_model.wv.key_to_index and x in AOL_model.wv.key_to_index:
                        AOL_model_similarity = AOL_model.wv.similarity(c, x)
                    else:
                        AOL_model_similarity = 0

                    # tf-idf weighting average:
                    if c in idf_scores.index.values and x in idf_scores.index.values:
                        # match index
                        c_idf = idf_scores.loc[idf_scores.index == c, 'IDF'].values[0]
                        x_idf = idf_scores.loc[idf_scores.index == x, 'IDF'].values[0]

                        google_model_score += (google_model_similarity * c_idf * x_idf)
                        AOL_model_score += (AOL_model_similarity * c_idf * x_idf)
                    else:
                        google_model_score += google_model_similarity
                        AOL_model_score += AOL_model_similarity

                    google_model_score += google_model_similarity
                    AOL_model_score += AOL_model_similarity

            # Average over all combinations of words:
            google_model_score /= (len(c_tokens) * len(x_tokens))
            AOL_model_score /= (len(c_tokens) * len(x_tokens))

            score += omega * AOL_model_score + (1 - omega) * google_model_score

        similarity_score.append(score)

    # TODO: Average of Word2Vec vectors with TF-IDF weights.

    # Combined score
    combined_score = [alpha * similarity_score[i] + (1 - alpha) * frequency_score[i] for i in range(len(completion_list))]

    # Re-rank completion list
    re_ranked_list = [x for _, x in sorted(zip(combined_score, completion_list), reverse=True)]

    return re_ranked_list

In [52]:
# Test query auto-completion function
query = 'car'
completion_list = df[df['Query'].str.startswith(query)]['Query'].unique()  # Queries starting with 'car'
session_queries = df[(df['AnonID'] == 479) & (df['Query'].str.contains('car'))]['Query'].unique()  # Queries from user 479 containing 'car'
print(session_queries)

print("Query auto-completion for 'car':")
print(query_completion(query, completion_list, session_queries))

['car decals' 'car window decals' 'car window sponsor decals'
 'car sponsor decals' 'car brand name decals' 'bose car decal']
Query auto-completion for 'car':


  if c in idf_scores.index.values and x in idf_scores.index.values:
Calculating semantic similarity score: 100%|██████████| 6236/6236 [00:12<00:00, 481.80it/s]

['cartoon network', 'cartoonnetwork.com', 'cars', 'cartoonnetwork', 'carmen electra', 'car rental', 'carmax', 'car rentals', 'careerbuilder.com', 'cars.com', 'career builder', 'cartoons', 'car buyers market', 'carmax.com', 'carmenelectra', 'careerbuilder', 'carnival cruise', 'cartoon', 'cards', 'carrie underwood', 'cartoon network.com', 'car', 'caroline kennedy', 'cartoonetwork', 'carnival cruise lines', 'carnival cruises', 'carowinds', 'care of the post operative patient', 'car max', 'carlos maldonado', 'carfax', 'cartoonnetwork toonami', 'car insurance', 'care bears', 'cartoon sex', 'carnaval ameca', 'card games', 'carpenter ants', 'car decals', 'carole king', 'car audio', 'cartoon dogs', 'cartoonetwork.com', 'caroline kaplan', 'caremark', 'cartoon frogs', 'carmen garcia', 'care of the post surgical patient', 'carfax.com', 'career builders', 'carpenter bees', 'carpal tunnel', 'car parts', 'cartoon.com', 'car search', 'carnival cruise line', 'cars for sale', 'career builder.com', 'car




In [21]:
# Test word2vec similarity with wmdistance, higher score means less similar
doc1 = word_tokenize("car")
doc2 = word_tokenize("cartoon network")
google_model.wmdistance(doc1, doc2)
AOL_model.wv.wmdistance(doc1, doc2)

KeyboardInterrupt: 

In [66]:
# Test idf scores for words
print(idf_scores.loc[idf_scores.index == 'car', 'IDF'])
print(idf_scores.loc[idf_scores.index == 'the', 'IDF'])
print(idf_scores.loc[idf_scores.index == 'what', 'IDF'])

Word
car    6.902871
Name: IDF, dtype: float64
Word
the    4.806806
Name: IDF, dtype: float64
Word
what    6.676028
Name: IDF, dtype: float64
