In [4]:
import pandas as pd
import pickle as pkl
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Load the tab-separated query log and discard rows that have no query text
df = (
    pd.read_csv('data/user-ct-test-collection-02.txt', sep='\t')
    .dropna(subset=['Query'])
)

# Peek at everything logged for user 479
is_user_479 = df['AnonID'] == 479
df.loc[is_user_479]

Unnamed: 0,AnonID,Query,QueryTime,ItemRank,ClickURL
0,479,family guy,2006-03-01 16:01:20,,
1,479,also sprach zarathustra,2006-03-02 14:48:55,,
2,479,family guy movie references,2006-03-03 22:37:46,1.0,http://www.familyguyfiles.com
3,479,top grossing movies of all time,2006-03-03 22:42:42,1.0,http://movieweb.com
4,479,top grossing movies of all time,2006-03-03 22:42:42,2.0,http://www.imdb.com
...,...,...,...,...,...
118,479,nip tuck,2006-05-28 00:44:58,4.0,http://www.niptuck.com
119,479,nip tuck season 4,2006-05-28 00:47:05,,
120,479,nip tuck season 3 dvd,2006-05-28 00:47:48,7.0,http://en.wikipedia.org
121,479,nip tuck season 3 dvd,2006-05-28 00:47:48,9.0,http://www.dvdtimes.co.uk


In [6]:
# Precomputing frequencies of each query

# Number of rows carrying each distinct query string (value_counts sorts
# the result from most to least frequent by default)
frequencies = df['Query'].value_counts()

# Attach the count of its query to every row. Series.map aligns on the
# index of `frequencies` in one vectorized pass instead of running a Python
# lambda with a label lookup for every single row.
df['QueryFrequency'] = df['Query'].map(frequencies)

# df.to_csv('data/user-ct-test-collection-02-with-query-frequencies.txt', sep='\t', index=False)

# Save frequencies in a txt file:
# frequencies_df = pd.DataFrame({'Query': frequencies.index, 'Frequency': frequencies.values})
# frequencies_df.to_csv('data/query-frequencies-precomputed.txt', sep='\t', index=False)

# Highest frequency queries (top 10)
print("Top 10 queries by frequency:")
frequencies.head(10)

-                 98554
google            32396
yahoo             13344
ebay              12949
yahoo.com          8733
mapquest           8680
google.com         8139
myspace            7653
myspace.com        7099
www.google.com     4255
Name: Query, dtype: int64

In [15]:
# Tokenize every query once up front and keep the token list next to the query
query_column = df['Query']
df['QueryTokens'] = query_column.map(word_tokenize)

# df.to_csv('data/user-ct-test-collection-02-with-query-frequencies-and-tokens.txt', sep='\t', index=False)

# Save tokens in a txt file:
# tokens_df = pd.DataFrame({'Query': df['Query'], 'Tokens': df['QueryTokens']})
# tokens_df.to_csv('data/query-tokens-precomputed.txt', sep='\t', index=False)

# Show the rows of user 479 together with the new column
df.loc[df['AnonID'] == 479]

Unnamed: 0,AnonID,Query,QueryTime,ItemRank,ClickURL,QueryFrequency,QueryTokens
0,479,family guy,2006-03-01 16:01:20,,,191,"['family', 'guy']"
1,479,also sprach zarathustra,2006-03-02 14:48:55,,,1,"['also', 'sprach', 'zarathustra']"
2,479,family guy movie references,2006-03-03 22:37:46,1.0,http://www.familyguyfiles.com,1,"['family', 'guy', 'movie', 'references']"
3,479,top grossing movies of all time,2006-03-03 22:42:42,1.0,http://movieweb.com,2,"['top', 'grossing', 'movies', 'of', 'all', 'ti..."
4,479,top grossing movies of all time,2006-03-03 22:42:42,2.0,http://www.imdb.com,2,"['top', 'grossing', 'movies', 'of', 'all', 'ti..."
...,...,...,...,...,...,...,...
118,479,nip tuck,2006-05-28 00:44:58,4.0,http://www.niptuck.com,24,"['nip', 'tuck']"
119,479,nip tuck season 4,2006-05-28 00:47:05,,,4,"['nip', 'tuck', 'season', '4']"
120,479,nip tuck season 3 dvd,2006-05-28 00:47:48,7.0,http://en.wikipedia.org,3,"['nip', 'tuck', 'season', '3', 'dvd']"
121,479,nip tuck season 3 dvd,2006-05-28 00:47:48,9.0,http://www.dvdtimes.co.uk,3,"['nip', 'tuck', 'season', '3', 'dvd']"


In [13]:
# Creating sessions of queries submitted within 30 min (sorted by timestamp, ascending)

# Largest gap allowed between two consecutive queries of the same user for
# them to still belong to the same session
SESSION_GAP = pd.Timedelta(minutes=30)

# Convert timestamp to datetime
df['QueryTime'] = pd.to_datetime(df['QueryTime'])

# Sort queries by timestamp for each user
df = df.sort_values(['AnonID', 'QueryTime'])

# Group queries by user
grouped = df.groupby('AnonID')

# Create sessions of queries submitted within 30 min
# Last query in a session array is the most recent query (we will use it for auto-completion)
sessions = []

for user, group in tqdm(grouped, desc='Creating sessions'):
    session = []
    prev_time = group['QueryTime'].iloc[0]

    # Only the query text and its timestamp are needed, so walk those two
    # columns directly instead of materialising every row with iterrows().
    for query, query_time in zip(group['Query'], group['QueryTime']):
        # Compare the full elapsed time. Timedelta.seconds would only give
        # the seconds *component* (days are dropped), so a gap of e.g.
        # 2 days and 5 minutes would wrongly look like a 5 minute gap.
        if query_time - prev_time <= SESSION_GAP:
            session.append(query)
        else:
            # Gap too large: close the running session and start a new one
            sessions.append(session)
            session = [query]

        prev_time = query_time

    # Flush the last (still open) session of this user
    sessions.append(session)

# with open('data/user-sessions.pkl', 'wb') as f:
#     pkl.dump(sessions, f)

print("First 10 sessions:")
# Slicing avoids an IndexError when fewer than 10 sessions exist
for first_session in sessions[:10]:
    print(first_session)

First 10 sessions:
['family guy']
['also sprach zarathustra']
['family guy movie references', 'top grossing movies of all time', 'top grossing movies of all time']
['car decals', 'car decals', 'car decals', 'car window decals', 'car window sponsor decals', 'car sponsor decals', 'car brand name decals', 'brand name decals', 'bose', 'bose car decal', 'bose car decal', 'bose car decal']
['chicago the mix', 'chicago the drive', 'chicago radio annoucer whip', 'chicago radio whip', 'chicago radio brian the whipping boy']
['emma watson']
['stanford encyclopedia of philosophy', 'internet encyclopedia of philosophy', 'www library philosophy', 'allegory of the cave', 'allegory of the cave', 'allegory of the cave']
['citation machine']
['howard stern lawsuit']
['sirius playboy']


In [None]:
# Hold out a quarter of the sessions for testing; the rest is used for training
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.25  # 75% train / 25% test
SPLIT_SEED = 42       # fixed seed so the shuffle is reproducible

train, test = train_test_split(
    sessions,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Save train and test sets
# with open('data/train.pkl', 'wb') as f:
#     pkl.dump(train, f)
#
# with open('data/test.pkl', 'wb') as f:
#     pkl.dump(test, f)

In [16]:
# Tf-idf scores of all words in the queries
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer's default analyzer lower-cases and tokenizes *strings*.
# QueryTokens holds token lists when it comes straight from word_tokenize,
# which would fail with "'list' object has no attribute 'lower'". Join each
# list back into one space-separated document; values that are already
# strings (e.g. the column was reloaded from a saved file) pass through.
query_documents = df['QueryTokens'].map(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer
X = vectorizer.fit_transform(query_documents)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Create a dataframe with the feature names and their corresponding idf scores
idf_scores = pd.DataFrame({'Word': feature_names, 'IDF': vectorizer.idf_})

# Save the idf scores
# idf_scores.to_csv('data/idf-scores-precomputed.txt', sep='\t', index=False)

idf_scores

Unnamed: 0,Word,IDF
0,00,10.427107
1,000,10.697753
2,0000,13.797845
3,00000,15.001818
4,000000,11.381931
...,...,...
454167,öóêåíãøùçõú,15.407283
454168,ùù,15.407283
454169,úãöá,15.407283
454170,úä,15.407283
