Let's try to load the query logs

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from os import path

In [18]:
pd.set_option('display.max_rows', 100)

In [2]:
# steps to create a unified logs file.
def create_unified_log():
    if (not path.exists('data/logs.csv')):
        logs1 = pd.read_csv('data/logs/Clean-Data-01.txt', sep='\t')
        logs2 = pd.read_csv('data/logs/Clean-Data-02.txt', sep='\t')
        logs3 = pd.read_csv('data/logs/Clean-Data-03.txt', sep='\t')
        logs4 = pd.read_csv('data/logs/Clean-Data-04.txt', sep='\t')
        logs5 = pd.read_csv('data/logs/Clean-Data-05.txt', sep='\t')
        frames = [logs1, logs2, logs3, logs4, logs5]
        logs = pd.concat(frames) 
        # AnonID is not a unique column, so it would not be a good idea to make it an index
        #logs['AnonID'].nunique() #238,544
        #len(logs['AnonID']) # 394,2354

        # let's save it to a csv file
        logs.to_csv('data/logs.csv', index=False)
create_unified_log()

In [3]:
logs = pd.read_csv('data/logs.csv')
logs['Query'] = logs['Query'].str.strip()
logs = logs.rename(columns={'AnonID': 'UserId'})
logs['QueryTime'] = pd.to_datetime(logs['QueryTime'], format='%Y-%m-%d %H:%M:%S')
logs.head()
# 3,942,354 queries

Unnamed: 0,UserId,Query,QueryTime
0,142,merit release appearance,2006-04-22 23:51:18
1,217,lottery,2006-03-01 11:58:51
2,217,lottery,2006-03-27 14:10:38
3,217,vietnam,2006-05-22 17:43:42
4,217,vietnam,2006-05-22 18:03:24


In [4]:
# get the longest session
'''
current_user = logs.head().iloc(0)[0]['UserId']
current_time = logs.head().iloc(0)[0]['QueryTime']
max_session_time = 0
for index, row in logs.iterrows():
    user_id = row[0]
    q_time = row[2]
    if current_user_id != user_id:
        if index-1 >= 0:
            dif_time = (logs.loc[index-1]['QueryTime'] - q_time).total_seconds()
            if (dif_time > max_session_time):
                max_session_time = dif_time
            current_user_id = user_id
            current_time = q_time
max_session_time
'''
max_session_time = 7943593  # hard code it to improve performance

In [5]:
# set the maximum frequency of occurrence of any query in QL
max_frequency = logs.groupby('Query').count()[['UserId']].sort_values('UserId', ascending=False).iloc(0)[0][0]
#max_frequency = 83677 # hard code it to improve performance
#max_frequency = logs.groupby('Query').count()
max_frequency

83677

In [35]:
# set query
query = 'rosario' #'sven' #'rosario' #'frozen' #'pear'

In [36]:
#logs[logs['Query'] == query]
logs[logs['Query'].str.startswith(query) == True] #.head(100)

Unnamed: 0,UserId,Query,QueryTime
360974,4274267,rosario flores,2006-03-13 23:45:13
736324,20660760,rosario argentina,2006-04-22 20:31:19
736325,20660760,rosario argentina tours,2006-04-23 00:42:51
736326,20660760,rosario argentina tours,2006-04-23 00:44:26
736327,20660760,rosario argentina tours,2006-04-23 00:45:21
736328,20660760,rosario argentina,2006-04-23 00:49:21
736329,20660760,rosario argentina,2006-04-23 00:53:40
1045062,3120261,rosario resort,2006-04-06 19:59:08
1457112,15313953,rosario mexico,2006-04-24 18:28:58
2103689,8789815,rosario,2006-03-29 21:10:25


In [7]:
# gets a subset from the query
queries_equal_to_q = logs[logs['Query'] == query]
arr_unique_sessions_queries = []
current_user_id = 0
# get the unique sessions for the query because users could search 2 or more times the same query immediately in the same session.
for index, row in queries_equal_to_q.iterrows():
    user_id = row[0]
    if current_user_id != user_id:
        arr_unique_sessions_queries.append(row)
        current_user_id = user_id
queries_equal_to_q = pd.DataFrame(arr_unique_sessions_queries)
queries_equal_to_q


In [8]:
# Get query candidates for suggestions, the ones which in the same session changed from query text to query text + something else
arr_candidate_queries = []
time_differences = []
# loop through queries_equal_to_q to get the real candidates 
for index, row in queries_equal_to_q.iterrows():
    user_id = row[0]
    query = row[1]
    q_time = row[2]
    current_index = index + 1
    current_user_id = logs.loc[current_index, 'UserId']
    has_next = False
    is_invalid = False
    while (user_id == current_user_id):
        # check for current query text 
        current_query = logs.loc[current_index, 'Query']
        if (not pd.isna(current_query) and current_query.startswith(query) and len(current_query) > len(query)):
            arr_candidate_queries.append(logs.loc[current_index])
            time_differences.append((logs.loc[current_index]['QueryTime'] - q_time).total_seconds())
        # move to next row    
        current_index += 1
        current_user_id = logs.loc[current_index, 'UserId']

candidate_queries = pd.DataFrame(arr_candidate_queries)
if len(candidate_queries) > 0:
    candidate_queries['Time_Dif'] =  time_differences
    candidate_queries = candidate_queries.join(candidate_queries.groupby('Query')['Time_Dif'].min(), on="Query", rsuffix="_Min")
    # only get the queries that have the min difference in time
    # and remove duplicated queries based on 'Query', 'Time_Dif'
    candidate_queries[candidate_queries['Time_Dif'] == candidate_queries['Time_Dif_Min']].drop_duplicates(subset=['Query', 'Time_Dif'])
    candidate_queries


In [9]:
# set the frequency - 𝐹𝑟𝑒𝑞(𝐶𝑄)
query_counts = logs[logs['Query'].isin(candidate_queries['Query'])].groupby('Query')['Query'].count()
query_results = candidate_queries.join(query_counts, on='Query', lsuffix='_text')
query_results = query_results.rename(columns={'Query': 'Count', 'Query_text': 'Query'})
query_results['Freq'] = query_results['Count'] / max_frequency

# set the mod - 𝑀𝑜𝑑(𝐶𝑄,𝑞′)
sessions_count = candidate_queries.groupby('Query')['Query'].count().astype(object)
query_results = query_results.join(sessions_count, on='Query', lsuffix='_text').rename(columns={'Query': 'CountInSession', 'Query_text': 'Query'})
query_results['Mod'] = query_results['CountInSession'] / len(queries_equal_to_q) 

# set the time - 𝑇𝑖𝑚𝑒(𝐶𝑄, 𝑞′)
query_results['Time'] = query_results['Time_Dif_Min'] / max_session_time

# set the min values
min_freq = query_results['Freq'].min()
min_mod = query_results['Mod'].min()
min_time = query_results['Time'].min()

# calculate scores and sort
query_results['Score'] = (query_results['Freq'] + query_results['Mod'] + query_results['Time']) / 1 - (min_freq + min_mod + min_time)
#query_results['Score'] = query_results['Score'].astype('float64')
query_results.sort_values('Score', ascending=False)

KeyError: 'Query'

In [None]:
query_results.drop_duplicates(subset=['Query', 'Score']).sort_values('Score', ascending=False).head(10)['Query'].tolist()