In [None]:
import google.auth
from google.cloud import bigquery
import pandas_gbq
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from pigeon import annotate
import json
from src.utils.query_search_api import query_search_api
import regex
from scipy.stats import sem, t, beta
from scipy import mean
%matplotlib inline


In [None]:
# Notebook-wide pandas display settings: show at most 20 columns and 20 rows,
# and never truncate the text inside a cell (max_colwidth=None).
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = None

In [None]:
def create_big_query_client():
    """Return a ``bigquery.Client`` built from ``google.auth.default()``.

    ``google.auth.default()`` yields a ``(credentials, project_id)`` pair;
    both values are handed straight to the client constructor.
    """
    default_credentials, default_project = google.auth.default()
    client_kwargs = {
        'credentials': default_credentials,
        'project': default_project,
    }
    return bigquery.Client(**client_kwargs)

In [None]:
# Module-level BigQuery client built by create_big_query_client().
# NOTE(review): none of the visible cells reference `client` (the commented-out
# pandas_gbq.read_gbq call does not take it) — confirm it is still needed.
client = create_big_query_client()

In [None]:
# Root of the project checkout, taken from the PROJECT_DIR environment variable.
# The value is None when the variable is unset, in which case the path
# concatenations in later cells will fail.
PROJECT_DIR = os.environ.get('PROJECT_DIR')

In [None]:
# Best bets available here: https://search-admin.publishing.service.gov.uk/
best_bets = pd.read_csv(PROJECT_DIR+'/data/processed/best_bets.csv')

# This isn't how best bets are implemented, but it is near enough: build a
# single lower-cased alternation "(query one|query two|...)" of every best bet.
# Each query is escaped so that characters such as "?", "+", "." or "(" match
# literally instead of being read as regex syntax (or making the pattern fail
# to compile). Missing queries are skipped because NaN has no .lower().
best_bets_regex = "(" + "|".join(
    regex.escape(query.lower())
    for query in best_bets['query'].dropna().astype(str)
) + ")"

best_bets

In [None]:
# Sample of search queries over the last 2 years.
# The query returns one row per search hit: the lower-cased search term, a
# session id (fullVisitorId + visitStartTime) and the timestamp of the search.
# Only tables whose suffix ends in '01' are scanned; given the %Y%m%d suffix
# format used in the BETWEEN clause, that is the first day of each month.

query_sample ="""SELECT
  LOWER(hits.page.searchKeyword) as search_term
  ,CONCAT(CAST(fullVisitorId AS STRING), CAST(visitStartTime AS STRING)) AS session_id
  ,TIMESTAMP_SECONDS(visitStartTime+CAST(hits.time/1000 AS INT64)) as search_timestamp

FROM
  `govuk-bigquery-analytics.87773428.ga_sessions_*`,
  UNNEST(hits) AS hits
WHERE
  RIGHT(_table_suffix,2) = '01'
  AND hits.page.searchKeyword IS NOT NULL
  AND _table_suffix BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 24 MONTH))
    AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    """

# Uncomment the next two lines to re-run the query against BigQuery and refresh
# the cached CSV; by default the notebook reads the cached copy instead.
#search_df = pandas_gbq.read_gbq(query_sample)
#search_df.to_csv(PROJECT_DIR+'/data/processed/search_sample_2_years.csv')

# Cached query result. NOTE(review): the to_csv call above writes the index, so
# the loaded frame presumably carries an extra "Unnamed: 0" column — confirm.
search_df = pd.read_csv(PROJECT_DIR+'/data/processed/search_sample_2_years.csv')

In [None]:
# Add some useful cols to search data.
# The vectorised .str accessors are used instead of per-row lambdas because
# read_csv turns empty cells into NaN (a float), on which x.split() / x[:7]
# would raise.

# First 7 characters of the timestamp string (e.g. "YYYY-MM" for ISO timestamps)
search_df['date'] = search_df['search_timestamp'].str[:7]
# Whitespace-separated tokens; a missing search term yields an empty token list
search_df['tokens'] = search_df['search_term'].fillna('').astype(str).str.split()
# Number of tokens in the query (0 for a missing search term)
search_df['query_length'] = search_df['tokens'].str.len()

In [None]:
# Disabled labelling workflow, kept as a bare string literal so it never runs:
# sample 200 queries, fetch the top 5 search results for each, label every
# (query, result) pair with pigeon's `annotate`, map labels to scores and save
# them to relevancy_judgements.csv (the file loaded by the next cell).
# NOTE(review): the snippet reads from `sample[...]` / `sample.loc[...]` although
# the frame it builds is called `query_sample`; `sample` is not defined in the
# visible cells, so the snippet presumably needs that rename before it can be
# re-enabled — confirm.
"""
If you fancy labelling some queries according to their relvance...

If you do, you might want to consider filtering/turning off best bets prior to labelling!

#Get top n search responses for each of our sampled queries
query_sample = search_df.sample(200)
query_sample['top_response']=query_sample['search_term'].map(lambda x:query_search_api(x,5))

# Futz around with json fields
query_sample = query_sample.explode('top_response')
query_sample['description'] = sample['top_response'].map(lambda x:x.get('description',np.nan) if not isinstance(
                x, float) else [])
query_sample['id'] = sample['top_response'].map(lambda x:x.get('_id',np.nan) if not isinstance(
                x, float) else [])
query_sample['rank'] = sample['top_response'].map(lambda x:x.get('original_rank',np.nan) if not isinstance(
                x, float) else [])
query_sample= query_sample.reset_index(drop=True)


from ast import literal_eval

relevancy_judgments = annotate(query_sample.index,
                               options=['very relevant','somewhat relevant','not relevant','no good answer'],
                               display_fn=lambda idx : display(sample.loc[idx,['search_term','description','id']]))

query_sample['relevancy']= relevancy_judgments

query_sample['relevancy'] =  query_sample['relevancy'].map(lambda x: literal_eval(x) if not isinstance(x, float) else np.nan)
score = {'not relevant':0,'somewhat relevant':1,'very relevant':2,'no good answer':np.nan}
query_sample['score'] = query_sample['relevancy'].map(lambda x: score[x[1]])

query_sample.to_csv(PROJECT_DIR+'/data/processed/relevancy_judgements.csv')

"""

In [None]:
# Query results labelled (by me!) according to their relevance
relevancy_judgements = pd.read_csv(PROJECT_DIR +'/data/processed/relevancy_judgements.csv')

# Let's just remove best bets from the sample (should have removed them before labelling).
# The flag comes from the `regex` module that the notebook imports; the stdlib
# `re` module is never imported, so referring to `re.IGNORECASE` here would
# raise NameError on a fresh kernel. The pattern is compiled once rather than
# re-parsed for every row.
best_bets_pattern = regex.compile(best_bets_regex, regex.IGNORECASE)
# Non-string search terms (e.g. NaN from empty CSV cells) are never best bets.
relevancy_judgements["best_bet"] = relevancy_judgements['search_term'].map(
    lambda term: bool(best_bets_pattern.search(term)) if isinstance(term, str) else False)

In [None]:
# Query length over time: mean number of tokens per `date` bucket
plt.style.use('ggplot')
avg_len = search_df.groupby('date')['query_length'].mean().reset_index()

plt.figure(figsize=(10, 5))
plt.plot(avg_len['date'], avg_len['query_length'], color='green')
plt.ylabel("Average length")
plt.xticks(rotation=45, ha="right")

plt.show()

In [None]:
# Query length histogram (looks exponential-ish).
# Normalised histogram (density=True) of tokens per query, zoomed to lengths 1-15.
fig, ax = plt.subplots(1, 1)
plt.style.use('ggplot')
plt.hist(search_df['query_length'], color='green',bins=300,density=True)
plt.xlabel("Query length")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Density")
plt.xlim(1,15)
# Tick positions 1..14 (np.arange excludes the stop value 15)
plt.xticks(np.arange(1, 15, step=1))
# Vertical marker at the 75th percentile of query length
ax.axvline(x=np.quantile(search_df['query_length'],0.75))
fig.set_size_inches(10,5)

plt.show()

In [None]:
# Share of queries per hand-labelled category (categories as decided by me):
#   specific_service  - query names (or almost names) a specific service or form,
#                       regardless of whether that service is on gov.uk
#   specific_guidance - query names a specific guidance/stats doc on gov.uk
#   guidance          - query references a topic the user presumably wants guidance about
#   contact           - query explicitly asks for contact, email, phone number, chat etc
#   unknown           - mysterious!
#
# To (re)do the labelling:
# annotation = annotate(df['search_term'],options=['specific_service','specific_guidance','guidance','contact','unknown'])
# query_cats =pd.DataFrame(annotation, columns=['query','category'])
# query_cats.to_csv(PROJECT_DIR+'/data/processed/query_categories.csv')
query_cats = pd.read_csv(PROJECT_DIR + '/data/processed/query_categories.csv')

# Number of labelled queries per category, largest first
query_cat = (
    query_cats
    .groupby('category')
    .size()
    .reset_index(name='cat_count')
    .sort_values('cat_count', ascending=False)
)
# Each category's count as a percentage of all labelled queries
query_cat['Percentage'] = query_cat['cat_count'].div(query_cat['cat_count'].sum()).mul(100)

plt.style.use('ggplot')

plt.figure(figsize=(10, 5))
plt.bar(query_cat['category'], query_cat['Percentage'], color='green')
plt.xticks(rotation=45, ha="right")
plt.xlabel("Query category")
plt.ylabel("%")

plt.show()

In [None]:
# Precision at k (k=5)
# https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision_at_K
#
# A result counts as relevant when its score is greater than 1.

# Drop best-bet queries and every row with a missing value in any column.
# .copy() makes this an independent frame, so columns added to it later do not
# write into a slice of relevancy_judgements.
no_bestbets = relevancy_judgements[relevancy_judgements['best_bet']==False].dropna().copy()

# Count the relevant results per query *after* grouping. Filtering on
# score > 1 before the groupby would drop every query that has no relevant
# result at all, so those queries would never contribute a precision of 0 and
# every downstream average would be biased upwards.
precision = (
    no_bestbets
    .assign(relevant=no_bestbets['score'] > 1)
    .groupby(['search_term','session_id','query_length'])['relevant']
    .sum()
    .div(5)  # k = 5 results were retrieved per query
    .reset_index(name='score')
)

In [None]:
# Avg Precision at 5, by query length, with a 95% confidence band

# Mean, group size and standard error of the per-query precision for each length
avg_score = precision.groupby('query_length')['score'].agg(['mean','size','sem'])
# Half-width of the two-sided 95% interval: sem * t quantile with (size - 1) degrees of freedom
avg_score['h'] = avg_score['sem'] * t.ppf((1 + 0.95) / 2, avg_score['size'] - 1)
avg_score['lower'] = avg_score['mean'] - avg_score['h']
avg_score['upper'] = avg_score['mean'] + avg_score['h']
avg_score = avg_score.reset_index()

plt.style.use('ggplot')
# Create exactly one figure. Calling plt.figure() and then plt.subplots()
# opens two figures, and the first one is rendered as an empty extra output.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

ax.plot(avg_score['query_length'], avg_score['mean'], color='green')
ax.fill_between(avg_score['query_length'],avg_score['lower'],avg_score['upper'] , color='b', alpha=.1)
plt.xlabel("Query length")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Average Precision @5")
plt.xlim((1,5))
plt.ylim((0,1))
plt.xticks(np.arange(1, 6, step=1))
plt.show()


In [None]:
# Average max scoring doc in top 5

# Highest score among the labelled results of each search term.
# The columns are selected explicitly: GroupBy.max() has no column argument
# (its first positional parameter is `numeric_only`), so .max('score') does
# not mean "max of the score column".
vrel_rate = no_bestbets.groupby('search_term')[['query_length','score']].max().reset_index()

# Aggregate the score column only. Including the string search_term column in
# the aggregation makes 'mean' fail on pandas >= 2.0, which no longer silently
# drops columns an aggregation cannot handle.
vrel_rate = vrel_rate.groupby('query_length')['score'].agg(['mean','size','sem'])
# Half-width of the two-sided 95% interval: sem * t quantile with (size - 1) degrees of freedom
vrel_rate['h'] = vrel_rate['sem'] * t.ppf((1 + 0.95) / 2, vrel_rate['size'] - 1)
vrel_rate['lower'] = vrel_rate['mean'] - vrel_rate['h']
vrel_rate['upper'] = vrel_rate['mean'] + vrel_rate['h']
vrel_rate = vrel_rate.reset_index()
vrel_rate


In [None]:
# Score vs query length: mean of the best score per query, with its 95% band
plt.style.use('ggplot')
# Create exactly one figure. Calling plt.figure() and then plt.subplots()
# opens two figures, and the first one is rendered as an empty extra output.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

ax.plot(vrel_rate['query_length'], vrel_rate['mean'], color='green')
ax.fill_between(vrel_rate['query_length'],vrel_rate['lower'],vrel_rate['upper'] , color='b', alpha=.1)
plt.xlabel("Query length")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Average max scoring doc")
plt.xlim(1,6)
plt.ylim(0,3)
plt.show()


In [None]:
# Estimate the rate of returning a very relevant result within top 5 results
# It's 45%-65%

# .assign returns a new frame instead of writing into a filtered one, so the
# cell can be re-run safely and does not trigger SettingWithCopyWarning.
no_bestbets = no_bestbets.assign(very_relevant=no_bestbets['score'] > 1)

# One flag per (session, query): True when any of its results is very relevant.
# The column is selected explicitly because GroupBy.max() takes no column
# argument (its first positional parameter is `numeric_only`).
query_success = no_bestbets.groupby(['session_id','search_term'])['very_relevant'].max()

# Counts per outcome, always listing both False and True (in that order) so a
# sample in which every query succeeds, or none does, still has two rows.
success_rate = (
    query_success.groupby(query_success).size()
    .reindex([False, True], fill_value=0)
    .rename_axis('very_relevant')
    .reset_index(name='count')
)

# Successful very relevant results
a = int(query_success.sum())
# Not very relevant results
b = int(len(query_success) - a)


fig, ax = plt.subplots(1, 1)

# Grid spanning the 1st to 99th percentile of the Beta(a, b) distribution
x = np.linspace(beta.ppf(0.01, a, b),
                beta.ppf(0.99, a, b), 100)

# Central 95% interval of Beta(a, b): its 2.5% and 97.5% quantiles
lower = beta.ppf(0.025, a, b, loc=0, scale=1)
upper = beta.ppf(0.975, a, b, loc=0, scale=1)
ax.plot(x, beta.pdf(x, a, b),
        'r-', lw=5, alpha=0.6, label='beta pdf')

plt.ylabel("Density")
plt.xlabel('Very relevant rate')
fig.set_size_inches(10,5)
ax.axvline(x=lower)
ax.axvline(x=upper)
plt.show()