In [None]:
import google.auth
from google.cloud import bigquery
import pandas_gbq
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import json
from py2neo import Graph
import os
from ast import literal_eval
import json
from py2neo import Graph
from collections import Counter
import nltk
from nltk.collocations import *
import string
from scipy.stats import entropy
from nltk.tokenize import MWETokenizer

In [None]:
# Widen the pandas display limits so wide frames and long text cells are shown in full.
for display_option, display_limit in {
    'display.max_columns': 50,
    'display.max_rows': 500,
    'display.max_colwidth': 500,
}.items():
    pd.set_option(display_option, display_limit)

# Root directory of the project, read from the environment (None when the variable is unset).
PROJECT_DIR = os.environ.get('PROJECT_DIR')

In [None]:
# Show the project directory read from the environment, to confirm it is set before any file is loaded.
PROJECT_DIR

In [None]:
# Connect to the knowledge graph through py2neo.
# The user name and password are taken from the NEO_USER / NEO_PASSWORD environment variables.
neo_credentials = (os.getenv('NEO_USER'), os.getenv('NEO_PASSWORD'))
graph = Graph(
    host='knowledge-graph.integration.govuk.digital',
    auth=neo_credentials,
    secure=True,
)

In [None]:
# Connect to BigQuery.
# The GOOGLE_APPLICATION_CREDENTIALS environment variable must point to a JSON file
# containing the BigQuery credentials.
def create_big_query_client():
    """Build a BigQuery client from the application-default Google credentials.

    Returns:
        A ``bigquery.Client`` created with the credentials and project id
        returned by ``google.auth.default()``.
    """
    default_credentials, default_project = google.auth.default()
    bq_client = bigquery.Client(
        credentials=default_credentials,
        project=default_project,
    )
    return bq_client


client = create_big_query_client()

In [None]:
# Fetches every page view that happened before the first search in a session.
# Multiple search queries made in the same session are aggregated into a single string.
# Over a couple of days this is a few gigabytes of data; more data gives better results.
# Illustrative shape of the result (one row per page view, search terms repeated on each row):
"""
session_id | viewed_page | search_terms 
    123    |    /mot     |  mot, mot check
    123    | /check-mot  |  mot, mot check 
"""

# Structure of the SQL below:
#   * `action`      - one row per session (plus country/region/metro) that has at least one
#                     non-null search keyword: the lower-cased keywords aggregated with
#                     string_agg, and the timestamp of the earliest search hit.
#   * `viewedpages` - one row per hit of type "PAGE": page title, page path and hit timestamp.
#   * Both sub-queries read the ga_sessions_* tables whose suffix lies between two days ago
#     and one day ago, relative to CURRENT_DATE().
#   * The two are joined on session_id, and only page views with a non-null title that
#     occurred strictly before the session's first search are kept.
#   * hit_n numbers the page views of each session from the most recent one backwards.
query = """
SELECT
    country,
    region,
    metro,
    action.session_id,
    viewedpages.pageTitle,
    viewedpages.pagePath,
    first_search_timestamp,
    search_terms,
    pageview_timestamp,
    ROW_NUMBER() OVER (PARTITION BY action.session_id ORDER BY pageview_timestamp DESC) as hit_n
    FROM (
      SELECT
          geoNetwork.country,
          geoNetwork.region,
          geoNetwork.metro,
          CONCAT(CAST(fullVisitorId AS STRING), CAST(visitId AS STRING)) AS session_id,
          string_agg(LOWER(hits.page.searchKeyword)) as search_terms,
          MIN(TIMESTAMP_SECONDS(visitStartTime+CAST(hits.time/1000 AS INT64))) as first_search_timestamp
          FROM
          `govuk-bigquery-analytics.87773428.ga_sessions_*`,
          UNNEST(hits) AS hits
          WHERE
          _table_suffix BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY))
              AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
          AND hits.page.searchKeyword IS NOT NULL
          GROUP BY session_id, geoNetwork.region,geoNetwork.metro, geoNetwork.country
      ) AS action
LEFT JOIN (
    SELECT
        CONCAT(CAST(fullVisitorId AS STRING), CAST(visitId AS STRING)) AS session_id,
        hits.page.pageTitle as pageTitle,
        hits.page.pagePath as pagePath,
        TIMESTAMP_SECONDS(visitStartTime+CAST(hits.time/1000 AS INT64)) as pageview_timestamp,
        FROM
        `govuk-bigquery-analytics.87773428.ga_sessions_*` 
        CROSS JOIN UNNEST(hits) AS hits
        WHERE _table_suffix BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY))
          AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)) 
        AND hits.type = "PAGE"
      ) as viewedpages
ON viewedpages.session_id = action.session_id
WHERE pageTitle is not null
AND pageview_timestamp < first_search_timestamp
"""

In [None]:
# Load the cached result of the BigQuery page-history query from disk.
# To refresh the cache, run the two commented-out lines below first.
# NOTE(review): PROCESSED_DATA is not defined in the visible cells; presumably it is meant
# to be PROJECT_DIR + '/data/processed' - confirm before uncommenting.
#queries_df = pandas_gbq.read_gbq(query)
#queries_df.to_csv(PROCESSED_DATA+'/pagehistoryqueries.csv')
queries_csv_path = PROJECT_DIR + '/data/processed/pagehistoryqueries.csv'
queries_df = pd.read_csv(queries_csv_path)


In [None]:
# Preview the first 10 rows of the query + session data
queries_df.head(10)

In [None]:
# Taxons for every piece of content, loaded from the cached CSV.
# To refresh the cache from the knowledge graph, run the two commented-out lines below first.
# NOTE(review): PROCESSED_DATA is not defined in the visible cells; presumably it is meant
# to be PROJECT_DIR + '/data/processed' - confirm before uncommenting.
#taxons_df =graph.run("MATCH (c:Cid)-[r:IS_TAGGED_TO]->(t:Taxon)RETURN c.name as pagePath,t.name as taxon").to_data_frame()
#taxons_df.to_csv(PROCESSED_DATA+'/taxons.csv')
taxons_csv_path = PROJECT_DIR + '/data/processed/taxons.csv'
taxons_df = pd.read_csv(taxons_csv_path)


In [None]:
# Inner-join the taxons onto the session/query data via pagePath.
# One query ends up with many rows: each query can have several pages viewed before it,
# and each page can be tagged to several taxons.
# The search terms were aggregated into one comma-joined string per session in the SQL,
# so they are split apart again here and exploded to one row per individual query.
query_taxons = (
    queries_df
    .merge(taxons_df, on='pagePath')
    .assign(query=lambda merged: merged['search_terms'].map(lambda terms: terms.split(',')))
    .explode('query')
)
# Mean number of rows per session after the merge and explode
# (the original note reported a value of around 7).
query_taxons.groupby('session_id').size().mean()

In [None]:
# Rather than a bag of words, work with a bag of entities: load the entity list.
entities_csv_path = PROJECT_DIR + '/data/processed/content_entities.csv'
entities = pd.read_csv(entities_csv_path)

In [None]:
# Build a multi-word-expression (MWE) tokenizer from the entity names, so that an entity made
# of several words is emitted as one joined token instead of separate words.
# NOTE(review): entity names are split as-is, while queries and body text are lower-cased and
# stripped of punctuation before matching - confirm content_entities.csv is already normalised.
entities['tokens'] = entities['name'].map(lambda name: tuple(name.split()))
tokenizer = MWETokenizer(list(set(entities['tokens'])))

# A few extra multi-word tokens that are not taken from the entity list.
# add_mwe() registers ONE expression per call: passing the whole list at once would register
# a single three-element expression whose "tokens" are tuples, which can never match text.
for extra_mwe in [('log', 'in'), ('sign', 'in'), ('sign', 'up')]:
    tokenizer.add_mwe(extra_mwe)

# Token form of each entity after MWE merging; this is the form compared against query/body tokens.
entities['entity'] = entities['tokens'].map(lambda name_tokens: tokenizer.tokenize(name_tokens))
entities

In [None]:
# Flatten the per-entity token lists into one set, used for fast membership tests.
exploded_entities = entities['entity'].explode()
entity_set = set(exploded_entities)
entity_set

In [None]:
# Text preprocessing: translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)


def _query_to_tokens(raw_query):
    """Strip punctuation, split on whitespace and merge multi-word expressions."""
    return tokenizer.tokenize(raw_query.translate(translator).split())


def _keep_known_entities(query_tokens):
    """Keep only the tokens that are known entities."""
    return [candidate for candidate in query_tokens if candidate in entity_set]


# Tokenise each query, keep the entity tokens, explode to one row per entity and regroup,
# ending with one list of entities (and of token lists) per taxon.
# NOTE: this cell overwrites query_taxons and drops its 'query' column, so it cannot be
# re-run without first re-running the cell that builds query_taxons.
query_taxons['tokens'] = query_taxons['query'].map(_query_to_tokens)
query_taxons['entities'] = query_taxons['tokens'].map(_keep_known_entities)
query_taxons = (
    query_taxons
    .explode('entities')
    .loc[:, ['taxon', 'entities', 'tokens']]
    # Queries without any known entity explode to NaN and are removed here.
    .dropna()
    .groupby('taxon')
    .aggregate(lambda values: list(values))
    .reset_index()
)

In [None]:
# Load the mainstream content and count the entities mentioned in each page.
title_df = pd.read_csv(PROJECT_DIR+'/data/processed/mainstreamcontent.csv')
# Rows with any missing value are discarded.
title_df = title_df.dropna()

# Up-weight the title by appending it TITLE_WEIGHT times to the body text.
# The explicit ' ' between text and title matters: without it the last word of the text and
# the first word of the title are fused into one bogus token, so both are lost.
TITLE_WEIGHT = 5
title_df['body_text'] = title_df['text'] + ' ' + (title_df['title'] + ' ') * TITLE_WEIGHT

# Same preprocessing as for the queries: lower-case, strip punctuation, split on whitespace,
# then merge multi-word expressions.
title_df['body_tokens'] = title_df['body_text'].map(lambda x: tokenizer.tokenize(x.lower().translate(translator).split()))
# Keep only tokens that are known entities, then count occurrences per page.
title_df['entities']  = title_df['body_tokens'].map(lambda x: [entity for entity in x if entity in entity_set])
title_df['entity_count'] = title_df['entities'].map(Counter)

In [None]:
# Bag of entities per taxon: one row per taxon, one column per entity, values are counts.
query_taxons['entity_count'] = query_taxons['entities'].map(Counter)
query_boe = pd.DataFrame.from_records(
    query_taxons['entity_count'],
    index=query_taxons['taxon'],
)
# Entities never seen for a taxon come out as NaN; treat them as a count of zero.
query_boe = query_boe.fillna(0)
# Normalise every row so it sums to 1, i.e. a probability distribution over entities.
row_totals = query_boe.sum(axis=1)
query_dis = query_boe.div(row_totals, axis=0)

In [None]:
# Bag of entities for mainstream content, restricted to the entity columns seen in the queries.
# The counters are passed as a plain list: title_df keeps the index gaps left by dropna(),
# and from_records inspects data[0], which on a Series is a label lookup that raises
# KeyError when row 0 was dropped.
service_boe = pd.DataFrame.from_records(
    title_df['entity_count'].tolist(),
    index=title_df['name'],
    columns=query_boe.columns,
)
# Entities absent from a page come out as NaN; treat them as a count of zero.
service_boe = service_boe.replace(np.nan,0)
# Add-one smoothing, so that no entity has zero probability in any page distribution.
service_boe = service_boe +1
# Normalise every row so it sums to 1.
service_dis = service_boe.divide(service_boe.sum(1),0)
service_dis.shape

In [None]:
# Sanity check: the query and content distributions must have the same number of entity columns.
assert(query_dis.shape[1]==service_dis.shape[1])


In [None]:
# Rank mainstream content by KL divergence from the taxon's query-entity distribution and
# keep the closest matches.
# https://en.wikipedia.org/wiki/Information_projection
# Add-one smoothing, and the choice of which distribution is p and which is q, both make
# quite a big difference to the results.
# p is the normalised bag of entities built from the queries of one taxon.
# q is the normalised bag of entities of every piece of mainstream content.
# e.g. try 'Blue badges', 'Afghanistan' etc.
TAXON = "Stopping or selling your business"

# Repeat p once per content row so it lines up with q row by row.
taxon_dis = query_dis.loc[TAXON].to_numpy()
taxon_dis_tiled = np.broadcast_to(taxon_dis, service_dis.shape)
# With two arguments, entropy() returns the KL divergence of p from q along the given axis.
kl_per_content = entropy(taxon_dis_tiled, service_dis, axis=1)
# Positions of the 15 pieces of content with the smallest divergence.
idx = np.argsort(kl_per_content)[:15]
service_boe.iloc[idx].index