# Hierarchical clustering

In [None]:
import sys
sys.path.append("..")
import src.utils.regex as regex
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import hdbscan
! spacy download en_core_web_lg
import spacy

from nltk.stem.porter import PorterStemmer
STEMMER = PorterStemmer()
import nltk
from nltk.corpus import stopwords
from collections import Counter

Many entries are questions about travel advice, and they often mention individual country names.
This meant the clusterer was grouping entries by country name, which wasn't ideal.
The same goes for months and other dates, so both are replaced with a generic placeholder.

In [None]:
model = spacy.load('en_core_web_lg')
def remove_common_terms(text):
    """Swap place and date mentions for their spaCy entity label.

    The text is parsed once with the loaded spaCy model. For every entity
    labelled "GPE" or "DATE", each occurrence of that entity's text in the
    string is replaced by the label itself, so entries no longer differ
    only by the country or month they mention.

    Args:
        text: The string to process.

    Returns:
        The string with the recognised entities replaced by "GPE" / "DATE".
    """
    # The entities come from the original text; the replacements accumulate
    # on the running copy.
    for entity in model(text).ents:
        if entity.label_ in ("GPE", "DATE"):
            text = text.replace(entity.text, entity.label_)
    return text

# Quick manual check on a typical entry
remove_common_terms("to find out an update for my holiday in mexico in april")

Clean the data. There is a lot going on here; each step is explained in the comments.

In [None]:
# Read in dataset
df = pd.read_csv("../data/raw/joined_uis_all_of_march.csv")
q3 = "Q3"
# Keep an untouched copy of the answers; the Q3 column itself is replaced
# with cleaned text further down.
df['q3_copy'] = df[q3]

# One page slug per line. Blank lines are skipped so that an empty string never
# ends up in the list (it would match an empty page-sequence segment), and
# the context manager makes sure the file is closed again.
with open('../data/raw/coronavirus_page_slugs.txt') as slug_file:
    corona_slugs = [line.strip() for line in slug_file if line.strip()]
# Feedback has to match this pattern to count as coronavirus related
corona_related_items_regex = regex.coronavirus_misspellings_and_typos_regex() + '|sick pay|ssp|sick|isolation|closures|quarantine|closure|cobra|cruise|hand|isolat|older people|pandemic|school|social distancing|symptoms|cases|travel|wuhan|care|elderly|care home|carehome'

# Different wordings that mean the same thing, mapped to one standard term.
# clean_text walks this dict in insertion order, so the longer phrases are
# listed before the shorter ones they contain ("self isolation" before
# "isolation", "statuatory sick pay" before "sick pay").
# NOTE(review): "statuatory" is spelled this way in the key; it is unclear from
# here whether that mirrors a common typo in the feedback or is itself a typo
# for "statutory" - confirm before changing it.
same_terms = {
    "travelling": "travel",
    "travellers": "travel",
    "holiday": "travel",
    "self-isolation": "quarantine",
    "selfisolation": "quarantine",
    "self isolation": "quarantine",
    "isolation": "quarantine",
    "statuatory sick pay": "ssp",
    "sick pay": "ssp",
}

def clean_text(text):
    """Normalise one free-text feedback entry ahead of clustering.

    Steps, in order: keep the phone numbers 111 and 999 by spelling them out,
    drop every character that is not a letter, whitespace or a colon, replace
    place and date mentions via remove_common_terms, lower-case the text,
    strip coronavirus spellings and "I want to know"-style lead-ins, and
    finally standardise equivalent wordings using same_terms.

    Args:
        text: The raw entry; it is converted with str() first, so non-string
            values (for example NaN) are accepted.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text)
    # We'll be removing non alphabetical characters but we want to keep the non emergency phone number
    # '111' in, so we'll just replace that with text
    text = text.replace("111", "oneoneone")
    # Same for 999
    text = text.replace("999", "nineninenine")
    # Remove non alphabetical or space characters (colons are kept as well).
    # Raw string, so "\s" reaches the regex engine instead of being treated
    # as an invalid string escape.
    text = re.sub(r"[^a-zA-Z\s:]", "", text)
    # Use our function from previous cell
    text = remove_common_terms(text)
    # This is done after remove_common_terms because spacy doesn't
    # always recognise country names without a capital letter at the beginning!
    text = text.lower()
    text = re.sub(regex.coronavirus_misspellings_and_typos_regex() + "|virus", "", text)
    # People using different terms for "I want to know", so just remove those
    text = re.sub("wanted to find out|to look up about|to get an update|to find infos|to find info|to find out|to understand|to read the|check on advice|to check|ti get advice|to get advice|for information on", "", text)
    for word_to_replace, word_to_replace_with in same_terms.items():
        # str.replace returns a new string, so the result has to be assigned
        # back; otherwise the standardisation is silently lost.
        text = text.replace(word_to_replace, word_to_replace_with)
    return text

df[q3] = df[q3].apply(clean_text)

# Drop the rows that have no page sequence
df = df[df['PageSequence'].notnull()].reset_index(drop=True)

# Only rows relevant to coronavirus should be clustered, which is what the
# 'has_corona_page' column records.
# It is only true when the user visited a corona page AND used a relevant term in the feedback
# (there was some irrelevant stuff about passports; we may want to drop the need for a relevant term,
# as people may be using terms not in that list and we might miss out on some insights)
for index, row in df.iterrows():
    mentions_relevant_term = re.search(corona_related_items_regex, df.at[index, q3]) is not None
    # The page sequence is only inspected when the feedback text matched
    has_corona_page = mentions_relevant_term and any(
        slug in corona_slugs or "coronavirus" in slug
        for slug in row['PageSequence'].split(">>")
    )
    df.at[index, 'has_corona_page'] = has_corona_page
df = df[df['has_corona_page']].reset_index(drop=True)

# Keep one row per user
df = df.drop_duplicates('intents_clientID')

df.head()

In [None]:
def stem_tokens(tokens):
    """Return a list with the Porter stem of every token, in input order."""
    return list(map(STEMMER.stem, tokens))

def tokenize(text):
    """Split text into NLTK word tokens and return their stems as a list."""
    return stem_tokens(nltk.word_tokenize(text))

In [None]:
def recursive_clustering(df, previous_cluster_level, previous_cluster_label, cluster_label):
    """Split one cluster into sub-clusters, then recurse into each of them.

    The rows where df[previous_cluster_label] == cluster_label are vectorised
    with TF-IDF and clustered with HDBSCAN. The outcome is stored in two new
    columns: "cluster_<previous_cluster_label>_<level>_<cluster_label>"
    holds the assigned label ("" for rows outside this cluster) and the same
    name with a "_probability" suffix holds HDBSCAN's membership score.

    Args:
        df: The frame to extend; it is modified in place and also returned.
        previous_cluster_level: Depth of the cluster being split.
        previous_cluster_label: Name of the column holding that cluster's labels.
        cluster_label: The label inside that column selecting the rows to split.

    Returns:
        The same frame, with the columns for this level and all deeper levels.
    """
    # Built once; the rows selected here do not change while we add columns.
    in_cluster = df[previous_cluster_label] == cluster_label
    count = int(in_cluster.sum())
    # Probably no point getting more granular than this
    if count < 10:
        return df
    # Try to automatically get a good minimum cluster size.
    # I _think_ there is a lot of potential to improve this, I've done
    # similar things before with automatically trying a few values
    # and selecting the best on some metric. This is a bit basic
    min_cluster_size = max(2, count // 40)
    vectorizer = TfidfVectorizer(tokenizer=tokenize, analyzer='word', stop_words=stopwords.words('english'), max_features=100, ngram_range=(1,3) )
    X = vectorizer.fit_transform(df.loc[in_cluster, q3]).toarray()
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,min_samples=2, cluster_selection_method='leaf')
    clusterer.fit(X)
    # labels_ is the cluster each row has been assigned to
    level = previous_cluster_level + 1
    new_cluster_label = f"cluster_{previous_cluster_label}_{level}_{cluster_label}"
    df[new_cluster_label] = ""
    # labels_ and probabilities_ are in the same order as the selected rows.
    # The probability is the score for this specific cluster. It can be used when
    # presenting the output: sorting by it puts the 'best' examples of a cluster first.
    selected_rows = df.index[in_cluster.to_numpy()]
    for row_index, label, probability in zip(selected_rows, clusterer.labels_, clusterer.probabilities_):
        df.at[row_index, new_cluster_label] = label
        df.at[row_index, f"{new_cluster_label}_probability"] = probability
    # Recurse once per distinct label. labels_ has one entry per row, so looping
    # over it directly would redo the same sub-clustering for every row.
    for label in sorted(set(clusterer.labels_)):
        # -1 is can't be clustered
        if label >= 0:
            df = recursive_clustering(df, level, new_cluster_label, label)
    return df
    
# Seed the hierarchy: every row starts in one root cluster labelled 0
df["cluster_0"] = 0
# NOTE(review): nothing in the visible code reads this column again (the
# per-level scores go into "<column>_probability" columns) - confirm it is
# still needed.
df["probabilities"] = ""
# Split the root cluster; deeper levels are handled by the recursion
df = recursive_clustering(df, 0, "cluster_0", 0)


### Output clusters

This would be better done in a tree format, which would also allow for exploration, but I didn't have time. This is a good example of what I'm thinking of (and it should be easy to implement): https://bl.ocks.org/d3noob/8375092

In [None]:
# Collect every column with 'cluster' in its name, leaving out the
# accompanying '..._probability' columns.
cluster_columns = [
    column
    for column in df.columns
    if "cluster" in column and "probability" not in column
]

def cluster_name(column, group):
    """Build a short name from a cluster column and a group label.

    Every "cluster_" occurrence is removed from the column name and the
    string form of the group is appended directly, with no separator.
    """
    stripped_column = column.replace("cluster_", "")
    group_suffix = str(group)
    return stripped_column + group_suffix
        
def get_parent_cluster_name(column, group):
    """Return the longest shorter cluster-column name contained in this one.

    The name for (column, group) is built with cluster_name and compared
    against the stripped name of every entry in cluster_columns. Among the
    names that are strictly shorter and occur inside it, the longest one is
    returned; ties go to the column that comes first in cluster_columns.

    Args:
        column: The cluster column to find a parent for.
        group: Group label appended to the column name before matching.

    Returns:
        The matching stripped column name, or "" when there is none.
    """
    # NOTE(review): the value returned here is a stripped column name without a
    # group suffix, while cluster_name(col, group) for an actual cluster does
    # append the group. Code that compares the two for equality will
    # presumably not match real clusters - verify against the tree set-up.
    column = cluster_name(column, group)
    candidates = []
    for other_column in cluster_columns:
        other_name = cluster_name(other_column, "")
        # A plain substring test: the names are literal text, not regex
        # patterns, so any special characters in them are taken literally.
        if other_name and len(other_name) < len(column) and other_name in column:
            candidates.append(other_name)
    if not candidates:
        return ""
    # max() keeps the first of several equally long names
    return max(candidates, key=len)
    
# Build one summary dict per cluster: its name, its parent's name, its size
# and its three highest-scoring example entries.
clusters = []
for col in cluster_columns:
    # "" is the placeholder for rows that were not part of the parent
    # cluster at all, so it is not a cluster in its own right.
    unique_groups = [
        group for group in df[col].unique()
        if not (isinstance(group, str) and group == "")
    ]
    for group in unique_groups:
        this_cluster = df[df[col] == group]
        if col != "cluster_0":
            # Sort the members of this cluster (not the whole frame) so that
            # the rows picked below really are the best examples. The root
            # column has no probability column to sort by.
            this_cluster = this_cluster.sort_values(by=[col + '_probability'], ascending=False)
        this_cluster = this_cluster.reset_index(drop=True)
        # Three examples are shown per cluster, so smaller ones are skipped
        if this_cluster.shape[0] < 3:
            continue
        cluster = {}
        cluster['cluster_name'] = cluster_name(col, group)
        cluster['parent_cluster_name'] = get_parent_cluster_name(col, "")
        num_examples = this_cluster.shape[0]
        example_one = this_cluster.at[0, q3]
        example_two = this_cluster.at[1, q3]
        example_three = this_cluster.at[2, q3]
        name = f"Num entries: {num_examples}\n{example_one}\n{example_two}\n{example_three}"
        cluster['name'] = name
        cluster['num_examples'] = num_examples
        # Line breaks are removed so each example stays on one output line
        cluster['example_one'] = example_one.replace("\n", "")
        cluster['example_two'] = example_two.replace("\n", "")
        cluster['example_three'] = example_three.replace("\n", "")
        cluster['children'] = []
        clusters.append(cluster)

In [None]:
# Link the clusters into a tree: each cluster collects, once each, the
# clusters that name it as their parent.
for cluster in clusters:
    parent_cluster_name = cluster['cluster_name']
    for other_cluster in clusters:
        if other_cluster['parent_cluster_name'] != parent_cluster_name:
            continue
        # Skip anything already listed under the same name
        cluster_is_in_children = any(
            child_cluster['cluster_name'] == other_cluster['cluster_name']
            for child_cluster in cluster['children']
        )
        if not cluster_is_in_children:
            cluster['children'].append(other_cluster)

Making the tree work in d3 is a good idea for the future, but it was quite a lot of work and I didn't really have the time, so here is a way to output it as a CSV instead.

In [None]:
def recursive_addition(depth, cluster):
    """Render a cluster and all of its descendants as CSV text.

    Each cluster becomes five lines (a spacer, a title with the entry
    count, and the three examples), each prefixed with `depth` commas so
    that deeper levels land in columns further to the right. Children follow
    their parent, one level deeper.

    Args:
        depth: Number of leading commas for this cluster's lines.
        cluster: Dict with 'num_examples', 'example_one', 'example_two',
            'example_three' and 'children' (a list of dicts of the same form).

    Returns:
        The rendered text; every line ends with a newline.
    """
    indent = "," * depth
    lines = [
        indent,
        indent + f"Cluster with {cluster['num_examples']} entries - top 3:",
        indent + cluster['example_one'],
        indent + cluster['example_two'],
        indent + cluster['example_three'],
    ]
    row = "".join(line + "\n" for line in lines)
    for child in cluster['children']:
        row += recursive_addition(depth + 1, child)
    return row

In [None]:
level = 0
level_depths = {}
# Render every cluster starting at depth 1 and join the pieces in one go
csv = "".join(recursive_addition(1, cluster) for cluster in clusters)

# The context manager closes the file even if the write fails
with open("Output.csv", "w") as text_file:
    text_file.write(csv)