In [1]:
# !pip3 install wordcloud
# !pip3 install polyglot
# !pip3 install pyicu
# !pip3 install pycld2
# !pip3 install morfessor
# !pip3 install polyglot
# !pip3 install fuzzywuzzy
# !pip3 install seaborn

In [1]:
import operator
import os
import re
import string
import sys
from ast import literal_eval
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from fuzzywuzzy import fuzz, process
from nltk import word_tokenize, sent_tokenize, RegexpParser, tree
from nltk.corpus import stopwords
## https://markhneedham.com/blog/2017/11/28/python-polyglot-modulenotfounderror-no-module-named-icu/
from polyglot.detect import Detector
from tqdm import tqdm_notebook, tqdm
from tqdm.notebook import tqdm as notebook_tqdm
from wordcloud import WordCloud

tqdm.pandas()

  from pandas import Panel


## Load in the survey data

In [2]:
# Folder holding the survey exports, relative to the notebook's working directory.
DATA_DIR = "../../data"

# Read the survey CSV into the working frame used by the rest of the notebook.
survey_filename = os.path.join(DATA_DIR, "uis_20200401_20200409.csv")
df = pd.read_csv(survey_filename)

### Some row duplication present

In [3]:
# Compare the raw row count with the number of distinct identifiers to gauge duplication.
print(f"rows: {df.shape[0]}\nunique clientIds: {df.intents_clientID.nunique()}")
print(f"unique primary key: {df.primary_key.nunique()}\nunique session_ids: {df.session_id.nunique()}\n")
# print(df.columns)
# Shape of the subset of rows that have no session_id.
print(df[df.session_id.isna()].shape)

rows: 61064
unique clientIds: 8030
unique primary key: 10613
unique session_ids: 14062

(2970, 78)


In [4]:
## Number of distinct values in each of the two Q3 columns;
## the closer these numbers are to # unique primary_key, the better
df.Q3_y.nunique(), df.Q3_x.nunique()

(9601, 9601)

In [5]:
# Keep the first row seen for each primary_key, then renumber the index from zero.
df = df.drop_duplicates("primary_key").reset_index(drop=True)

## Functions for sentence tokenization, part of speech tagging, PII placeholder stripping, ngram computation

In [6]:
# spaCy pipeline used by part_of_speech_tag for tokens, tags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Placeholder labels; in the text they appear wrapped in square brackets.
pii_filtered = [
    "DATE_OF_BIRTH",
    "EMAIL_ADDRESS",
    "PASSPORT",
    "PERSON_NAME",
    "PHONE_NUMBER",
    "STREET_ADDRESS",
    "UK_NATIONAL_INSURANCE_NUMBER",
    "UK_PASSPORT",
]
# One alternation matching any "[LABEL]" placeholder (brackets escaped).
pii_regex = "|".join(f"\\[{label}\\]" for label in pii_filtered)
pii_regex

'\\[DATE_OF_BIRTH\\]|\\[EMAIL_ADDRESS\\]|\\[PASSPORT\\]|\\[PERSON_NAME\\]|\\[PHONE_NUMBER\\]|\\[STREET_ADDRESS\\]|\\[UK_NATIONAL_INSURANCE_NUMBER\\]|\\[UK_PASSPORT\\]'

In [8]:
# English stop words from the NLTK stopwords corpus.
stop_words = list(stopwords.words('english'))
# ASCII punctuation plus the typographic apostrophe.
punctuation = list(string.punctuation) + ['’']
# Everything that token filtering should drop. NOTE: the PII labels in
# pii_filtered are upper-case, unlike the stop words.
token_blacklist = stop_words + punctuation + pii_filtered

def split_sentences(comment):
    """Split a free-text comment into a list of sentences with NLTK's sentence tokenizer."""
    sentences = nltk.sent_tokenize(comment)
    return sentences

def remove_stopwords_punctation(sentences, blacklist=None):
    """Drop blacklisted tokens (stop words, punctuation, PII labels) from tagged sentences.

    Parameters
    ----------
    sentences : iterable
        Sentences, each an iterable of token tuples whose first three items
        are (text, tag, lemma), as produced by ``part_of_speech_tag``.
    blacklist : iterable of str, optional
        Token texts to remove. Defaults to the module-level ``token_blacklist``.

    Returns
    -------
    list of list of tuple
        One list per input sentence containing the surviving
        ``(text, tag, lemma)`` triples, in their original order.
    """
    if blacklist is None:
        blacklist = token_blacklist
    # Tokens are compared lower-cased, so the blacklist must be lower-cased as
    # well: otherwise the upper-case PII labels can never match. A set also
    # makes each membership test O(1) instead of a scan over the whole list.
    lowered_blacklist = {entry.lower() for entry in blacklist}
    return [
        [(t[0], t[1], t[2]) for t in sent if t[0].lower() not in lowered_blacklist]
        for sent in sentences
    ]

def replace_pii_regex(text):
    """Return ``text`` with every match of ``pii_regex`` (bracketed PII placeholders) removed."""
    placeholder_pattern = re.compile(pii_regex)
    return placeholder_pattern.sub("", text)

def part_of_speech_tag(comment):
    """Tag a comment sentence by sentence.

    The comment is split with ``split_sentences`` and each sentence is run
    through the spaCy pipeline ``nlp``.

    Returns a list with one entry per sentence; each entry is a list of
    ``(token text, fine-grained tag, lemma)`` triples.
    """
    tagged_sentences = []
    for sentence in split_sentences(comment):
        doc = nlp(sentence)
        tagged_sentences.append([(tok.text, tok.tag_, tok.lemma_) for tok in doc])
    return tagged_sentences

In [10]:
# Smoke test: two sentences, the first containing a typographic apostrophe.
t = "This is a test with punctuation’. this is another sentence."
processed_t = part_of_speech_tag(t)
processed_t

[[('This', 'DT', 'this'),
  ('is', 'VBZ', 'be'),
  ('a', 'DT', 'a'),
  ('test', 'NN', 'test'),
  ('with', 'IN', 'with'),
  ('punctuation', 'NN', 'punctuation'),
  ('’', "''", "'"),
  ('.', '.', '.')],
 [('this', 'DT', 'this'),
  ('is', 'VBZ', 'be'),
  ('another', 'DT', 'another'),
  ('sentence', 'NN', 'sentence'),
  ('.', '.', '.')]]

## Detect feedback language
There is a bit of foreign-language spam in some responses; detect comments that are not (primarily) English and drop them.

In [11]:
def detect_language(text):
    """Return the code of the language the polyglot ``Detector`` is most confident about.

    Parameters
    ----------
    text : str
        Comment text. The literal ``"-"`` is passed through unchanged.

    Returns
    -------
    str
        ``"-"`` for the ``"-"`` input, otherwise the detected language code,
        or ``"[ERROR] <text>"`` when detection raises for any reason.
    """
    if text == "-":
        return "-"
    try:
        candidates = list(Detector(text, quiet=True).languages)
        # Iterate in reverse so that, on a confidence tie, the last candidate
        # wins, which matches the earlier confidence-keyed dict selection.
        best = max(reversed(candidates), key=lambda language: language.confidence)
        return best.code
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except``: that would
        # also swallow KeyboardInterrupt/SystemExit and make a long
        # progress_map run impossible to interrupt.
        return f"[ERROR] {text}"

In [12]:
# Strip the bracketed PII placeholders from the free-text answer.
df['Q3_pii_removed'] = df['Q3_x'].progress_map(replace_pii_regex)
# Keep only comments shorter than 4000 characters. The explicit .copy() makes
# the filtered frame independent, so the column assignment below does not
# write into a slice of the original (SettingWithCopyWarning).
df = df[(df.Q3_pii_removed.str.len()<4000)].copy()
df['language'] = df['Q3_pii_removed'].progress_map(detect_language)

100%|██████████| 10613/10613 [00:00<00:00, 293842.16it/s]
  0%|          | 0/10600 [00:00<?, ?it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the la

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

 71%|███████   | 7484/10600 [00:00<00:00, 10211.71it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to det

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
100%|██████████| 10600/10600 [00:00<00:00, 10620.06it/s]


In [13]:
# Language code -> number of comments, most frequent first.
lang_dist = df['language'].value_counts().to_dict()
n_comments = sum(lang_dist.values())
print(f"Number of unique languages: {len(lang_dist)}")
# Share of comments detected as English, and share that are the "-" placeholder.
for shown_label, lang_code in (("English", "en"), ("-", "-")):
    print(f"{shown_label}: {round((lang_dist[lang_code]*100)/n_comments, 2)}%")
list(lang_dist.items())[0:10]

Number of unique languages: 44
English: 90.58%
-: 8.3%


[('en', 9601),
 ('-', 880),
 ('un', 23),
 ('xh', 12),
 ('da', 10),
 ('gv', 9),
 ('gd', 7),
 ('it', 5),
 ('pl', 4),
 ('mg', 4)]

In [14]:
# Flag the rows to keep: detected English, the "-" placeholder, plus the codes
# "un" and "sco" (presumably "unknown" and Scots; TODO confirm).
df['is_en'] = df['language'].isin(["en", "un", "-", "sco"])

In [15]:
# Keep only the rows flagged as English. .copy() yields an independent frame,
# so columns assigned to it afterwards do not trigger SettingWithCopyWarning.
df = df[df['is_en']].copy()
df.shape

(10506, 81)

### Part of speech tag
Run this the first time and save the result; on later runs, just load the saved df.

In [16]:
# Tag every English comment; rows not flagged as English get an empty list.
# The row is addressed by column label: integer keys on a label-indexed Series
# (x[0], x[1]) rely on the positional fallback that pandas has deprecated.
df['pos_tag'] = df[['Q3_pii_removed', 'is_en']].progress_apply(
    lambda row: part_of_speech_tag(row['Q3_pii_removed']) if row['is_en'] else [],
    axis=1,
)
# Flatten the per-sentence (text, tag, lemma) triples into one list per comment.
df['lemmas'] = df['pos_tag'].progress_map(lambda x: [token[2] for sent in x for token in sent])

df['words'] = df['pos_tag'].progress_map(lambda x: [token[0] for sent in x for token in sent])

# Persist, then reload from disk. The CSV round trip stores list columns as
# their string repr; only 'pos_tag' is parsed back into Python objects here
# ('lemmas' and 'words' stay strings after the reload).
df.to_csv(os.path.join(DATA_DIR, "uis_20200401_20200409_lang_pos.csv"), index=False)
df = pd.read_csv(os.path.join(DATA_DIR, "uis_20200401_20200409_lang_pos.csv"))
df['pos_tag'] = df['pos_tag'].map(literal_eval)

100%|██████████| 10506/10506 [02:26<00:00, 71.59it/s]
100%|██████████| 10506/10506 [00:00<00:00, 66894.11it/s]
100%|██████████| 10506/10506 [00:00<00:00, 157048.16it/s]


## Extract noun and verb phrases

In [17]:
# First (text, tag, lemma) triple of the first sentence of the first comment.
part_of_speech_tag(df.Q3_pii_removed.iloc[0])[0][0]

('-', ':', '-')

In [18]:
grammar = r"""
    cc:
    {<CC>}
    pronoun:
    {<DT><IN><PRP>}
    {<IN>?<PRP>}
    noun_verb:
    {<IN>?<JJ.*>*<NN.*>+<HYPH>?<VBD|VBN|VBG><NN.*>*}
    verb:
    {<IN|TO>*<VB.*><IN|RP|TO>?<RB|WRB|TO>*<IN|TO>*}
    {<MD><VB.*><RB><IN>+}
    {<WRB|WP><TO>?<VB.*>+}
    {<TO><VB><IN>}
    {<TO>?<VB.*><IN|RP>?<WRB|RP|WP>?<IN|TO>?<VB.*>?}
    {<VB.*><TO><VB.*><RB>*<TO>?}
    {<IN><EX><VB.*>}
    {<RB><TO><VB.*>+}
    {<TO>?<VB.*><IN|WDT|WP|RP>}
    {<WP><VB.*>}
    {<VB.*><RB.*>+<VB.*>*<IN|TO>?}
    {<WDT>?<TO>?<MD|VB.*>?<RB>?<TO|IN>?<V.*>+<CC>?<V.*>*<IN|RP>?<IN>*}
    {<MD><RB>*<VB.*>*}
    {<VB.*><IN|TO><IN>*}
    {<TO><VB.*><IN>*}
    {<VB.*>}
    prep_noun:
    {(<CD><IN><DT>)?<IN><JJ.*>*<NN.*>}
    {<IN><NN.*><JJ.*>?<NN.*>+}
    {<IN><NN.*><HYPH>?<NN.*>*}
    {<IN>+<PRP\$>?<NN><CD>?}
    {<IN><CD><.*>}
    {<RB><RBS>?<CD|JJ.*>?<NN.*>+}
    {<RP>?<IN>+<JJ.*>*<NN.*>+}
    {<IN><DT><NN.*><JJ.*>*<NN><HYPH>?<NN>}
    {<IN><NN.*>(<HYPH>?<NN.*>)?}
    {<JJ.*>*<IN><DT>?<NN.*>+<CD>?<NN.*>?}
    {<IN>+<DT>*<JJ>?<CD>?<NN.*>+<CD>?<NN.*>?}
    noun:
    {<CD><NN.*>}
    {<PDT><PRP\$><NN.*>+}
    {<RB><DT>?<JJ.*>*<NN.*>}
    {<DT><HYPH>?<NN.*>}
    {<JJ.*><NN.*>*<CD>}
    {<NN.*><CD><JJ.*>?}
    {<JJ.*|NN.*><IN|TO><PRP>}
    {<CD><NN.*><JJ.*>}
    {<WRB><RB><JJ.*>*<NN.*>*}
    {<DT><JJ.*>*<NN.*>+}
    {<NN.*><CD>?<JJ.*>*<NN.*>*}
    {<IN>+<CD>*<POS>*<IN>*<NN.*>}
    {<IN><PRP\$>?<JJ.*>*<NN.*>}
    {<NN.*><HYPH><NN.*>}
    {<DT>?<CD>?<JJ.*>?<CC>?<JJ.*>?<NN.*>+}
    {<NN.*><HYPH>?<NN.*|JJ.*|VB.*>*}
    {(<NN|NNS>|<NNP|NNPS>)<NNP|NN|NNS|NNPS>+}
    {(<NN|NNS>+|<NNP|NNPS>+)<IN|CC>(<PRP\$|DT><NN|NNS>+|<NNP|NNPS>+)}
    {<JJ|RB|CD>*<NNP|NN|NNS|NNPS>+}
    {<NNP|NN|NNS|NNPS>+}
    {<CD><IN>?<NN.*>}
    adjective:
    {<RB>*<JJ.*><CD>?}
    rb:
    {<RB>+}
    punct:
    {<-RRB->|<-LRB->|<,>|<.>}
    """

class Chunk:
    """A labelled run of tagged tokens.

    ``tokens`` is a sequence of ``(word, pos_tag, lemma)`` triples and
    ``indices`` the token positions supplied by the caller.

    Note: ``__init__`` overwrites the ``text``, ``lemma``, ``important_lemma``
    and ``important_word`` methods on the instance with their string results,
    so on an instance these four names are plain string attributes.
    """

    def __init__(self, label, tokens, indices):
        self.label = label
        self.tokens = tokens
        self.indices = indices
        # Each call below replaces the bound method with the value it returns.
        self.text = self.text()
        self.lemma = self.lemma()
        self.important_lemma = self.important_lemma()
        self.important_word = self.important_word()

    def text(self):
        """Surface forms of all tokens joined by single spaces."""
        return " ".join(word for word, _tag, _lemma in self.tokens)

    def lemma(self):
        """Lemmas of all tokens joined by single spaces."""
        return " ".join(lemma for _word, _tag, lemma in self.tokens)

    def tagable_words(self):
        """``(word, tag)`` pairs for tokens whose tag contains ``NN`` or ``VB``."""
        pairs = []
        for word, tag, _lemma in self.tokens:
            if re.search(r"(NN)|(VB)", tag):
                pairs.append((word, tag))
        return pairs

    def important_word(self):
        """Words whose tag contains ``NN``, ``VB``, ``JJ`` or ``CD``, space-joined."""
        return " ".join(
            word for word, tag, _lemma in self.tokens
            if re.search(r"(NN)|(VB)|(JJ)|(CD)", tag)
        )

    def important_lemma(self):
        """Lemmas of tokens whose tag contains ``NN``, ``VB``, ``JJ`` or ``CD``, space-joined."""
        return " ".join(
            lemma for _word, tag, lemma in self.tokens
            if re.search(r"(NN)|(VB)|(JJ)|(CD)", tag)
        )
    
parser = RegexpParser(grammar)

def chunk_text(tagged):
    """Parse one tagged sentence with ``parser`` and return its chunks.

    ``tagged`` is a list of ``(text, tag, lemma)`` triples. Every subtree in
    the parse becomes a ``Chunk`` carrying the subtree label, its leaves and
    the positions those leaves occupy in the sentence. Elements that are not
    subtrees are skipped but still advance the position by one.
    """
    segments = []
    position = 0
    for node in parser.parse(tagged):
        if type(node) != tree.Tree:
            position += 1
            continue
        leaves = node.leaves()
        end = position + len(leaves)
        segments.append(Chunk(node.label(), leaves, list(range(position, end))))
        position = end
    return segments

def extract_phrase(sentences, merge_inplace=False):
    """Chunk every tagged sentence with ``chunk_text``.

    Returns one list of chunks per sentence. When ``merge_inplace`` is true,
    each sentence's chunks are additionally passed through
    ``merge_adjacent_chunks``.
    """
    chunked = [chunk_text(sentence) for sentence in sentences]
    if not merge_inplace:
        return chunked
    return [merge_adjacent_chunks(sentence_chunks) for sentence_chunks in chunked]

def merge_adjacent_chunks(chunks):
    """Fuse runs of neighbouring chunks that share a label into single chunks.

    Chunks labelled ``"prep_noun"`` are never fused. A fused chunk is a new
    ``Chunk`` whose tokens and indices are the concatenation of its parts.
    Returns a new list; the input list is not modified.
    """
    merged = []
    last_label = ""
    for current in chunks:
        mergeable = current.label == last_label and current.label != "prep_noun"
        if not mergeable:
            merged.append(current)
        else:
            previous = merged[-1]
            merged[-1] = Chunk(
                current.label,
                previous.tokens + current.tokens,
                previous.indices + current.indices,
            )
        last_label = current.label
    return merged

def compute_combinations(sentences, n):
    """Return every window of ``n`` consecutive items from each sentence.

    Windows never span two sentences; a sentence shorter than ``n``
    contributes nothing. The result is one flat list of slices.
    """
    windows = []
    for chunks in sentences:
        window_count = len(chunks) - (n - 1)
        for start in range(window_count):
            windows.append(chunks[start:start + n])
    return windows
  

## Compute linguistic pattern combinations

In [19]:
def compute_linguistic_patterns(df_series, n):
    """Count the lower-cased texts seen for every sequence of ``n`` adjacent chunk labels.

    Parameters
    ----------
    df_series : pandas.Series
        One entry per comment, each a list of tagged sentences as accepted
        by ``extract_phrase``.
    n : int
        Number of consecutive chunks per combination.

    Returns
    -------
    dict
        Maps a tuple of ``n`` chunk labels to a ``Counter`` keyed by the tuple
        of the corresponding lower-cased chunk texts.
    """
    pattern_dictionary = {}

    # notebook_tqdm is tqdm.notebook.tqdm: the same progress bar as the
    # deprecated tqdm.tqdm_notebook, without the deprecation warning.
    for vals in notebook_tqdm(df_series.values):
        sents = extract_phrase(vals, True)

        for combo in compute_combinations(sents, n):
            key = tuple(c.label for c in combo)
            counter_key = tuple(c.text.lower() for c in combo)
            pattern_dictionary.setdefault(key, Counter())[counter_key] += 1

    return pattern_dictionary

## Regular expression matches for themes of interest.
The focus is on tagging verbs and on tagging the second-argument component that accompanies each verb.

In [20]:
# Ordered (theme, compiled pattern) rules: the FIRST rule that matches wins, so
# the order below is significant. Every fragment is a raw string; without the
# ``r`` prefix, "\s" and "\w" are invalid string escapes.
_THEME_PATTERNS = tuple(
    (theme, re.compile(pattern))
    for theme, pattern in (
        ("self-employ", r"self\s?(-|\s)\s?employ"),
        ("delivery", r"(deliver(y|(ies)|(ed)))|(slot)|(online shopping)"),
        ("vulnerable", r"vulnerable"),
        ("disabled", r"disab((led)|(ility))"),
        ("no-symptoms", r"no symptom"),
        ("covid-mention", r"((corona)?(virus))|(covid)"),
        ("health-problem",
         r"((health)|(heart) (problem)|(issue)|(condition)|(attack)|(disease)|(failure))|( ms)|"
         r"(copd)|(asthma)|((type)\s?[12])|(diabet)|"
         r"(cancer)|(dementia)|(stroke)|(illness)|(a type$)|(cough)|(leukaemia)"),
        ("symptoms", r"symptom"),
        ("at-risk", r"((at)?(\s(very\s)?high)?\srisk)|(risk list)"),
        ("elderly",
         r"((((a|'|’)m( (in|at)( my)?)?)|aged) (over(-|\s))?"
         r"(([789][0-9]($|s|\s))|(old)|(elderly)))|((over(-|\s))?[789][0-9] y)"),
        ("carer", r"(carer)|(care home)"),
        ("key-worker", r"(key\s?(\s|-)?\s?worker)|(nurse($|\s))|(essential worker)"),
        ("cannot-work", r"can\s?(no|'|’)?t work"),
        ("no-income", r"no ((work)|(income)|(money)|(wage)|(salar))"),
        ("laid-off", r"(furlough)|(fired)|(80 %)"),
        ("cannot-get-food",
         r"(((can\s?(no|'|’)?t (get|buy|(shop for)))|"
         r"((do not)?ha(ve|d) )(no|any|(not enough))?) (food|groceries))"),
        ("cannot-get-med", r"can\s?(no|'|’)?t get ((med)|(prescription))"),
        ("get-med", r"(^med)|(prescription)"),
        ("travel", r"(travel(\s(advi[sc]e)|(status))?)|(flight)|(destination)"),
        ("no-information",
         r"(no\s)(\w*\s)?((info)|(clarification)|(advi[sc]e)|((contact )?((details)|(number)))|"
         r"(answer)|(update)|(clarity)|(guid(e|(ance)))|(list)|(definition)|"
         r"(address)|(link)|(form)|(contact)|(mention))"),
        ("information",
         r"(info)|(clarification)|(advi[sc]e)|((contact )?((details)|(number)))|"
         r"(answer)|(update)|(clarity)|(guid(e|(ance)))|(list)|(definition)|"
         r"(address)|(link)|(form)|(contact)"),
        ("no-correspondence",
         r"(no)\s((letter)|(t(e)?xt)|(message)|(e(\s|(\s?-\s?))?mail)|"
         r"(alert)|(notice)|(communication))"),
        ("correspondence", r"(letter)|(t(e)?xt)|(message)|(e(\s|(\s?-\s?))?mail)|(alert)|(notice)"),
        ("no-one", r"(no\s?((family)|(one)))|(nothing)|(nobody)"),
        ("no-support", r"no ((support)|(aid)|(help)|(assistance)|(access)|(priority))"),
        ("support", r"(support)|(aid)|(help)|(assistance)|(access)|(priority)"),
        ("child", r"(child)|((^|\s)son)|(daughter)"),
        ("family",
         r"(parent)|(husband)|(wife)|(partner)|"
         r"((mo|fa)ther)|(famil(y|(ies)))|(m[uo]m)|(dad)"),
        ("rules", r"(rule)|(restriction)|(measure)|(rights)"),
        ("uncertainty", r"((no)|(a(ny)?)) ((way)|(option)|(choice)|(means)|(idea))"),
        ("work", r"work ((for)|(in)|(at)|(on))"),
        ("self-isolation", r"((self\s|-)?isolat((ion)|(e)|(ing)))|(lock\s?(\s|-)?\s?down)"),
        ("license", r"(driv(ing|ers)\s)?licen[sc]e"),
        ("passport", r"passport"),
        ("pension", r"pension"),
        ("home-mention", r"(^|\s)h((ome)|(ouse))"),
        ("work-mention", r"(employ)|(work)|(job)|(business)|(company)"),
        ("benefit", r"(benefit)|(universal credit)|(eligible)|(esa)|(ssp)|(pip)|(allowance)"),
        ("school", r"(school)|(student)"),
        ("goods", r"(food)|(supplies)|(shopping)|(groceries)"),
        ("given-money", r"(money)|(grant)|(fund)|(relief)"),
        ("bills-to-pay", r"(bill)|(tax)|(mortgage)|(rent)|(loan)|(debt)|(fine)|(fee)|(insurance)"),
        ("scheme", r"scheme"),
        ("visa", r"(^|\s)visa($|\s)"),
        ("data", r"(data)|(cases)|(situation)|(stat(istic)?s?$)|(status)|(news)|(progress)"),
        ("death", r"dea((th)|d)"),
    )
)


def regex_for_theme(text):
    """Return the theme label of the first rule whose pattern occurs in ``text``.

    Parameters
    ----------
    text : str
        A phrase or whole comment; it is lower-cased once before matching.

    Returns
    -------
    str
        The theme of the first matching rule in ``_THEME_PATTERNS`` (patterns
        are searched anywhere in the text), or ``"unknown"`` if none matches.
    """
    lowered = text.lower()
    for theme, pattern in _THEME_PATTERNS:
        if pattern.search(lowered):
            return theme
    return "unknown"


In [21]:
# Spot check of the theme tagger on a single word.
regex_for_theme("stats")

'data'

In [22]:
# Ordered (group, compiled pattern) rules: the FIRST rule that matches wins.
# Every fragment is a raw string; without the ``r`` prefix, "\s" is an
# invalid string escape.
_VERB_GROUP_PATTERNS = tuple(
    (group, re.compile(pattern))
    for group, pattern in (
        ("find-smthg",
         r"(f(i|(ou))nd)|(look)|(search)|(clarify)|(ask)|(read)|([ei]nquire)|"
         r"(obtain)|(seek)|(know)|((^|\s)see($|\s))|(understand)"),
        ("access-smthg", r"(access)|(check)|(complete)|(cancel)|(book)|(confirm)"),
        ("acquire-smthg", r"(get)|(take)|(claim)|(receive)|(sent)|(collect)"),
        ("change-smthg", r"(renew)|(change)|(update)|(inform$)|(notify)"),
        ("apply-smthg", r"(appl(y|(ied)))|(register)|(qualify)|(sign)"),
        ("pay-smthg", r"pa(y|(id)|(yed))"),
        ("contact-smthg", r"(contact)|(report)"),
        ("work-smwhr", r"(work)|(employ)"),
        ("need-smthg", r"(need)|(want)|(require)|(request)|(would like)|(order)"),
        ("my-situation", r"(have)|((a|'|’|^)m($|\s))|(feel($|\s))"),
        ("others-situation", r"(has)|(((a|we)|'|’|^)re($|\s))"),
        ("unclear-situation", r"(had)|((i|'|’|^)s($|\s))|(was)"),
        ("travel", r"travel"),
        ("living", r"(liv(e|(ing)))|(stay)"),
        ("do-smthng", r"(do)|(make)"),
        ("go-smwhr", r"go($|\s)"),
        ("give-smthng", r"(give)|(provide)"),
        ("help", r"(help)|(protect)|(support)"),
    )
)


def regex_group_verbs(verb):
    """Return the verb-group label of the first rule whose pattern occurs in ``verb``.

    Parameters
    ----------
    verb : str
        Verb phrase text. It is matched exactly as given (no lower-casing
        happens here), so callers are expected to pass lower-case text.

    Returns
    -------
    str
        The group of the first matching rule in ``_VERB_GROUP_PATTERNS``, or
        ``"unknown"`` if none matches.
    """
    for group, pattern in _VERB_GROUP_PATTERNS:
        if pattern.search(verb):
            return group
    return "unknown"

## Test run code.

In [23]:
# Worked example: the first comment whose raw text contains "letter".
# (An earlier `example = df.iloc[7]` was overwritten immediately and has been dropped.)
example = df[df.Q3_x.str.contains("letter")].iloc[0]
print(f"Themetatic category for entire comment: {regex_for_theme(example.Q3_x)}")

print(example.Q3_x)
print()
print(example.pos_tag)
print()
for sent in extract_phrase(example.pos_tag, True):
    # One line per chunk: label, text, assigned theme (empty if untagged), token positions.
    for chunk in sent:
        theme = ""
        if chunk.label in ["verb"]:
            theme = regex_group_verbs(chunk.text.lower())
        if chunk.label in ["noun", "prep_noun", "noun_verb"]:
            theme = regex_for_theme(chunk.text.lower()) 
            
        print("{0:10} {1:35} {2:20} {3}".format(chunk.label.upper(), chunk.text, theme, chunk.indices))
    print()
    # Adjacent chunk pairs within this sentence.
    for combo in compute_combinations([sent], 2):
        print(f"{combo[0].text}, {combo[1].text}")
        
#     for combo in compute_combinations([sent], 3):
#         print(f"{combo[0].text}, {combo[1].text}, {combo[2].text}")
    print("=====")

Themetatic category for entire comment: vulnerable
I have heard about free food parcels for the extremely-vulnerable, I have received a letter to stay in but have no idea how to request a parcel for I believe toiletries etc

[[('I', 'PRP', '-PRON-'), ('have', 'VBP', 'have'), ('heard', 'VBN', 'hear'), ('about', 'IN', 'about'), ('free', 'JJ', 'free'), ('food', 'NN', 'food'), ('parcels', 'NNS', 'parcel'), ('for', 'IN', 'for'), ('the', 'DT', 'the'), ('extremely', 'RB', 'extremely'), ('-', 'HYPH', '-'), ('vulnerable', 'JJ', 'vulnerable'), (',', ',', ','), ('I', 'PRP', '-PRON-'), ('have', 'VBP', 'have'), ('received', 'VBN', 'receive'), ('a', 'DT', 'a'), ('letter', 'NN', 'letter'), ('to', 'TO', 'to'), ('stay', 'VB', 'stay'), ('in', 'RB', 'in'), ('but', 'CC', 'but'), ('have', 'VBP', 'have'), ('no', 'DT', 'no'), ('idea', 'NN', 'idea'), ('how', 'WRB', 'how'), ('to', 'TO', 'to'), ('request', 'VB', 'request'), ('a', 'DT', 'a'), ('parcel', 'NN', 'parcel'), ('for', 'IN', 'for'), ('I', 'PRP', '-PRON-

I have a mother in law who is 90yrs old, lives on her own, has diabetes and asthma, 
so there for self isolating. I am checking on her daily, taking food etc. 
I go there once a week to change her bedding, which she is safely in her conservatory. 
I work for a train company but not in a safety critical role, 
I only serve drinks/snacks, which has been suspended now. 
They want me to come in to clean inside trains ( where the public are) 
and the stations. I have said that I don’t want to come in as caring for my mother in law. They have told me 
that I will have to go off sick and get a sick note from GP? 
Then I will only get statutory sick pay! I don’t want to run the risk 
of infecting my at risk elderly parent. Where do I stand?

## Inspect arg1-arg2 grammatical patterns

In [24]:
# Count label pairs (n=2) over all tagged comments, then show the label pairs
# found and how many distinct pairs there are.
pattern_d = compute_linguistic_patterns(df.pos_tag, 2)
pattern_d.keys(), len(pattern_d)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=10506.0), HTML(value='')))




(dict_keys([('verb', 'noun'), ('noun', 'adjective'), ('adjective', 'prep_noun'), ('verb', 'adjective'), ('adjective', 'punct'), ('noun', 'prep_noun'), ('prep_noun', 'noun'), ('pronoun', 'verb'), ('prep_noun', 'verb'), ('punct', 'verb'), ('noun', 'cc'), ('cc', 'noun'), ('noun', 'punct'), ('punct', 'rb'), ('rb', 'prep_noun'), ('noun', 'verb'), ('punct', 'pronoun'), ('cc', 'rb'), ('prep_noun', 'adjective'), ('adjective', 'noun'), ('pronoun', 'rb'), ('rb', 'verb'), ('verb', 'punct'), ('verb', 'pronoun'), ('punct', 'noun'), ('punct', 'cc'), ('prep_noun', 'punct'), ('adjective', 'cc'), ('cc', 'verb'), ('rb', 'pronoun'), ('verb', 'prep_noun'), ('pronoun', 'cc'), ('prep_noun', 'prep_noun'), ('noun', 'pronoun'), ('pronoun', 'punct'), ('adjective', 'verb'), ('cc', 'pronoun'), ('adjective', 'pronoun'), ('verb', 'cc'), ('rb', 'adjective'), ('verb', 'noun_verb'), ('noun_verb', 'cc'), ('pronoun', 'noun'), ('noun', 'rb'), ('prep_noun', 'cc'), ('adjective', 'rb'), ('prep_noun', 'pronoun'), ('pronoun',

In [25]:
# Hand-picked (first label, second label) chunk pairs to focus on.
patterns_of_interest = [('verb', 'noun'),
('noun', 'prep_noun'),
('prep_noun', 'prep_noun'),
('verb', 'noun_verb'),
('verb', 'prep_noun'),
('noun', 'noun_verb'),
('noun_verb', 'prep_noun')
]

## Compute `arg1` - `arg2` co-occurrence db - couples

In [26]:
# (label1, label2) -> {arg1 text -> Counter of arg2 texts}, all texts lower-cased.
pattern_db = {}

# notebook_tqdm is tqdm.notebook.tqdm: the same bar as the deprecated
# tqdm.tqdm_notebook, without the deprecation warning.
for vals in notebook_tqdm(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        arg1 = combo[0].text.lower()
        arg2 = combo[1].text.lower()
#         arg2 = " ".join([w.lower() for w,_ in combo[1].tagable_words()])

        pattern_db.setdefault(key, {}).setdefault(arg1, Counter())[arg2] += 1

print(f"There are {len(pattern_db)} possible grammatical combos.")
# The 15 label pairs with the most distinct first arguments.
for k, v in sorted(pattern_db.items(),
                   key = lambda x: len(x[1]),
                   reverse= True)[0:15]:
    print(k, len(v))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=10506.0), HTML(value='')))


There are 73 possible grammatical combos.
('verb', 'noun') 9341
('noun', 'punct') 4251
('noun', 'prep_noun') 3513
('noun', 'verb') 3435
('verb', 'pronoun') 2649
('verb', 'adjective') 2499
('noun', 'cc') 2352
('prep_noun', 'punct') 2319
('verb', 'punct') 1780
('prep_noun', 'noun') 1594
('noun', 'pronoun') 1556
('prep_noun', 'verb') 1484
('prep_noun', 'prep_noun') 1403
('prep_noun', 'cc') 1064
('noun', 'rb') 1027


In [27]:
# Verb phrases of the ('verb', 'noun') pattern, ranked by how often they occur.
ranked_verbs = sorted(
    pattern_db[('verb', 'noun')].items(),
    key=lambda x: sum(x[1].values()),
    reverse=True,
)
top_100_verbs = [key.lower() for key, value in ranked_verbs[0:100]]

# List the frequent verbs that the verb grouping does not recognise.
counter = 0
for verb in top_100_verbs:
    if regex_group_verbs(verb) != "unknown":
        continue
    counter += 1
    print(counter, verb)
        

1 got
2 to tax
3 can
4 came to
5 having
6 driving
7 use
8 regarding
9 says
10 say
11 following
12 to send


In [28]:
def update_argument_theme_dictionary(dict_new, dict_old):
    """Add the counts in ``dict_new`` to ``dict_old`` in place.

    Both arguments map a theme to a mapping of value -> count. Themes missing
    from ``dict_old`` are created as empty ``Counter`` objects first. Nothing
    is returned.
    """
    for theme, value_counts in dict_new.items():
        target = dict_old.setdefault(theme, Counter())
        for val, count in value_counts.items():
            target[val] = target[val] + count

In [29]:
# verb group -> Counter of verb phrases
verb_themes = {}
# verb group -> {argument theme -> Counter of argument phrases}
verb_argument_themes = {}
# argument theme -> Counter of argument phrases
argument_themes = {}

for pattern in [('verb', 'noun'), ('verb', 'prep_noun')]:
    print(f"There are {len(pattern_db[pattern])} {pattern[0]}s, accompanied by {pattern[1]}s.")
    # Most frequent verb phrases first.
    ranked_verbs_for_pattern = sorted(
        pattern_db[pattern].items(),
        key=lambda x: sum(x[1].values()),
        reverse=True,
    )
    for arg1, arg2 in ranked_verbs_for_pattern:
        verb_theme = f"{regex_group_verbs(arg1)}".upper()

        verb_themes.setdefault(verb_theme, Counter())[arg1] += sum(arg2.values())
        verb_argument_themes.setdefault(verb_theme, {})

        # Themes of the arguments seen with this one verb phrase.
        local_themes = {}
        for arg2_val, arg2_counts in arg2.items():
            theme = f"{regex_for_theme(arg2_val)}".upper()
            local_themes.setdefault(theme, Counter())[arg2_val] += arg2_counts
            argument_themes.setdefault(theme, Counter())[arg2_val] += arg2_counts

        update_argument_theme_dictionary(local_themes, verb_argument_themes[verb_theme])
        

There are 9341 verbs, accompanied by nouns.
There are 358 verbs, accompanied by prep_nouns.


In [30]:
# Number of distinct argument themes and verb groups collected above.
len(argument_themes), len(verb_themes)

(44, 19)

In [31]:
# Argument themes ordered by total count, largest first, numbered from 0.
ranked_argument_themes = sorted(
    argument_themes.items(),
    key=lambda x: sum(x[1].values()),
    reverse=True,
)
for i, (argument_type, argument_values) in enumerate(ranked_argument_themes):
    print(i, argument_type)

0 UNKNOWN
1 INFORMATION
2 WORK-MENTION
3 CORRESPONDENCE
4 SUPPORT
5 DELIVERY
6 HEALTH-PROBLEM
7 BILLS-TO-PAY
8 GOODS
9 COVID-MENTION
10 FAMILY
11 HOME-MENTION
12 VULNERABLE
13 LICENSE
14 GIVEN-MONEY
15 CHILD
16 BENEFIT
17 NO-ONE
18 ELDERLY
19 SELF-ISOLATION
20 DATA
21 NO-INFORMATION
22 UNCERTAINTY
23 PENSION
24 AT-RISK
25 TRAVEL
26 LAID-OFF
27 RULES
28 SCHOOL
29 KEY-WORKER
30 NO-INCOME
31 CARER
32 PASSPORT
33 GET-MED
34 DEATH
35 NO-SUPPORT
36 SYMPTOMS
37 VISA
38 NO-CORRESPONDENCE
39 DISABLED
40 SCHEME
41 SELF-EMPLOY
42 NO-SYMPTOMS
43 WORK


In [32]:
# Verb groups ordered by total count, each followed by its ten most common phrases.
ranked_verb_themes = sorted(
    verb_themes.items(),
    key=lambda x: sum(x[1].values()),
    reverse=True,
)
for i, (verb_type, verb_values) in enumerate(ranked_verb_themes):
    print(i, verb_type)
    for verb_value, count in verb_values.most_common(10):
        print(verb_value, count)

0 UNKNOWN
got 78
to tax 55
can 48
came to 32
having 28
driving 24
use 24
say 22
regarding 21
says 20
1 MY-SITUATION
have 830
am 353
have had 90
'm 54
am in 53
am on 50
do not have 40
’m 33
do n’t have 27
do n't have 26
2 ACQUIRE-SMTHG
to get 335
get 218
received 149
can not get 101
can get 89
have received 65
have not received 65
ca n't get 42
take 40
to claim 39
3 FIND-SMTHG
to find 170
looking for 130
to find out 88
was looking for 57
can not find 54
to see 51
to find out about 46
to look for 43
find 40
to find out if 33
4 UNCLEAR-SITUATION
is 563
had 174
was 76
's 37
is in 37
is on 32
is not 27
was in 26
’s 21
was on 18
5 OTHERS-SITUATION
has 215
are 151
are in 59
has had 23
has been 11
are on 10
re 9
are not 9
were 9
care for 7
6 NEED-SMTHG
need 288
wanted 40
want 31
needs 26
needed 21
would like 17
to order 12
require 10
need to speak to 10
to request 8
7 APPLY-SMTHG
to register 55
to register as 39
to apply for 33
to register for 28
registered 17
apply for 16
register 16
to regis

In [33]:
# Report, for every verb theme except UNKNOWN (largest first), its argument
# themes except UNKNOWN (largest first) with their five most common phrases.
known_verb_themes = {theme: arguments
                     for theme, arguments in verb_argument_themes.items()
                     if theme != "UNKNOWN"}
# Totals include the UNKNOWN argument bucket, even though it is not listed below.
verb_theme_totals = {theme: sum(sum(counts.values()) for counts in arguments.values())
                     for theme, arguments in known_verb_themes.items()}

for rank, verb_theme in enumerate(sorted(known_verb_themes, key=verb_theme_totals.get, reverse=True), 1):
    arguments = known_verb_themes[verb_theme]
    print(f"{rank}. {verb_theme} {verb_theme_totals[verb_theme]} {len(arguments)} \n======")

    listed_arguments = [name for name in arguments if name != "UNKNOWN"]
    listed_arguments.sort(key=lambda name: sum(arguments[name].values()), reverse=True)
    for argument_theme in listed_arguments:
        print(f"{argument_theme}")
        for phrase, occurrences in arguments[argument_theme].most_common(5):
            print(f"{phrase}: {occurrences}")
        print("---")
    print()
            

1. MY-SITUATION 3208 43 
HEALTH-PROBLEM
copd: 42
asthma: 20
diabetes: 15
health problems: 11
cancer: 11
---
INFORMATION
the list: 13
list: 11
the form: 6
details: 4
a number: 3
---
WORK-MENTION
work: 42
a job: 6
an employee: 4
job: 4
2 jobs: 3
---
CORRESPONDENCE
a letter: 48
letter: 7
letters: 4
email: 2
letter.no: 1
---
ELDERLY
77 years: 8
73 years: 8
81 years: 5
80 years: 5
71 years: 5
---
VULNERABLE
a vulnerable person: 26
extremely vulnerable person: 5
the vulnerable list: 4
extremely vulnerable group: 4
the vulnerable category: 3
---
NO-ONE
nothing: 24
no one: 16
no family: 15
nobody shops: 1
nothing other: 1
---
HOME-MENTION
home: 27
housebound: 5
house: 5
the house: 3
home sats: 1
---
FAMILY
husband: 4
a single parent: 4
wife: 3
family: 2
mother: 2
---
NO-INCOME
no income: 21
no money: 14
no work: 5
no wages: 3
no money national insurance number: 1
---
BILLS-TO-PAY
a mortgage: 4
a tax rebate: 4
bills: 3
debt: 2
tax return: 2
---
COVID-MENTION
covid-19: 9
covid 19: 7
the virus: 6

of the symptoms: 1
---
DISABLED
a disabled person: 2
disabled child 24/7: 1
---
VISA
visa: 1
visa appeal application: 1
uk visa: 1
---
AT-RISK
the high risk definitions: 1
about the high risk group: 1
---
NO-SUPPORT
no help: 1
---
KEY-WORKER
a key worker son: 1
---
GET-MED
a repeat prescription: 1
---

4. UNCLEAR-SITUATION 1888 41 
INFORMATION
a link: 6
info: 5
information: 5
list: 5
guidance: 4
---
CORRESPONDENCE
a letter: 24
a message: 10
letter: 7
a text: 5
a text message: 2
---
NO-INFORMATION
no information: 21
no guidance: 6
no mention: 5
no advice: 4
no link: 3
---
HEALTH-PROBLEM
a stroke: 5
an issue: 3
type 1 diabetic: 2
breast cancer: 2
a heart attack: 2
---
WORK-MENTION
work: 17
employer: 4
a care worker: 3
a company: 2
business: 2
---
NO-ONE
nothing: 25
no one: 3
nobody: 2
nothing similar: 1
nothing online: 1
---
UNCERTAINTY
no way: 18
no option: 5
any way: 3
a way: 3
no means: 2
---
COVID-MENTION
the virus: 5
covid-19: 5
this virus: 2
covid 19: 2
covid 19 positive: 1
---
SUP

PENSION
pension credits: 1
pensioners free food box: 1
state pension: 1
---
CORRESPONDENCE
a vunerabje adult the 1on the letter: 1
the sms text support service: 1
---
PASSPORT
passport: 1
a new british passport: 1
---
KEY-WORKER
nurse: 2
---
NO-ONE
no one: 1
a complain nothing: 1
---
SELF-ISOLATION
self isolation: 1
---
DISABLED
an elderly disabled pensioner: 1
---
CARER
carers uk: 1
---
RULES
2 meter rule: 1
---
NO-SUPPORT
no help: 1
---

8. ACCESS-SMTHG 614 31 
INFORMATION
information: 6
details: 5
the form: 5
advice: 4
updates: 3
---
DELIVERY
a slot: 12
a delivery slot: 6
home delivery: 4
food deliveries: 3
delivery: 2
---
BILLS-TO-PAY
the tax: 4
tax code: 3
car insurance: 3
tax return: 3
car tax: 2
---
DATA
progress: 3
the situation: 2
the progress: 2
status: 2
cases: 2
---
COVID-MENTION
coronavirus: 3
the virus: 3
coronavirus statistics: 2
covid-19: 2
the covid: 1
---
GOODS
food: 5
food parcel: 3
any essential groceries: 1
the online food services: 1
food the stores: 1
---
WORK-ME

---
BILLS-TO-PAY
feet: 1
---
GIVEN-MONEY
no funds: 1
---
CORRESPONDENCE
letter: 1
---
SELF-ISOLATION
isolation: 1
---
SCHOOL
all students: 1
---

14. CONTACT-SMTHG 237 18 
WORK-MENTION
employer: 3
work: 2
a company: 1
business open: 1
the jobcentre: 1
---
DEATH
a death: 5
the death: 2
a death bereavement: 1
death: 1
---
CHILD
son: 2
child maintenance: 2
the child maintenance service: 1
child benefit department: 1
---
FAMILY
fathers deah: 1
mum death: 1
the other parent: 1
a parent: 1
a change mothers circumstances: 1
---
INFORMATION
the telephone number: 1
the contact centres: 1
a change details: 1
new e - mail address:-: 1
details: 1
---
BENEFIT
maternity allowance: 1
the esa team: 1
esa a few weeks: 1
universal credit: 1
the pip service: 1
---
DATA
mse news department: 1
the situation: 1
cases: 1
---
CORRESPONDENCE
hmrc email scam: 1
a tv scam email: 1
email: 1
---
COVID-MENTION
covid19 lockdown breach: 1
covid 19: 1
---
HEALTH-PROBLEM
a medical condition: 1
an issue: 1
---
BILLS-TO-

## Assign themes to actions and things people are talking about 
### Tag response comments (Q3) with appropriate themes

In [34]:
# `tqdm.tqdm_notebook` is deprecated (it emits a warning on every call);
# `tqdm.notebook.tqdm` is its supported replacement.
from tqdm.notebook import tqdm as notebook_tqdm

# Pairs of (first chunk label, second chunk label) that become a themed mention.
themed_label_pairs = {('verb', 'noun'), ('verb', 'prep_noun'),
                      ('verb', 'noun_verb'), ('noun', 'prep_noun'),
                      ('prep_noun', 'noun'), ('prep_noun', 'prep_noun')}
# Parentheses, square brackets and "+" are removed from the stored phrase text.
special_chars = re.compile(r"\(|\)|\[|\]|\+")

# One list of mentions per response; each mention is
# (label pair, phrase, "verb-theme - argument-theme", (first text, second text)).
phrase_mentions = []
for vals in notebook_tqdm(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    row_mentions = []
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        if key not in themed_label_pairs:
            continue

        arg1 = combo[0].text.lower()
        arg2 = combo[1].text.lower()
        # The theme is looked up on the text *before* special characters are stripped.
        mention_theme = f"{regex_group_verbs(arg1)} - {regex_for_theme(arg2)}"

        arg1 = special_chars.sub("", arg1)
        arg2 = special_chars.sub("", arg2)
        phrase = f"{arg1} {arg2}"
        row_mentions.append((key, phrase, mention_theme, (arg1, arg2)))
    phrase_mentions.append(row_mentions)

df['theme_mentions'] = phrase_mentions

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=10506.0), HTML(value='')))




In [35]:
# Show the mentions of the 101st response that has at least one themed mention.
has_mentions = df['theme_mentions'].str.len() > 0
df.loc[has_mentions, 'theme_mentions'].iloc[100]

[(('prep_noun', 'noun'),
  'for 2017 tax year',
  'unknown - unknown',
  ('for 2017 tax', 'year')),
 (('verb', 'noun'),
  'are taking the michael',
  'others-situation - unknown',
  ('are taking', 'the michael'))]

In [36]:
# Keep only the "verb-theme - argument-theme" label (third field) of every mention.
df['theme_mentions_list'] = df['theme_mentions'].map(
    lambda mentions: [theme_label for _, _, theme_label, _ in mentions]
)

In [37]:
def get_user_group(arg1, arg2):
    """Return the group a respondent assigns to themselves, or "" if none.

    ``arg1`` is treated as self-describing when it ends in "'m" / "’m",
    "have been" or "feel", or when it is exactly "am" or "m". In that case
    ``arg2`` is returned with a leading "the " or "a " removed; otherwise the
    empty string is returned.
    """
    self_describing = re.search(r"((('|’|^(a)?)m)|(have been)|(feel))$", arg1)
    if self_describing is None:
        return ""
    return re.sub(r"^((the)|a)\s", "", arg2)

def resolve_function(x):
    """Collect the non-empty user groups found in one response's mentions.

    ``x`` is an iterable of 4-item mentions ``(labels, _, _, (arg1, arg2))``.
    Only mentions whose first label contains "verb" are passed to
    ``get_user_group``; empty results are discarded.
    """
    groups = []
    for labels, _, _, arg_pair in x:
        if "verb" not in labels[0]:
            continue
        group = get_user_group(*arg_pair)
        if group != "":
            groups.append(group)
    return groups

In [38]:
# Sanity check: a single "feel the key-worker" mention should resolve to ['key-worker'].
sample_mention = (('verb', 'noun'), None, None, ('feel', 'the key-worker'))
args = [sample_mention]
resolve_function(args)

['key-worker']

In [39]:
# Per response: the user groups respondents assign to themselves.
df['theme_mentions_user'] = df['theme_mentions'].apply(resolve_function)

In [40]:
# Tally every user group over the responses that mention at least one.
responses_with_groups = df.loc[df['theme_mentions_user'].str.len() > 0, 'theme_mentions_user']
user_groups = Counter(
    group
    for groups in responses_with_groups.values
    for group in groups
)
user_groups.most_common(10), "housebound" in user_groups, len(user_groups)

([('self employed', 64),
  ('self isolating', 49),
  ('key worker', 23),
  ('vulnerable person', 18),
  ('carer', 10),
  ('77 years', 7),
  ('73 years', 7),
  ('director', 6),
  ('81 years', 5),
  ('self - employed', 5)],
 True,
 383)

In [41]:
import regex
from difflib import SequenceMatcher as SM
from nltk.util import ngrams
import codecs

# Example inputs for find_needle: the needle is a phrase that omits one word
# ("to") of the hay, so it does not occur verbatim in it.
needle = "told register as a vulnerable person"
hay    = "told to register as a vulnerable person for delivery service for on line shopping"

def find_needle(needle, hay):
    """Find the span of ``hay`` that corresponds to the phrase ``needle``.

    The hay is scanned with a sliding window of words; the window most similar
    to the needle (``difflib.SequenceMatcher`` ratio, first best window wins)
    is kept. If no window scores above 0 (including when the hay is shorter
    than the window), the whole hay is used instead. Inside that text the
    result is the span running from the needle's first token to its last token
    (or just the token itself for a one-token needle).

    Needle tokens are matched literally: they are escaped before being put in
    the pattern, so punctuation such as "?", "." or "*" in a phrase neither
    changes the match nor raises a regex error.

    Parameters
    ----------
    needle : str
        Phrase to look for.
    hay : str
        Text to search in.

    Returns
    -------
    dict
        ``{needle: matched_text}``, or ``{needle: None}`` when no span is found.
    """
    needle_length = len(needle.split())
    # The window is 65% longer than the needle (rounded down) to leave room
    # for words of the hay that the needle skips.
    window_size = needle_length + int(.65 * needle_length)
    hay_words = hay.split()

    max_sim_val = 0
    max_sim_string = u""
    # Consecutive runs of `window_size` words; yields nothing when the hay has
    # fewer words than the window.
    for window in zip(*(hay_words[offset:] for offset in range(window_size))):
        hay_ngram = u" ".join(window)
        similarity = SM(None, hay_ngram, needle).ratio()
        if similarity > max_sim_val:
            max_sim_val = similarity
            max_sim_string = hay_ngram

    if max_sim_string == "":
        max_sim_string = hay

    tokens = needle.split(" ")
    if len(tokens) == 1:
        expression = re.escape(tokens[0])
    else:
        expression = f"({re.escape(tokens[0])}).*({re.escape(tokens[-1])})"
    result = re.search(expression, max_sim_string)

    if result is not None:
        return {needle: result.group()}
    return {needle: None}

# Try find_needle on the multi-word example and on a single-word phrase.
for example_needle, example_hay in ((needle, hay), ("housebound", "i am housebound")):
    print(find_needle(example_needle, example_hay))

{'told register as a vulnerable person': 'told to register as a vulnerable person'}
{'housebound': 'housebound'}


In [42]:
# Compare the extracted mentions with the PII-stripped comment for one response.
df.iloc[142][['theme_mentions', "Q3_pii_removed"]].values

array([list([(('verb', 'noun'), 'trying to get an update', 'acquire-smthg - unknown', ('trying to get', 'an update')), (('verb', 'noun'), 'for driving lorry license', 'unknown - license', ('for driving', 'lorry license')), (('noun', 'prep_noun'), 'lorry license after illness', 'unknown - health-problem', ('lorry license', 'after illness')), (('prep_noun', 'prep_noun'), 'after illness at christmas', 'unknown - unknown', ('after illness', 'at christmas')), (('verb', 'noun'), 'needing to get back to work', 'acquire-smthg - work-mention', ('needing to get back to', 'work')), (('noun', 'prep_noun'), 'work as hgv', 'work-smwhr - unknown', ('work', 'as hgv')), (('prep_noun', 'noun'), 'as hgv key worker driver', 'unknown - key-worker', ('as hgv', 'key worker driver'))]),
       'Trying to get an up[date from medical/re application for driving/lorry license after illness at Christmas 2019.  Needing to get back to work as HGV key worker/driver.'],
      dtype=object)

In [43]:
## remove special characters
# For each (source column, target column) pair: blank out missing comments,
# drop parentheses, square brackets and "+", and collapse runs of whitespace.
for source_col, target_col in (("Q3_pii_removed", "Q3_pii_removed"), ("Q3_x", "Q3_x_edit")):
    df[target_col] = df[source_col].replace(np.nan, '', regex=True)
    df[target_col] = df[target_col].progress_map(
        lambda comment: ' '.join(re.sub(r"\(|\)|\[|\]|\+", "", comment).split())
    )

100%|██████████| 10506/10506 [00:00<00:00, 147796.93it/s]
100%|██████████| 10506/10506 [00:00<00:00, 137349.71it/s]


## Create columns for `phrases` and `user_phrases`

In [44]:
# Compare the extracted mentions with the cleaned comment for one response.
df.iloc[6][['theme_mentions', "Q3_x_edit"]].values

array([list([(('verb', 'noun'), 'told to register as a vulnerable person', 'apply-smthg - vulnerable', ('told to register as', 'a vulnerable person')), (('noun', 'prep_noun'), 'a vulnerable person for delivery', 'unknown - delivery', ('a vulnerable person', 'for delivery')), (('prep_noun', 'noun'), 'for delivery service', 'living - unknown', ('for delivery', 'service')), (('noun', 'prep_noun'), 'service on line', 'unknown - unknown', ('service', 'on line')), (('prep_noun', 'noun'), 'on line shopping', 'unknown - goods', ('on line', 'shopping'))]),
       'told to register as a vulnerable person for delivery service for on line shopping'],
      dtype=object)

In [None]:
# For every mention, look up the span of the lower-cased comment that matches its phrase.
# The row is addressed by column label: positional integer lookups (x[0], x[1]) on a
# label-indexed Series are deprecated in recent pandas.
df['phrases_dict'] = df[['theme_mentions', "Q3_x_edit"]].progress_apply(
    lambda row: [find_needle(phrase, row["Q3_x_edit"].lower())
                 for _, phrase, _, _ in row["theme_mentions"]],
    axis=1)

# Flatten each row's {phrase: match} dicts into the list of matches that were found;
# a float entry (NaN) is treated as "no phrases".
df['phrases_list'] = df['phrases_dict'].progress_map(
    lambda phrase_dicts: [match
                          for phrase_dict in phrase_dicts
                          for match in phrase_dict.values()
                          if match is not None]
    if not isinstance(phrase_dicts, float) else [])

# Comma-separated text version of the matches.
df['phrases'] = df['phrases_list'].progress_map(lambda matches: ", ".join(matches))

  8%|▊         | 790/10506 [00:17<06:02, 26.77it/s] 

In [None]:
# Frequency of every matched phrase across all responses; show the top 50.
phrase_counts = Counter(
    phrase
    for phrase_list in df.phrases_list.values
    for phrase in phrase_list
)
phrase_counts.most_common(50)

In [None]:
# First ten responses that have at least one matched phrase, next to their comment.
df.loc[df['phrases'] != '', ['phrases', 'Q3_x_edit']].head(10)

In [None]:
# For every user group, look up the span of the lower-cased comment that matches it.
# The row is addressed by column label: positional integer lookups (x[0], x[1]) on a
# label-indexed Series are deprecated in recent pandas.
df['user_phrases_dict'] = df[['theme_mentions_user', "Q3_x_edit"]].progress_apply(
    lambda row: [find_needle(phrase, row["Q3_x_edit"].lower())
                 for phrase in row["theme_mentions_user"]],
    axis=1)

# Flatten each row's {phrase: match} dicts into the list of matches that were found;
# a float entry (NaN) is treated as "no phrases".
df['user_phrases_list'] = df['user_phrases_dict'].progress_map(
    lambda phrase_dicts: [match
                          for phrase_dict in phrase_dicts
                          for match in phrase_dict.values()
                          if match is not None]
    if not isinstance(phrase_dicts, float) else [])

# Comma-separated text version of the matches.
df['user_phrases'] = df['user_phrases_list'].progress_map(lambda matches: ", ".join(matches))

In [None]:
# Tally every matched user-group phrase over the responses that have at least one.
responses_with_user_phrases = df.loc[df['user_phrases_list'].str.len() > 0, 'user_phrases_list']
user_groups = Counter(
    group
    for groups in responses_with_user_phrases.values
    for group in groups
)
user_groups.most_common(10), "housebound" in user_groups, len(user_groups)

### Inspect missing phrases (matches not found verbatim in the comment)

In [None]:
# Count the phrase matches whose text (as a string) does not occur in the
# lower-cased comment of the same response.
rows_with_phrases = df.loc[~df['phrases_dict'].isna(), ['phrases_dict', 'Q3_x_edit']]
missing = sum(
    str(match) not in comment.lower()
    for phrase_dicts, comment in rows_with_phrases.values
    for phrase_dict in phrase_dicts
    for match in phrase_dict.values()
)
missing

## Save results for tool

In [2190]:
# Columns written to the export, in output order. The cleaned comment
# ('Q3_x_edit') is exported under the original name 'Q3_x'.
# NOTE(review): the list includes 'Name', 'Email' and 'IP Address' -- confirm
# that these are meant to be part of the exported file.
export_columns = [
    'primary_key', 'intents_clientID', 'visitId', 'fullVisitorId',
    'hits_pagePath', 'Started', 'Ended',
    'Q1_x', 'Q2_x', 'Q3_x_edit', 'Q4_x', 'Q5_x', 'Q6_x', 'Q7_x', 'Q8_x',
    'session_id', 'dayofweek', 'isWeekend', 'hour',
    'country', 'country_grouping', 'UK_region', 'UK_metro_area',
    'channelGrouping', 'deviceCategory',
    'total_seconds_in_session_across_days',
    'total_pageviews_in_session_across_days',
    'finding_count', 'updates_and_alerts_count', 'news_count',
    'decisions_count', 'speeches_and_statements_count',
    'transactions_count', 'regulation_count', 'guidance_count',
    'business_support_count', 'policy_count', 'consultations_count',
    'research_count', 'statistics_count', 'transparency_data_count',
    'freedom_of_information_releases_count', 'incidents_count',
    'done_page_flag', 'count_client_error', 'count_server_error',
    'ga_visit_start_timestamp', 'ga_visit_end_timestamp',
    'intents_started_date', 'events_sequence', 'search_terms_sequence',
    'cleaned_search_terms_sequence', 'top_level_taxons_sequence',
    'page_format_sequence', 'Sequence', 'PageSequence',
    'flag_for_criteria', 'full_url_in_session_flag',
    'UserID', 'UserNo', 'Name', 'Email', 'IP Address', 'Unique ID',
    'Tracking Link', 'clientID', 'Page Path',
    'Q1_y', 'Q2_y', 'Q3_y', 'Q4_y', 'Q5_y', 'Q6_y', 'Q7_y', 'Q8_y',
    'Started_Date', 'Ended_Date', 'Started_Date_sub_12h',
    'phrases', 'user_phrases',
]

df_sub = df[export_columns].rename(columns={'Q3_x_edit': 'Q3_x'})

output_path = os.path.join(DATA_DIR, 'uis_20200401_20200409_phrases_user_groups.csv')
df_sub.to_csv(output_path, index=False)