In [1]:
# !pip3 install wordcloud
# !pip3 install polyglot
# !pip3 install pyicu
# !pip3 install pycld2
# !pip3 install morfessor
# !pip3 install polyglot
# !pip3 install fuzzywuzzy
# !pip3 install seaborn

In [2]:
import os
import pandas as pd
import sys
import numpy as np 
import spacy
import nltk
from nltk import word_tokenize, sent_tokenize, RegexpParser, tree
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process

from tqdm import tqdm_notebook, tqdm
from collections import Counter
import re
import operator
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from ast import literal_eval

import string 

## https://markhneedham.com/blog/2017/11/28/python-polyglot-modulenotfounderror-no-module-named-icu/
from polyglot.detect import Detector

tqdm.pandas()

  from pandas import Panel


## Load in the survey data

In [3]:
# Folder holding the survey exports, given relative to the notebook's working directory.
DATA_DIR = "../../data"

# Read the survey export into `df`, the frame the cells below work on.
survey_filename = os.path.join(DATA_DIR, "uis_20200401_20200409.csv")
df = pd.read_csv(survey_filename)

### Some row duplication present

In [4]:
# Compare the raw row count with the number of distinct ids to gauge how much duplication there is.
print(f"rows: {df.shape[0]}\nunique clientIds: {df.intents_clientID.nunique()}")
print(f"unique primary key: {df.primary_key.nunique()}\nunique session_ids: {df.session_id.nunique()}\n")
# print(df.columns)
# Shape of the subset of rows that have no session_id at all.
print(df[df.session_id.isna()].shape)

rows: 61064
unique clientIds: 8030
unique primary key: 10613
unique session_ids: 14062

(2970, 78)


In [5]:
## the closer these numbers are to # unique primary_key, the better
df.Q3_y.nunique(), df.Q3_x.nunique()

(9601, 9601)

In [6]:
# Keep the first row seen for each primary_key, then renumber the index from 0.
df = df.drop_duplicates(subset="primary_key").reset_index(drop=True)

## Functions for sentence tokenization, part of speech tagging, PII placeholder stripping, ngram computation

In [7]:
# spaCy pipeline used by part_of_speech_tag() for tokens, tags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Placeholder names that appear in the comments wrapped in square brackets
# (presumably written by an upstream PII scrubber - confirm).
pii_filtered = [
    "DATE_OF_BIRTH",
    "EMAIL_ADDRESS",
    "PASSPORT",
    "PERSON_NAME",
    "PHONE_NUMBER",
    "STREET_ADDRESS",
    "UK_NATIONAL_INSURANCE_NUMBER",
    "UK_PASSPORT",
]
# One alternation of all placeholders, with the literal brackets escaped.
pii_regex = "|".join(map(r"\[{}\]".format, pii_filtered))
pii_regex

'\\[DATE_OF_BIRTH\\]|\\[EMAIL_ADDRESS\\]|\\[PASSPORT\\]|\\[PERSON_NAME\\]|\\[PHONE_NUMBER\\]|\\[STREET_ADDRESS\\]|\\[UK_NATIONAL_INSURANCE_NUMBER\\]|\\[UK_PASSPORT\\]'

In [8]:
# Tokens that remove_stopwords_punctation() filters out: NLTK's English stop
# words, ASCII punctuation plus the typographic apostrophe, and the PII names.
stop_words = [*stopwords.words('english')]
punctuation = [*string.punctuation, '’']
token_blacklist = [*stop_words, *punctuation, *pii_filtered]

def split_sentences(comment):
    """Split a free-text comment into a list of sentence strings with NLTK."""
    sentences = sent_tokenize(comment)
    return sentences

def remove_stopwords_punctation(sentences, blacklist=None):
    """Drop blacklisted tokens (stop words, punctuation, PII names) from tagged sentences.

    Args:
        sentences: list of sentences, each a list of (text, tag, lemma) tuples
            in the shape returned by ``part_of_speech_tag``.
        blacklist: optional iterable of token strings to remove. When omitted,
            the module-level ``token_blacklist`` is used.

    Returns:
        A new list with one list per input sentence, holding the
        (text, tag, lemma) triples whose text is not blacklisted.
    """
    if blacklist is None:
        blacklist = token_blacklist
    # Lower-case both sides of the comparison. Previously only the token text
    # was lower-cased, so upper-case blacklist entries (the PII placeholder
    # names) could never match. A set also makes each lookup O(1).
    blocked = {entry.lower() for entry in blacklist}
    return [
        [(tok[0], tok[1], tok[2]) for tok in sent if tok[0].lower() not in blocked]
        for sent in sentences
    ]

def replace_pii_regex(text):
    """Return ``text`` with every match of the module-level ``pii_regex`` deleted."""
    scrubbed = re.compile(pii_regex).sub("", text)
    return scrubbed

def part_of_speech_tag(comment):
    """Tag every sentence of ``comment`` with spaCy.

    Returns a list with one entry per sentence (as split by
    ``split_sentences``); each entry is a list of (text, tag, lemma) tuples
    taken from the spaCy tokens of that sentence.
    """
    tagged_sentences = []
    for sentence in split_sentences(comment):
        doc = nlp(sentence)
        tagged_sentences.append([(tok.text, tok.tag_, tok.lemma_) for tok in doc])
    return tagged_sentences

In [9]:
t = "This is a test with punctuation’. this is another sentence."
processed_t = part_of_speech_tag(t)
processed_t

[[('This', 'DT', 'this'),
  ('is', 'VBZ', 'be'),
  ('a', 'DT', 'a'),
  ('test', 'NN', 'test'),
  ('with', 'IN', 'with'),
  ('punctuation', 'NN', 'punctuation'),
  ('’', "''", "'"),
  ('.', '.', '.')],
 [('this', 'DT', 'this'),
  ('is', 'VBZ', 'be'),
  ('another', 'DT', 'another'),
  ('sentence', 'NN', 'sentence'),
  ('.', '.', '.')]]

## Detect feedback language
Some responses contain foreign-language spam. Detect comments that are not (primarily) English and drop them.

In [10]:
def detect_language(text):
    """Return the code of the language polyglot reports with the highest confidence.

    Args:
        text: the comment to classify. The literal "-" is passed through
            unchanged without running the detector.

    Returns:
        The language code with the highest confidence, "-" for a "-" input,
        or "[ERROR] <text>" when the detector raises an ordinary exception.
    """
    if text == "-":
        return "-"
    try:
        # Keyed by confidence so max() picks the best guess; on equal
        # confidence the later language overwrites the earlier one.
        langs = {language.confidence: language.code for language in Detector(text, quiet=True).languages}
        return langs[max(langs.keys())]
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit and made a long-running map impossible to interrupt.
        return f"[ERROR] {text}"

In [11]:
# Longest comment (in characters, after PII stripping) that is kept for analysis.
MAX_COMMENT_LENGTH = 4000

df['Q3_pii_removed'] = df['Q3_x'].progress_map(replace_pii_regex)
# .copy() makes the filtered frame independent of the original, so adding the
# 'language' column below does not raise pandas' SettingWithCopyWarning.
df = df[(df.Q3_pii_removed.str.len() < MAX_COMMENT_LENGTH)].copy()
df['language'] = df['Q3_pii_removed'].progress_map(detect_language)

100%|██████████| 10613/10613 [00:00<00:00, 288121.19it/s]
  0%|          | 0/10600 [00:00<?, ?it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the la

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 43%|████▎     | 4600/10600 [00:00<00:00, 8087.70it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 73%|███████▎  | 7690/10600 [00:00<00:00, 9328.83it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
100%|██████████| 10600/10600 [00:01<00:00, 9929.10it/s]


In [12]:
# Number of comments per detected language code.
lang_dist = df['language'].value_counts().to_dict()
print(f"Number of unique languages: {len(lang_dist)}")
# Share of all comments taken by the two codes of interest.
total_comments = sum(lang_dist.values())
for code, label in (("en", "English"), ("-", "-")):
    print(f"{label}: {round((lang_dist[code]*100)/total_comments, 2)}%")
list(lang_dist.items())[0:10]

Number of unique languages: 44
English: 90.58%
-: 8.3%


[('en', 9601),
 ('-', 880),
 ('un', 23),
 ('xh', 12),
 ('da', 10),
 ('gv', 9),
 ('gd', 7),
 ('it', 5),
 ('mg', 4),
 ('pl', 4)]

In [13]:
# Flag comments to keep as English. Besides "en" this accepts "-" (the value
# detect_language passes through untouched), plus "un" and "sco"
# (presumably the detector's "unknown" code and Scots - TODO confirm).
df['is_en'] = df['language'].isin(["en", "un", "-", "sco"])

In [14]:
# Keep only the rows flagged as English. .copy() detaches the result from the
# unfiltered frame so columns can be added to it later without pandas'
# SettingWithCopyWarning.
df = df[df['is_en']].copy()
df.shape

(10506, 81)

### Part of speech tag
Run this the first time and save, then just load df

In [15]:
# Tag every English comment. Row values are read by column label: integer
# keys such as row[0] on a label-indexed Series are positional fallbacks that
# recent pandas versions deprecate.
df['pos_tag'] = df[['Q3_pii_removed', 'is_en']].progress_apply(
    lambda row: part_of_speech_tag(row['Q3_pii_removed']) if row['is_en'] else [],
    axis=1,
)
# Flatten the per-sentence (text, tag, lemma) triples into flat token lists.
df['lemmas'] = df['pos_tag'].progress_map(lambda x: [token[2] for sent in x for token in sent])

df['words'] = df['pos_tag'].progress_map(lambda x: [token[0] for sent in x for token in sent])

# Save the tagged frame, then reload it so later cells work from the cached file.
pos_tagged_path = os.path.join(DATA_DIR, "uis_20200401_20200409_lang_pos.csv")
df.to_csv(pos_tagged_path, index=False)
df = pd.read_csv(pos_tagged_path)
# CSV stores the nested lists as text; parse 'pos_tag' back into Python objects.
# NOTE(review): 'lemmas' and 'words' are not parsed back here, so after the
# reload they hold the string form of their lists - confirm that is intended.
df['pos_tag'] = df['pos_tag'].map(literal_eval)

## Extract noun and verb phrases

In [17]:
part_of_speech_tag(df.Q3_pii_removed.iloc[0])[0][0]

('-', ':', '-')

In [18]:
# Chunk grammar handed to nltk's RegexpParser below. Each "label:" line opens
# a stage and every {...} rule under it chunks a sequence of part-of-speech
# tags (the tags come from spaCy's token.tag_ via part_of_speech_tag).
# Stages in order: cc, pronoun, noun_verb, verb, prep_noun, noun, adjective,
# rb, punct. The stage label is what ends up in Chunk.label.
# NOTE(review): in the final punct rule `<.>` looks like a one-character
# wildcard rather than a literal full stop, so it would presumably also match
# other one-character tags such as ":" - confirm against the NLTK tag-pattern rules.
grammar = r"""
    cc:
    {<CC>}
    pronoun:
    {<DT><IN><PRP>}
    {<IN>?<PRP>}
    noun_verb:
    {<IN>?<JJ.*>*<NN.*>+<HYPH>?<VBD|VBN|VBG><NN.*>*}
    verb:
    {<IN|TO>*<VB.*><IN|RP|TO>?<RB|WRB|TO>*<IN|TO>*}
    {<MD><VB.*><RB><IN>+}
    {<WRB|WP><TO>?<VB.*>+}
    {<TO><VB><IN>}
    {<TO>?<VB.*><IN|RP>?<WRB|RP|WP>?<IN|TO>?<VB.*>?}
    {<VB.*><TO><VB.*><RB>*<TO>?}
    {<IN><EX><VB.*>}
    {<RB><TO><VB.*>+}
    {<TO>?<VB.*><IN|WDT|WP|RP>}
    {<WP><VB.*>}
    {<VB.*><RB.*>+<VB.*>*<IN|TO>?}
    {<WDT>?<TO>?<MD|VB.*>?<RB>?<TO|IN>?<V.*>+<CC>?<V.*>*<IN|RP>?<IN>*}
    {<MD><RB>*<VB.*>*}
    {<VB.*><IN|TO><IN>*}
    {<TO><VB.*><IN>*}
    {<VB.*>}
    prep_noun:
    {(<CD><IN><DT>)?<IN><JJ.*>*<NN.*>}
    {<IN><NN.*><JJ.*>?<NN.*>+}
    {<IN><NN.*><HYPH>?<NN.*>*}
    {<IN>+<PRP\$>?<NN><CD>?}
    {<IN><CD><.*>}
    {<RB><RBS>?<CD|JJ.*>?<NN.*>+}
    {<RP>?<IN>+<JJ.*>*<NN.*>+}
    {<IN><DT><NN.*><JJ.*>*<NN><HYPH>?<NN>}
    {<IN><NN.*>(<HYPH>?<NN.*>)?}
    {<JJ.*>*<IN><DT>?<NN.*>+<CD>?<NN.*>?}
    {<IN>+<DT>*<JJ>?<CD>?<NN.*>+<CD>?<NN.*>?}
    noun:
    {<CD><NN.*>}
    {<PDT><PRP\$><NN.*>+}
    {<RB><DT>?<JJ.*>*<NN.*>}
    {<DT><HYPH>?<NN.*>}
    {<JJ.*><NN.*>*<CD>}
    {<NN.*><CD><JJ.*>?}
    {<JJ.*|NN.*><IN|TO><PRP>}
    {<CD><NN.*><JJ.*>}
    {<WRB><RB><JJ.*>*<NN.*>*}
    {<DT><JJ.*>*<NN.*>+}
    {<NN.*><CD>?<JJ.*>*<NN.*>*}
    {<IN>+<CD>*<POS>*<IN>*<NN.*>}
    {<IN><PRP\$>?<JJ.*>*<NN.*>}
    {<NN.*><HYPH><NN.*>}
    {<DT>?<CD>?<JJ.*>?<CC>?<JJ.*>?<NN.*>+}
    {<NN.*><HYPH>?<NN.*|JJ.*|VB.*>*}
    {(<NN|NNS>|<NNP|NNPS>)<NNP|NN|NNS|NNPS>+}
    {(<NN|NNS>+|<NNP|NNPS>+)<IN|CC>(<PRP\$|DT><NN|NNS>+|<NNP|NNPS>+)}
    {<JJ|RB|CD>*<NNP|NN|NNS|NNPS>+}
    {<NNP|NN|NNS|NNPS>+}
    {<CD><IN>?<NN.*>}
    adjective:
    {<RB>*<JJ.*><CD>?}
    rb:
    {<RB>+}
    punct:
    {<-RRB->|<-LRB->|<,>|<.>}
    """

class Chunk:
    """A labelled run of (word, tag, lemma) tokens with its derived text forms.

    Attributes:
        label: the chunk label passed in by the caller.
        tokens: list of (word, tag, lemma) triples.
        indices: positions of the tokens, as supplied by the caller.
        text / lemma: space-joined words / lemmas of all tokens.
        important_word / important_lemma: the same, restricted to tokens
            whose tag contains NN, VB, JJ or CD.
    """

    def __init__(self, label, tokens, indices):
        self.label = label
        self.tokens = tokens
        self.indices = indices
        # Each right-hand side still resolves to the method; the resulting
        # string then shadows that method on this instance.
        self.text = self.text()
        self.lemma = self.lemma()
        self.important_lemma = self.important_lemma()
        self.important_word = self.important_word()

    def text(self):
        """Space-joined surface words of every token."""
        words = (word for word, _tag, _lemma in self.tokens)
        return " ".join(words)

    def lemma(self):
        """Space-joined lemmas of every token."""
        lemmas = (lemma for _word, _tag, lemma in self.tokens)
        return " ".join(lemmas)

    def tagable_words(self):
        """(word, tag) pairs for the tokens whose tag contains NN or VB."""
        pairs = []
        for word, tag, _lemma in self.tokens:
            if re.search(r"(NN)|(VB)", tag):
                pairs.append((word, tag))
        return pairs

    def important_word(self):
        """Space-joined words of tokens whose tag contains NN, VB, JJ or CD."""
        kept = []
        for word, tag, _lemma in self.tokens:
            if re.search(r"(NN)|(VB)|(JJ)|(CD)", tag):
                kept.append(word)
        return " ".join(kept)

    def important_lemma(self):
        """Space-joined lemmas of tokens whose tag contains NN, VB, JJ or CD."""
        kept = []
        for _word, tag, lemma in self.tokens:
            if re.search(r"(NN)|(VB)|(JJ)|(CD)", tag):
                kept.append(lemma)
        return " ".join(kept)
# Build the chunk parser once from the grammar above; chunk_text() reuses it for every sentence.
parser = RegexpParser(grammar)

def chunk_text(tagged):
    """Chunk one tagged sentence and return its chunks as ``Chunk`` objects.

    Args:
        tagged: one sentence as a list of (word, tag, lemma) tuples.

    Returns:
        A list of ``Chunk`` objects, one per subtree in the parse result, in
        sentence order. Each chunk records its label, its leaves and the
        positions of those leaves in the sentence. Tokens that the grammar
        leaves outside any chunk are skipped but still advance the position.
    """
    segments = []
    index = 0
    for el in parser.parse(tagged):
        # isinstance (not an exact type comparison) so Tree subclasses are
        # treated as chunks as well.
        if isinstance(el, tree.Tree):
            leaves = el.leaves()
            span = list(range(index, index + len(leaves)))
            segments.append(Chunk(el.label(), leaves, span))
            index += len(leaves)
        else:
            index += 1
    return segments

def extract_phrase(sentences, merge_inplace=False):
    """Chunk every sentence and optionally merge same-label neighbours.

    Args:
        sentences: list of tagged sentences, each passed to ``chunk_text``.
        merge_inplace: when true, each sentence's chunk list is additionally
            run through ``merge_adjacent_chunks``.

    Returns:
        A list with one chunk list per input sentence.
    """
    per_sentence = [chunk_text(sentence) for sentence in sentences]
    if not merge_inplace:
        return per_sentence
    return list(map(merge_adjacent_chunks, per_sentence))

def merge_adjacent_chunks(chunks):
    """Fuse consecutive chunks that share a label into a single ``Chunk``.

    Chunks labelled "prep_noun" are never fused. A fused chunk concatenates
    the tokens and indices of its parts in order. Returns a new list; the
    input list is not modified.
    """
    merged = []
    last_label = ""
    for current in chunks:
        fusible = current.label == last_label and current.label != "prep_noun"
        if not fusible:
            merged.append(current)
        else:
            head = merged[-1]
            merged[-1] = Chunk(current.label,
                               head.tokens + current.tokens,
                               head.indices + current.indices)
        last_label = current.label
    return merged

def compute_combinations(sentences, n):
    """Return every run of ``n`` consecutive items taken within each sentence.

    Windows never span two sentences; a sentence shorter than ``n``
    contributes nothing. Windows are returned in sentence order, then by
    start position.
    """
    windows = []
    for chunks in sentences:
        last_start = len(chunks) - n
        for start in range(last_start + 1):
            windows.append(chunks[start:start + n])
    return windows
  

## Compute linguistic pattern combinations

In [19]:
def compute_linguistic_patterns(df_series, n):
    """Count the chunk texts seen for every sequence of ``n`` consecutive chunk labels.

    Args:
        df_series: object exposing ``.values``, an iterable of tagged comments
            (each a list of tagged sentences as accepted by ``extract_phrase``).
        n: number of consecutive chunks per combination.

    Returns:
        A dict mapping a tuple of ``n`` chunk labels to a ``Counter`` keyed by
        the tuple of the matching lower-cased chunk texts.
    """
    pattern_dictionary = {}

    # Plain `tqdm` replaces the deprecated `tqdm_notebook` alias, which emits
    # a deprecation warning on every call.
    for vals in tqdm(df_series.values):
        sents = extract_phrase(vals, True)

        for combo in compute_combinations(sents, n):
            key = tuple(c.label for c in combo)
            counter_key = tuple(c.text.lower() for c in combo)
            # setdefault keeps the result a plain dict while creating the
            # Counter the first time a label sequence is seen.
            pattern_dictionary.setdefault(key, Counter())[counter_key] += 1

    return pattern_dictionary

In [20]:
compute_linguistic_patterns(df.pos_tag[0:10], 2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




{('verb',
  'noun'): Counter({('signed up for', 'repatriation advice'): 1,
          ('told to register as', 'a vulnerable person'): 1,
          ('have', 'a mother'): 1,
          ('has', 'diabetes'): 1,
          ('am checking on', 'daily'): 1,
          ('taking', 'food'): 1,
          ('go there once', 'a week'): 1,
          ('to change', 'bedding'): 1,
          ('is safely in', 'conservatory'): 1,
          ('work for', 'a train company'): 1,
          ('serve', 'drinks snacks'): 1,
          ('to come in to clean inside', 'trains'): 1,
          ('do n’t want to come in as caring for', 'mother'): 1,
          ('get', 'a sick note'): 1,
          ('do n’t want to run', 'the risk'): 1,
          ('does survive', 'this horrible disease'): 1}),
 ('noun', 'adjective'): Counter({('repatriation advice', 'due'): 1}),
 ('adjective', 'prep_noun'): Counter({('due', 'to covid 19 outbreak'): 1}),
 ('verb',
  'adjective'): Counter({('to register as', 'vulnerable'): 1,
          ('is 90yrs', 

## Regular expression matches for themes of interest.
The focus is on tagging verbs and on tagging the second argument of each verb.

In [21]:
# Ordered (theme, pattern) rules for regex_for_theme(). The first pattern that
# matches wins, so specific rules (e.g. "no-information") must stay ahead of
# the general ones they overlap with (e.g. "information").
# Every fragment is a raw string: some continuation fragments used to be
# plain literals containing "\s", an invalid escape sequence that newer
# Python versions warn about.
# NOTE(review): in the "health-problem" rule the alternation binds as
# health | "heart problem" | issue | condition | ..., so a bare "issue" or
# "condition" is tagged as a health problem. Left unchanged because existing
# results depend on it - confirm whether "(health|heart) (problem|...)" was meant.
_THEME_RULES = tuple(
    (theme, re.compile(pattern))
    for theme, pattern in (
        ("self-employ", r"self\s?(-|\s)\s?employ"),
        ("delivery", r"(deliver(y|(ies)|(ed)))|(slot)|(online shopping)"),
        ("vulnerable", r"vulnerable"),
        ("disabled", r"disab((led)|(ility))"),
        ("no-symptoms", r"no symptom"),
        ("covid-mention", r"((corona)?(virus))|(covid)"),
        ("health-problem",
         r"""((health)|(heart) (problem)|(issue)|(condition)|(attack)|(disease)|(failure))|( ms)|"""
         r"""(copd)|(asthma)|((type)\s?[12])|(diabet)|"""
         r"""(cancer)|(dementia)|(stroke)|(illness)|(a type$)|(cough)|(leukaemia)"""),
        ("symptoms", r"symptom"),
        ("at-risk", r"((at)?(\s(very\s)?high)?\srisk)|(risk list)"),
        ("elderly",
         r"""((((a|'|’)m( (in|at)( my)?)?)|aged) (over(-|\s))?"""
         r"""(([789][0-9]($|s|\s))|(old)|(elderly)))|((over(-|\s))?[789][0-9] y)"""),
        ("carer", r"(carer)|(care home)"),
        ("key-worker", r"(key\s?(\s|-)?\s?worker)|(nurse($|\s))|(essential worker)"),
        ("cannot-work", r"can\s?(no|'|’)?t work"),
        ("no-income", r"no ((work)|(income)|(money)|(wage)|(salar))"),
        ("laid-off", r"(furlough)|(fired)|(80 %)"),
        ("cannot-get-food",
         r"""(((can\s?(no|'|’)?t (get|buy|(shop for)))|"""
         r"""((do not)?ha(ve|d) )(no|any|(not enough))?) (food|groceries))"""),
        ("cannot-get-med", r"can\s?(no|'|’)?t get ((med)|(prescription))"),
        ("get-med", r"(^med)|(prescription)"),
        ("travel", r"(travel(\s(advi[sc]e)|(status))?)|(flight)|(destination)"),
        ("no-information",
         r"""(no\s)(\w*\s)?((info)|(clarification)|(advi[sc]e)|((contact )?((details)|(number)))|"""
         r"""(answer)|(update)|(clarity)|(guid(e|(ance)))|(list)|(definition)|"""
         r"""(address)|(link)|(form)|(contact)|(mention))"""),
        ("information",
         r"""(info)|(clarification)|(advi[sc]e)|((contact )?((details)|(number)))|"""
         r"""(answer)|(update)|(clarity)|(guid(e|(ance)))|(list)|(definition)|"""
         r"""(address)|(link)|(form)|(contact)"""),
        ("no-correspondence",
         r"""(no)\s((letter)|(t(e)?xt)|(message)|(e(\s|(\s?-\s?))?mail)|"""
         r"""(alert)|(notice)|(communication))"""),
        ("correspondence", r"(letter)|(t(e)?xt)|(message)|(e(\s|(\s?-\s?))?mail)|(alert)|(notice)"),
        ("no-one", r"(no\s?((family)|(one)))|(nothing)|(nobody)"),
        ("no-support", r"no ((support)|(aid)|(help)|(assistance)|(access)|(priority))"),
        ("support", r"(support)|(aid)|(help)|(assistance)|(access)|(priority)"),
        ("child", r"(child)|((^|\s)son)|(daughter)"),
        ("family",
         r"""(parent)|(husband)|(wife)|(partner)|"""
         r"""((mo|fa)ther)|(famil(y|(ies)))|(m[uo]m)|(dad)"""),
        ("rules", r"(rule)|(restriction)|(measure)|(rights)"),
        ("uncertainty", r"((no)|(a(ny)?)) ((way)|(option)|(choice)|(means)|(idea))"),
        ("work", r"work ((for)|(in)|(at)|(on))"),
        ("self-isolation", r"((self\s|-)?isolat((ion)|(e)|(ing)))|(lock\s?(\s|-)?\s?down)"),
        ("license", r"(driv(ing|ers)\s)?licen[sc]e"),
        ("passport", r"passport"),
        ("pension", r"pension"),
        ("home-mention", r"(^|\s)h((ome)|(ouse))"),
        ("work-mention", r"(employ)|(work)|(job)|(business)|(company)"),
        ("benefit", r"(benefit)|(universal credit)|(eligible)|(esa)|(ssp)|(pip)|(allowance)"),
        ("school", r"(school)|(student)"),
        ("goods", r"(food)|(supplies)|(shopping)|(groceries)"),
        ("given-money", r"(money)|(grant)|(fund)|(relief)"),
        ("bills-to-pay", r"(bill)|(tax)|(mortgage)|(rent)|(loan)|(debt)|(fine)|(fee)|(insurance)"),
        ("scheme", r"scheme"),
        ("visa", r"(^|\s)visa($|\s)"),
        ("data", r"(data)|(cases)|(situation)|(stat(istic)?s?$)|(status)|(news)|(progress)"),
        ("death", r"dea((th)|d)"),
    )
)


def regex_for_theme(text):
    """Return the first theme whose pattern is found in ``text``.

    The text is lower-cased once and tested against the ordered rules in
    ``_THEME_RULES``; the label of the first matching rule is returned, or
    "unknown" when no rule matches.

    Args:
        text: the comment or chunk text to classify.

    Returns:
        A theme label string such as "delivery", "health-problem" or "unknown".
    """
    lowered = text.lower()
    for theme, pattern in _THEME_RULES:
        if pattern.search(lowered):
            return theme
    return "unknown"


In [22]:
regex_for_theme("stats")

'data'

In [23]:
# Ordered (group, pattern) rules for regex_group_verbs(); the first match wins.
# Every fragment is a raw string: the continuation fragment of the first rule
# used to be a plain literal containing "\s", an invalid escape sequence that
# newer Python versions warn about.
_VERB_RULES = tuple(
    (group, re.compile(pattern))
    for group, pattern in (
        ("find-smthg",
         r"""(f(i|(ou))nd)|(look)|(search)|(clarify)|(ask)|(read)|([ei]nquire)|"""
         r"""(obtain)|(seek)|(know)|((^|\s)see($|\s))|(understand)"""),
        ("access-smthg", r"(access)|(check)|(complete)|(cancel)|(book)|(confirm)"),
        ("acquire-smthg", r"(get)|(take)|(claim)|(receive)|(sent)|(collect)"),
        ("change-smthg", r"(renew)|(change)|(update)|(inform$)|(notify)"),
        ("apply-smthg", r"(appl(y|(ied)))|(register)|(qualify)|(sign)"),
        ("pay-smthg", r"pa(y|(id)|(yed))"),
        ("contact-smthg", r"(contact)|(report)"),
        ("work-smwhr", r"(work)|(employ)"),
        ("need-smthg", r"(need)|(want)|(require)|(request)|(would like)|(order)"),
        ("my-situation", r"(have)|((a|'|’|^)m($|\s))|(feel($|\s))"),
        ("others-situation", r"(has)|(((a|we)|'|’|^)re($|\s))"),
        ("unclear-situation", r"(had)|((i|'|’|^)s($|\s))|(was)"),
        ("travel", r"travel"),
        ("living", r"(liv(e|(ing)))|(stay)"),
        ("do-smthng", r"(do)|(make)"),
        ("go-smwhr", r"go($|\s)"),
        ("give-smthng", r"(give)|(provide)"),
        ("help", r"(help)|(protect)|(support)"),
    )
)


def regex_group_verbs(verb):
    """Return the first verb group whose pattern is found in ``verb``.

    The input is matched as given (it is not lower-cased here; the visible
    callers pass lower-cased text). Rules are tried in the order of
    ``_VERB_RULES``.

    Args:
        verb: the verb phrase text to classify.

    Returns:
        A group label such as "find-smthg" or "my-situation", or "unknown"
        when no rule matches.
    """
    for group, pattern in _VERB_RULES:
        if pattern.search(verb):
            return group
    return "unknown"

## Test run code.

In [24]:
# Walk through one comment that mentions "letter": print its overall theme,
# its tagged tokens, every chunk with its theme, and the adjacent chunk pairs.
# (An earlier `example = df.iloc[7]` was removed: it was overwritten on the
# next line before being used.)
example = df[df.Q3_x.str.contains("letter")].iloc[0]
print(f"Themetatic category for entire comment: {regex_for_theme(example.Q3_x)}")

print(example.Q3_x)
print()
print(example.pos_tag)
print()
for sent in extract_phrase(example.pos_tag, False):
    for chunk in sent:
        # Verbs get a verb group, noun-like chunks get a theme, everything else stays blank.
        theme = ""
        if chunk.label in ["verb"]:
            theme = regex_group_verbs(chunk.text.lower())
        if chunk.label in ["noun", "prep_noun", "noun_verb"]:
            theme = regex_for_theme(chunk.text.lower())

        print("{0:10} {1:35} {2:20} {3}".format(chunk.label.upper(), chunk.text, theme, chunk.indices))
    print()
    for combo in compute_combinations([sent], 2):
        print(f"{combo[0].text}, {combo[1].text}")

    print("=====")

Themetatic category for entire comment: vulnerable
I have heard about free food parcels for the extremely-vulnerable, I have received a letter to stay in but have no idea how to request a parcel for I believe toiletries etc

[[('I', 'PRP', '-PRON-'), ('have', 'VBP', 'have'), ('heard', 'VBN', 'hear'), ('about', 'IN', 'about'), ('free', 'JJ', 'free'), ('food', 'NN', 'food'), ('parcels', 'NNS', 'parcel'), ('for', 'IN', 'for'), ('the', 'DT', 'the'), ('extremely', 'RB', 'extremely'), ('-', 'HYPH', '-'), ('vulnerable', 'JJ', 'vulnerable'), (',', ',', ','), ('I', 'PRP', '-PRON-'), ('have', 'VBP', 'have'), ('received', 'VBN', 'receive'), ('a', 'DT', 'a'), ('letter', 'NN', 'letter'), ('to', 'TO', 'to'), ('stay', 'VB', 'stay'), ('in', 'RB', 'in'), ('but', 'CC', 'but'), ('have', 'VBP', 'have'), ('no', 'DT', 'no'), ('idea', 'NN', 'idea'), ('how', 'WRB', 'how'), ('to', 'TO', 'to'), ('request', 'VB', 'request'), ('a', 'DT', 'a'), ('parcel', 'NN', 'parcel'), ('for', 'IN', 'for'), ('I', 'PRP', '-PRON-

I have a mother in law who is 90yrs old, lives on her own, has diabetes and asthma, 
so there for self isolating. I am checking on her daily, taking food etc. 
I go there once a week to change her bedding, which she is safely in her conservatory. 
I work for a train company but not in a safety critical role, 
I only serve drinks/snacks, which has been suspended now. 
They want me to come in to clean inside trains ( where the public are) 
and the stations. I have said that I don’t want to come in as caring for my mother in law. They have told me 
that I will have to go off sick and get a sick note from GP? 
Then I will only get statutory sick pay! I don’t want to run the risk 
of infecting my at risk elderly parent. Where do I stand?

## Inspect arg1-arg2 grammatical patterns

In [25]:
df.pos_tag[1][0][0]

('Signed', 'VBN', 'sign')

In [37]:
pattern_d = compute_linguistic_patterns(df.pos_tag, 2)
len(pattern_d)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=10506.0), HTML(value='')))




73

In [27]:
# (first chunk label, second chunk label) pairs singled out for inspection.
patterns_of_interest = [
    ('verb', 'noun'),
    ('noun', 'prep_noun'),
    ('prep_noun', 'prep_noun'),
    ('verb', 'noun_verb'),
    ('verb', 'prep_noun'),
    ('noun', 'noun_verb'),
    ('noun_verb', 'prep_noun'),
]

## Compute `arg1` - `arg2` co-occurrence db - couples

In [28]:
# pattern_db[(label1, label2)][arg1_text][arg2_text] -> number of times the
# two lower-cased chunk texts occur next to each other with those labels.
pattern_db = {}

# Plain `tqdm` replaces the deprecated `tqdm_notebook` alias, which emits a
# deprecation warning on every call.
for vals in tqdm(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        arg1 = combo[0].text.lower()
        arg2 = combo[1].text.lower()

        # Create the nested dict / Counter the first time a key is seen.
        pattern_db.setdefault(key, {}).setdefault(arg1, Counter())[arg2] += 1

print(f"There are {len(pattern_db)} possible grammatical combos.")
# Show the 15 label pairs that have the most distinct first arguments.
for k, v in sorted(pattern_db.items(), key=lambda x: len(x[1]), reverse=True)[0:15]:
    print(k, len(v))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=10506.0), HTML(value='')))


There are 73 possible grammatical combos.
('verb', 'noun') 9341
('noun', 'punct') 4251
('noun', 'prep_noun') 3513
('noun', 'verb') 3435
('verb', 'pronoun') 2649
('verb', 'adjective') 2499
('noun', 'cc') 2352
('prep_noun', 'punct') 2319
('verb', 'punct') 1780
('prep_noun', 'noun') 1594
('noun', 'pronoun') 1556
('prep_noun', 'verb') 1484
('prep_noun', 'prep_noun') 1403
('prep_noun', 'cc') 1064
('noun', 'rb') 1027


## List of verbs currently not being captured/categorized with regex

In [29]:
# The 100 verbs that occur most often before a noun chunk, most frequent first.
ranked_verbs = sorted(
    pattern_db[('verb', 'noun')].items(),
    key=lambda item: sum(item[1].values()),
    reverse=True,
)
top_100_verbs = [verb_text.lower() for verb_text, _ in ranked_verbs[0:100]]

# Number and print the ones that regex_group_verbs() cannot place in any group.
counter = 0
for verb in top_100_verbs:
    if regex_group_verbs(verb) != "unknown":
        continue
    counter += 1
    print(counter, verb)
        

1 got
2 to tax
3 can
4 came to
5 having
6 driving
7 use
8 regarding
9 says
10 say
11 following
12 to send


## Create co-occurrence tables for `verb_theme-verb`, `verb_theme-argument_theme`, `argument_theme-argument` pairs

In [30]:
def update_argument_theme_dictionary(dict_new, dict_old):
    """Add the per-theme counts in ``dict_new`` to ``dict_old``, in place.

    Both arguments map a theme to a mapping of ``value -> count``. Themes
    missing from ``dict_old`` are created as empty ``Counter`` objects before
    the counts are added. Nothing is returned; ``dict_old`` is mutated.
    """
    for theme in dict_new:
        merged = dict_old.setdefault(theme, Counter())
        for argument, occurrences in dict_new[theme].items():
            merged[argument] += occurrences

In [47]:
# verb theme -> Counter of verbs
verb_themes = {}
# verb theme -> argument theme -> Counter of arguments
verb_argument_themes = {}
# argument theme -> Counter of arguments
argument_themes = {}

# (verb theme, argument theme) -> verb -> Counter of arguments
verb_argument_fillers = {}

for pattern in [('verb', 'noun'), ('verb', 'prep_noun')]:
    print(f"There are {len(pattern_db[pattern])} {pattern[0]}s, accompanied by {pattern[1]}s.")

    # Walk the verbs from most to least frequent; the tables below are
    # therefore filled in that order.
    verbs_by_frequency = sorted(pattern_db[pattern].items(),
                                key=lambda item: sum(item[1].values()),
                                reverse=True)

    for arg1, arg2 in verbs_by_frequency:
        verb_theme = str(regex_group_verbs(arg1)).upper()

        verb_themes.setdefault(verb_theme, Counter())[arg1] += sum(arg2.values())
        verb_argument_themes.setdefault(verb_theme, {})

        # Argument themes seen with this one verb; merged into the verb theme
        # once all of the verb's arguments have been processed.
        local_themes = {}

        for arg2_val, arg2_counts in arg2.items():
            theme = str(regex_for_theme(arg2_val)).upper()

            local_themes.setdefault(theme, Counter())[arg2_val] += arg2_counts
            argument_themes.setdefault(theme, Counter())[arg2_val] += arg2_counts

            fillers_for_pair = verb_argument_fillers.setdefault((verb_theme, theme), {})
            fillers_for_pair.setdefault(arg1, Counter())[arg2_val] += arg2_counts

        update_argument_theme_dictionary(local_themes, verb_argument_themes[verb_theme])
        

There are 9341 verbs, accompanied by nouns.
There are 358 verbs, accompanied by prep_nouns.


In [40]:
# Number of distinct argument themes and of distinct verb themes collected so far.
len(argument_themes), len(verb_themes)

(44, 19)

In [67]:
# Preview the first ten (verb theme, argument theme) pairs together with the
# total number of occurrences recorded for each pair.
for theme_pair, fillers_by_verb in list(verb_argument_fillers.items())[:10]:
    pair_total = sum(count
                     for arguments in fillers_by_verb.values()
                     for count in arguments.values())
    print(theme_pair, pair_total)

('MY-SITUATION', 'FAMILY') 52
('MY-SITUATION', 'HEALTH-PROBLEM') 272
('MY-SITUATION', 'INFORMATION') 136
('MY-SITUATION', 'UNCERTAINTY') 30
('MY-SITUATION', 'NO-INCOME') 46
('MY-SITUATION', 'NO-ONE') 60
('MY-SITUATION', 'CARER') 19
('MY-SITUATION', 'CORRESPONDENCE') 93
('MY-SITUATION', 'TRAVEL') 9
('MY-SITUATION', 'UNKNOWN') 1569


### Create a dataframe with the 20 most frequently occurring generic phrases and their components

In [84]:
def _total_occurrences(fillers_by_verb):
    """Sum every argument count across all verbs of one theme pair."""
    return sum(count
               for arguments in fillers_by_verb.values()
               for count in arguments.values())


# Theme pairs ordered from most to least frequent; only the first 100 are considered.
ranked_patterns = sorted(verb_argument_fillers.items(),
                         key=lambda item: _total_occurrences(item[1]),
                         reverse=True)[0:100]

row_list = []
for pattern, values in ranked_patterns:
    # Skip pairs whose verb theme or argument theme is 'UNKNOWN'.
    if 'UNKNOWN' in pattern:
        continue

    total_occ = _total_occurrences(values)
    print(pattern, total_occ)

    local_arguments = Counter()
    local_verbs = Counter()

    # Visit verbs (and, per verb, arguments) in descending frequency so that
    # ties in the counters below keep a frequency-based order.
    verbs_by_frequency = sorted(values.items(),
                                key=lambda item: sum(item[1].values()),
                                reverse=True)
    for verb, argument_values in verbs_by_frequency:
        local_verbs[verb] += sum(argument_values.values())
        for argument, count in argument_values.most_common():
            local_arguments[argument] += count

    verbs = [f"{name} : {n}" for name, n in local_verbs.most_common()]
    arguments = [f"{name} : {n}" for name, n in local_arguments.most_common()]

    print("verbs:\n", verbs[0:10])
    print("arguments:\n", arguments[0:10])

    row = {
        "Generic phrase": f"{pattern[0]} - {pattern[1]}",
        "# of times occured": total_occ,
        "Verb category": f"{pattern[0]}",
        "Total # of verbs": sum(local_verbs.values()),
        "# of unique verbs": len(local_verbs),
        "Verb values": "\n".join(verbs[0:10]),
        "Argument category": f"{pattern[1]}",
        "Total # of arguments": sum(local_arguments.values()),
        "# of unique arguments": len(local_arguments),
        "Argument values": "\n".join(arguments[0:10]),
    }
    row_list.append(row)
    print("====")

('FIND-SMTHG', 'INFORMATION') 616
verbs:
 ['to find : 109', 'looking for : 80', 'was looking for : 39', 'can not find : 29', 'to find out : 24', 'to look for : 22', 'find : 18', "ca n't find : 13", 'trying to find : 13', 'am looking for : 11']
arguments:
 ['information : 121', 'advice : 63', 'any information : 24', 'guidance : 24', 'info : 20', 'an answer : 13', 'details : 13', 'an email address : 12', 'a link : 9', 'the form : 9']
====
('ACQUIRE-SMTHG', 'DELIVERY') 325
verbs:
 ['to get : 88', 'can not get : 52', 'get : 29', 'can get : 26', "ca n't get : 15", 'ca n’t get : 14', 'could get : 6', 'ca nt get : 5', 'in getting : 4', 'trying to get : 3']
arguments:
 ['a slot : 39', 'a delivery slot : 34', 'a delivery : 21', 'home delivery : 13', 'deliveries : 10', 'a home delivery : 9', 'delivery slots : 8', 'a food delivery : 8', 'food deliveries : 8', 'home deliveries : 7']
====
('ACQUIRE-SMTHG', 'CORRESPONDENCE') 298
verbs:
 ['received : 86', 'have not received : 38', 'have received : 33

In [86]:
# Write only the first 20 rows of `row_list` to disk.
top_generic_phrases = pd.DataFrame(row_list)[0:20]
top_generic_phrases.to_csv(os.path.join(DATA_DIR, "generic_phrase_counts_top_20.csv"),
                           index=False)

### Overview themes + values assigned

In [72]:
def _theme_total(argument_counters):
    """Total count over every argument theme of one verb theme."""
    return sum(sum(counter.values()) for counter in argument_counters.values())


# Verb themes other than "UNKNOWN", most frequent first.
known_verb_themes = [(k, v) for k, v in verb_argument_themes.items() if k != "UNKNOWN"]
known_verb_themes.sort(key=lambda item: _theme_total(item[1]), reverse=True)

for i, (key, value) in enumerate(known_verb_themes, 1):
    print(f"{i}. {key} {_theme_total(value)} {len(value)} \n======")

    # Argument themes of this verb theme (again without "UNKNOWN"), most frequent first.
    known_argument_themes = sorted(((k, v) for k, v in value.items() if k != "UNKNOWN"),
                                   key=lambda item: sum(item[1].values()),
                                   reverse=True)
    for argument, counter in known_argument_themes:
        print(f"{argument} {sum(counter.values())}")
        # Five most common fillers of this argument theme.
        for filler, occurrences in counter.most_common(5):
            print(f"{filler}: {occurrences}")
        print("---")
    print()
            

1. MY-SITUATION 3208 43 
HEALTH-PROBLEM 272
copd: 42
asthma: 20
diabetes: 15
health problems: 11
cancer: 11
---
INFORMATION 136
the list: 13
list: 11
the form: 6
details: 4
a number: 3
---
WORK-MENTION 130
work: 42
a job: 6
an employee: 4
job: 4
2 jobs: 3
---
CORRESPONDENCE 93
a letter: 48
letter: 7
letters: 4
email: 2
letter.no: 1
---
ELDERLY 93
77 years: 8
73 years: 8
81 years: 5
80 years: 5
71 years: 5
---
VULNERABLE 78
a vulnerable person: 26
extremely vulnerable person: 5
the vulnerable list: 4
extremely vulnerable group: 4
the vulnerable category: 3
---
NO-ONE 60
nothing: 24
no one: 16
no family: 15
nobody shops: 1
nothing other: 1
---
HOME-MENTION 55
home: 27
housebound: 5
house: 5
the house: 3
home sats: 1
---
FAMILY 52
husband: 4
a single parent: 4
wife: 3
family: 2
mother: 2
---
NO-INCOME 46
no income: 21
no money: 14
no work: 5
no wages: 3
no money national insurance number: 1
---
BILLS-TO-PAY 46
a mortgage: 4
a tax rebate: 4
bills: 3
debt: 2
tax return: 2
---
COVID-MENTION 

nothing: 1
alone with no one: 1
---
VULNERABLE 3
a vulnerable person: 3
---
ELDERLY 3
85 year: 1
78 year: 1
93 year: 1
---
HEALTH-PROBLEM 3
asthma: 1
no underlying health problems: 1
health reasons: 1
---
GET-MED 3
medication: 1
meds: 1
husbands prescription: 1
---
DELIVERY 2
slot: 1
slots: 1
---
SUPPORT 2
some support thk: 1
any support: 1
---
BENEFIT 2
ssp: 2
---
UNCERTAINTY 1
a way: 1
---
AT-RISK 1
a high risk person: 1
---
KEY-WORKER 1
dad not a key worker: 1
---
CHILD 1
daughter: 1
---
BILLS-TO-PAY 1
feet: 1
---
GIVEN-MONEY 1
no funds: 1
---
CORRESPONDENCE 1
letter: 1
---
SELF-ISOLATION 1
isolation: 1
---
SCHOOL 1
all students: 1
---

14. CONTACT-SMTHG 237 18 
WORK-MENTION 9
employer: 3
work: 2
a company: 1
business open: 1
the jobcentre: 1
---
DEATH 9
a death: 5
the death: 2
a death bereavement: 1
death: 1
---
CHILD 6
son: 2
child maintenance: 2
the child maintenance service: 1
child benefit department: 1
---
FAMILY 5
fathers deah: 1
mum death: 1
the other parent: 1
a parent: 1
a

## Assign themes to actions and things people are talking about 
### Tag response comments (Q3) with appropriate themes

In [36]:
# `tqdm.tqdm_notebook` is deprecated (this notebook's own output warns about it);
# `tqdm.notebook.tqdm` is the replacement named by that warning.
from tqdm.notebook import tqdm as notebook_tqdm

# Label pairs that are turned into theme mentions. Kept in a set, built once,
# instead of rebuilding a list for every combination.
THEME_MENTION_PATTERNS = {
    ('verb', 'noun'), ('verb', 'prep_noun'),
    ('verb', 'noun_verb'), ('noun', 'prep_noun'),
    ('prep_noun', 'noun'), ('prep_noun', 'prep_noun'),
}

# One list per row of `df`; each entry is
# (label pair, phrase text, "<verb theme> - <argument theme>", (arg1, arg2)).
phrase_mentions = []
for vals in notebook_tqdm(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    row_mentions = []
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        if key not in THEME_MENTION_PATTERNS:
            continue

        arg1 = combo[0].text.lower()
        arg2 = combo[1].text.lower()

        # Themes are derived from the raw lower-cased text, before the
        # bracket / plus characters are stripped below.
        mention_theme = f"{regex_group_verbs(arg1)} - {regex_for_theme(arg2)}"

        arg1 = re.sub(r"\(|\)|\[|\]|\+", "", arg1)
        arg2 = re.sub(r"\(|\)|\[|\]|\+", "", arg2)
        phrase = f"{arg1} {arg2}"
        row_mentions.append((key, phrase, mention_theme, (arg1, arg2)))
    phrase_mentions.append(row_mentions)

df['theme_mentions'] = phrase_mentions

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=10506.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
# Spot-check: among rows with at least one theme mention, show the mentions of
# the row at position 100.
df[df['theme_mentions'].str.len()>0].iloc[100].theme_mentions

In [None]:
def _mention_themes(mentions):
    """Keep only the third element (the theme string) of every mention tuple."""
    return [mention_theme for _, _, mention_theme, _ in mentions]


df['theme_mentions_list'] = df['theme_mentions'].map(_mention_themes)

In [None]:
def get_user_group(arg1, arg2):
    """Return the self-description in ``arg2`` when ``arg1`` is a self-describing verb.

    ``arg1`` qualifies when it ends in ``'m`` / ``’m``, is exactly ``m`` or
    ``am``, or ends in ``have been`` or ``feel``. In that case ``arg2`` is
    returned with one leading article removed; otherwise the empty string is
    returned.

    The article pattern covers "the", "a" and "an". Previously "an" was
    missed, so "an employee" kept its article while "a carer" did not.
    """
    if re.search(r"((('|’|^(a)?)m)|(have been)|(feel))$", arg1):
        return re.sub(r"^(?:the|an?)\s", "", arg2)
    return ""

def resolve_function(x):
    """Collect the non-empty user groups found among one row's mentions.

    ``x`` is an iterable of 4-tuples ``(labels, _, _, args)``. Only mentions
    whose first label contains "verb" are passed to ``get_user_group``; empty
    results are dropped and the order of the mentions is kept.
    """
    found_groups = []
    for theme, _, _, args in x:
        if "verb" not in theme[0]:
            continue
        group = get_user_group(*args)
        if group != "":
            found_groups.append(group)
    return found_groups

In [None]:
# Sanity check of `resolve_function` on a single hand-made row: one
# ('verb', 'noun') mention whose verb is "feel" and whose argument starts with "the".
args = [[('verb', 'noun'),None,None,('feel', 'the key-worker')]]
resolve_function(args)

In [None]:
# For every row, derive the list of user groups from its theme mentions.
df['theme_mentions_user'] = df['theme_mentions'].map(resolve_function)

In [None]:
# Frequency of every user group over the rows that have at least one.
rows_with_groups = df[df['theme_mentions_user'].str.len()>0]
user_groups = Counter(group
                      for groups in rows_with_groups.theme_mentions_user.values
                      for group in groups)
user_groups.most_common(10), "housebound" in user_groups, len(user_groups)

In [None]:
import regex
from difflib import SequenceMatcher as SM
from nltk.util import ngrams
import codecs

# Sample inputs for `find_needle`: the needle lacks the word "to" that the
# haystack has, so an exact substring search would not find it.
needle = "told register as a vulnerable person"
hay    = "told to register as a vulnerable person for delivery service for on line shopping"

def find_needle(needle, hay):
    """Find the stretch of ``hay`` that best corresponds to ``needle``.

    The haystack is scanned with a sliding window of whitespace-separated
    words, 65% longer than the needle, and the window most similar to the
    needle (``SequenceMatcher`` ratio, first window wins ties) is kept. If no
    window scores above zero, for instance because the haystack is shorter
    than the window, the whole haystack is used. Inside that text the result
    is the span from the needle's first token to its last token, or the token
    itself for a one-word needle.

    Parameters
    ----------
    needle : str
        Phrase to look for.
    hay : str
        Text to search in.

    Returns
    -------
    dict
        ``{needle: matched_text}``, or ``{needle: None}`` when nothing matches
        or the needle contains no tokens.

    Changes from the previous implementation:
    - tokens are regex-escaped, so needles containing characters such as
      ``*`` or ``?`` no longer raise or match the wrong text;
    - tokens come from ``needle.split()`` throughout, so stray spaces no
      longer create empty first/last tokens that over-match;
    - only the standard library is used (``re`` and a plain sliding window).
    """
    needle_tokens = needle.split()
    needle_length = len(needle_tokens)
    if needle_length == 0:
        return {needle: None}

    window = needle_length + int(.65*needle_length)
    hay_words = hay.split()

    max_sim_val = 0
    max_sim_string = ""
    # Produces no windows at all when the haystack has fewer words than `window`.
    for start in range(len(hay_words) - window + 1):
        hay_ngram = " ".join(hay_words[start:start + window])
        similarity = SM(None, hay_ngram, needle).ratio()
        if similarity > max_sim_val:
            max_sim_val = similarity
            max_sim_string = hay_ngram

    if max_sim_string == "":
        max_sim_string = hay

    first_token = re.escape(needle_tokens[0])
    if needle_length == 1:
        expression = first_token
    else:
        last_token = re.escape(needle_tokens[-1])
        expression = f"({first_token}).*({last_token})"

    result = re.search(expression, max_sim_string)
    if result is not None:
        return {needle: result.group()}
    return {needle: None}

# Smoke checks: a multi-word needle against the sample haystack, and a
# single-word needle that occurs verbatim in its haystack.
print(find_needle(needle, hay))
print(find_needle("housebound", "i am housebound"))

In [None]:
# Spot-check the row at position 142: its theme mentions next to the
# `Q3_pii_removed` text (presumably the PII-stripped Q3 comment; confirm upstream).
df[['theme_mentions', "Q3_pii_removed"]].iloc[142].values

In [None]:
## remove special characters
def _strip_brackets_and_collapse(text):
    """Drop ``( ) [ ] +`` characters and collapse whitespace runs to single spaces."""
    return ' '.join(re.sub(r"\(|\)|\[|\]|\+", "", text).split())


# Missing comments become empty strings. `fillna` is used because
# `Series.replace(..., regex=True)` is documented to require a string pattern,
# which `np.nan` is not.
df["Q3_pii_removed"] = df["Q3_pii_removed"].fillna('').progress_map(_strip_brackets_and_collapse)

# `Q3_x` is left untouched; the cleaned text goes into `Q3_x_edit`.
df["Q3_x_edit"] = df["Q3_x"].fillna('').progress_map(_strip_brackets_and_collapse)

## Create columns for `phrases` and `user_groups`

In [None]:
# Spot-check the row at position 6: its theme mentions next to the cleaned comment.
df[['theme_mentions', "Q3_x_edit"]].iloc[6].values

In [None]:
def _locate_phrases(row):
    """Run `find_needle` on the lower-cased comment for each mention's phrase text."""
    return [find_needle(phrase, row["Q3_x_edit"].lower())
            for _, phrase, _, _ in row["theme_mentions"]]


def _located_phrase_values(phrase_dicts):
    """Flatten the per-phrase dicts, dropping phrases that were not located."""
    if isinstance(phrase_dicts, float):
        # A float here is a missing (NaN) cell rather than a list of dicts.
        return []
    return [value
            for phrase_dict in phrase_dicts
            for value in phrase_dict.values()
            if value is not None]


df['phrases_dict'] = df[['theme_mentions', "Q3_x_edit"]].progress_apply(_locate_phrases, axis=1)
df['phrases_list'] = df['phrases_dict'].progress_map(_located_phrase_values)
df['phrases'] = df['phrases_list'].progress_map(", ".join)

In [None]:
# How often each located phrase occurs across all comments.
phrase_counts = Counter(phrase
                        for phrase_list in df.phrases_list.values
                        for phrase in phrase_list)
phrase_counts.most_common(50)

In [None]:
# First ten rows with a non-empty `phrases` string, next to the cleaned comment.
df[df['phrases']!=''][['phrases', 'Q3_x_edit']].head(10)

In [None]:
def _locate_user_phrases(row):
    """Run `find_needle` on the lower-cased comment for each user-group phrase."""
    return [find_needle(phrase, row["Q3_x_edit"].lower())
            for phrase in row["theme_mentions_user"]]


def _located_user_phrase_values(phrase_dicts):
    """Flatten the per-phrase dicts, dropping phrases that were not located."""
    if isinstance(phrase_dicts, float):
        # A float here is a missing (NaN) cell rather than a list of dicts.
        return []
    return [value
            for phrase_dict in phrase_dicts
            for value in phrase_dict.values()
            if value is not None]


df['user_phrases_dict'] = df[['theme_mentions_user', "Q3_x_edit"]].progress_apply(_locate_user_phrases, axis=1)
df['user_phrases_list'] = df['user_phrases_dict'].progress_map(_located_user_phrase_values)

df['user_phrases'] = df['user_phrases_list'].progress_map(", ".join)

In [None]:
# Frequency of every located user-group phrase over the rows that have at least one.
rows_with_user_phrases = df[df['user_phrases_list'].str.len()>0]
user_groups = Counter(phrase
                      for phrases_in_row in rows_with_user_phrases.user_phrases_list.values
                      for phrase in phrases_in_row)
user_groups.most_common(10), "housebound" in user_groups, len(user_groups)

### Count phrases that cannot be found in the original comment

In [None]:
# Count phrase matches whose string form does not appear in the lower-cased
# comment. Unlocated phrases are stored as None and compared as the text "None".
checked_rows = df[~df['phrases_dict'].isna()][['phrases_dict', 'Q3_x_edit']].values
missing = sum(str(matched) not in comment.lower()
              for phrase_dicts, comment in checked_rows
              for phrase_dict in phrase_dicts
              for matched in phrase_dict.values())
missing

## Save results for tool

In [None]:
# Columns written to the export; `Q3_x_edit` (the cleaned comment) is exported
# under the name `Q3_x`.
# NOTE(review): the list contains 'Name', 'Email' and 'IP Address'. Confirm
# that exporting these columns is intended before sharing the file.
EXPORT_COLUMNS = ['primary_key', 'intents_clientID', 'visitId', 'fullVisitorId',
       'hits_pagePath', 'Started', 'Ended', 'Q1_x', 'Q2_x', 'Q3_x_edit', 'Q4_x',
       'Q5_x', 'Q6_x', 'Q7_x', 'Q8_x', 'session_id', 'dayofweek', 'isWeekend',
       'hour', 'country', 'country_grouping', 'UK_region', 'UK_metro_area',
       'channelGrouping', 'deviceCategory',
       'total_seconds_in_session_across_days',
       'total_pageviews_in_session_across_days', 'finding_count',
       'updates_and_alerts_count', 'news_count', 'decisions_count',
       'speeches_and_statements_count', 'transactions_count',
       'regulation_count', 'guidance_count', 'business_support_count',
       'policy_count', 'consultations_count', 'research_count',
       'statistics_count', 'transparency_data_count',
       'freedom_of_information_releases_count', 'incidents_count',
       'done_page_flag', 'count_client_error', 'count_server_error',
       'ga_visit_start_timestamp', 'ga_visit_end_timestamp',
       'intents_started_date', 'events_sequence', 'search_terms_sequence',
       'cleaned_search_terms_sequence', 'top_level_taxons_sequence',
       'page_format_sequence', 'Sequence', 'PageSequence', 'flag_for_criteria',
       'full_url_in_session_flag', 'UserID', 'UserNo', 'Name', 'Email',
       'IP Address', 'Unique ID', 'Tracking Link', 'clientID', 'Page Path',
       'Q1_y', 'Q2_y', 'Q3_y', 'Q4_y', 'Q5_y', 'Q6_y', 'Q7_y', 'Q8_y',
       'Started_Date', 'Ended_Date', 'Started_Date_sub_12h', 'phrases', 'user_phrases']

# Chain the rename rather than using `inplace=True` on the column subset,
# which can trigger pandas' SettingWithCopyWarning.
df_sub = df[EXPORT_COLUMNS].rename(columns={'Q3_x_edit':'Q3_x'})
df_sub.to_csv(os.path.join(DATA_DIR, 'uis_20200401_20200409_phrases_user_groups.csv'), index=False)