In [None]:
# !pip3 install wordcloud
# !pip3 install polyglot
# !pip3 install pyicu
# !pip3 install pycld2
# !pip3 install morfessor
# !pip3 install polyglot
# !pip3 install fuzzywuzzy
# !pip3 install seaborn

In [None]:
import os
import pandas as pd
import sys
import numpy as np 
import spacy
import nltk
from nltk import word_tokenize, sent_tokenize, RegexpParser, tree
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process

from tqdm import tqdm_notebook, tqdm
from collections import Counter
import re
import operator
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from ast import literal_eval

import string 

## https://markhneedham.com/blog/2017/11/28/python-polyglot-modulenotfounderror-no-module-named-icu/
from polyglot.detect import Detector

tqdm.pandas()

## Load in the survey data

In [None]:
# Directory that holds the input CSV files, relative to this notebook.
DATA_DIR = "../../data"

# Read the joined survey export into a DataFrame.
survey_filename = os.path.join(DATA_DIR, "joined_uis_all_of_march.csv")
df = pd.read_csv(survey_filename)

### Some row duplication present

In [None]:
# Compare the frame's shape with the number of distinct client IDs, primary keys and session IDs.
print(df.shape, df.intents_clientID.nunique(), df.primary_key.nunique(), df.session_id.nunique())

print(df.columns)

# Shape of the subset of rows whose session_id is missing.
df[df.session_id.isna()].shape

In [None]:
# Keep one row per primary_key and renumber the index from zero.
df = df.drop_duplicates("primary_key").reset_index(drop=True)

## Functions for sentence tokenization, part of speech tagging, PII placeholder stripping, ngram computation

In [None]:
# spaCy pipeline used by part_of_speech_tag for token text, POS tag and lemma.
nlp = spacy.load("en_core_web_sm")

# Names of the PII placeholders; in the comments they appear wrapped in square brackets.
pii_filtered = ["DATE_OF_BIRTH", "EMAIL_ADDRESS", "PASSPORT", "PERSON_NAME", 
                "PHONE_NUMBER", "STREET_ADDRESS", "UK_NATIONAL_INSURANCE_NUMBER", "UK_PASSPORT"]
# Alternation of every bracketed placeholder, with the brackets escaped for the regex engine.
pii_regex = "|".join([f"\\[{p}\\]" for p in pii_filtered])
pii_regex

In [None]:
# English stop words from NLTK.
stop_words = list(stopwords.words('english'))
# ASCII punctuation plus the typographic apostrophe.
punctuation = list(string.punctuation) + ['’']
# remove_stopwords_punctation lower-cases each token before the lookup, so the
# upper-case placeholder names alone could never match. Lower-case copies are
# added; the original entries are kept for any other consumer of this list.
token_blacklist = stop_words + punctuation + pii_filtered + [p.lower() for p in pii_filtered]

def split_sentences(comment):
    """Split ``comment`` into a list of sentence strings using NLTK's sentence tokenizer."""
    sentences = sent_tokenize(comment)
    return sentences

def remove_stopwords_punctation(sentences):
    """Drop blacklisted tokens from tagged sentences.

    ``sentences`` is a list of sentences, each a list of indexable tokens whose
    first three fields are kept. A token is dropped when its lower-cased first
    field is found in the module-level ``token_blacklist``.

    Returns a new list of sentences made of 3-tuples.
    """
    filtered = []
    for sent in sentences:
        survivors = []
        for t in sent:
            if t[0].lower() in token_blacklist:
                continue
            survivors.append((t[0], t[1], t[2]))
        filtered.append(survivors)
    return filtered

def replace_pii_regex(text):
    """Return ``text`` with every match of the module-level ``pii_regex`` removed."""
    placeholder = re.compile(pii_regex)
    return placeholder.sub("", text)

def compute_ngrams(processed_comment, n, stemming=False, filtering=False):
    """Build lower-cased n-grams from a tagged comment.

    Args:
        processed_comment: list of sentences, each a list of
            ``(text, tag, lemma)`` tuples as produced by ``part_of_speech_tag``.
        n: number of tokens per n-gram.
        stemming: when true, use the third tuple field (the lemma) instead of
            the first (the surface text).
        filtering: when true, run ``remove_stopwords_punctation`` first.

    Returns:
        A list of space-joined, lower-cased n-grams. Sentences are flattened
        into one token stream, so an n-gram can span a sentence boundary.
    """
    if filtering:
        processed_comment = remove_stopwords_punctation(processed_comment)

    field = 0
    if stemming:
        field = 2

    flat = []
    for sentence in processed_comment:
        for tok in sentence:
            flat.append(tok[field])

    # n copies of the stream, each shifted one token further; zipping them yields the windows.
    shifted = [flat[offset:] for offset in range(n)]
    return [" ".join(gram).lower() for gram in zip(*shifted)]


def part_of_speech_tag(comment):
    """Tag a comment sentence by sentence.

    The comment is split with ``split_sentences`` and each sentence is run
    through the spaCy pipeline ``nlp``.

    Returns a list with one entry per sentence; each entry is a list of
    ``(token.text, token.tag_, token.lemma_)`` tuples.
    """
    tagged = []
    for sentence in split_sentences(comment):
        doc = nlp(sentence)
        tagged.append([(tok.text, tok.tag_, tok.lemma_) for tok in doc])
    return tagged

In [None]:
# Smoke test: filtered bigrams for a two-sentence string that contains a typographic apostrophe.
t = "This is a test with punctuation’. this is another sentence."
processed_t = part_of_speech_tag(t)
compute_ngrams(processed_t, 2, stemming=False, filtering=True)

## Detect feedback language
Some responses contain foreign-language spam. Detect comments that are not (primarily) English and drop them.

In [None]:
def detect_language(text):
    """Return the code of the most confident language polyglot detects in ``text``.

    Args:
        text: the comment to classify. The literal ``"-"`` is passed through
            untouched and no detection is attempted for it.

    Returns:
        ``"-"`` for the ``"-"`` input, otherwise the ``code`` of the detected
        language with the highest ``confidence``. If detection fails for any
        reason, the string ``"[ERROR] <text>"`` is returned instead of raising.
    """
    if text == "-":
        return "-"
    try:
        # Keyed by confidence: when two languages tie, the later one wins.
        langs = {language.confidence: language.code
                 for language in Detector(text, quiet=True).languages}
        return langs[max(langs.keys())]
    except Exception:
        # Best effort by design, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit stop a long progress_map run.
        return f"[ERROR] {text}"

In [None]:
# Comments at or above this many characters (after PII stripping) are dropped.
MAX_COMMENT_LENGTH = 4000

# Strip the bracketed PII placeholders from the free-text answer.
df['Q3_pii_removed'] = df['Q3'].progress_map(replace_pii_regex)
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original (SettingWithCopyWarning).
df = df[df.Q3_pii_removed.str.len() < MAX_COMMENT_LENGTH].copy()
df['language'] = df['Q3_pii_removed'].progress_map(detect_language)

In [None]:
# Map each detected language value to the number of comments carrying it.
lang_dist = df['language'].value_counts().to_dict()
print(f"Number of unique languages: {len(lang_dist)}")
# Percentage of comments detected as "en", and percentage whose value is the "-" pass-through.
print(f"English: {round((lang_dist['en']*100)/sum(lang_dist.values()), 2)}%")
print(f"-: {round((lang_dist['-']*100)/sum(lang_dist.values()), 2)}%")
# First ten (language, count) pairs in value_counts order.
list(lang_dist.items())[0:10]

In [None]:
# Flag comments to keep as English: "en" plus the "-" pass-through value.
# NOTE(review): "un" and "sco" are presumably polyglot's unknown and Scots codes — confirm.
df['is_en'] = df['language'].isin(["en", "un", "-", "sco"])

### Part of speech tag
Run the tagging once and save the result; on later runs, just load the saved DataFrame.

In [None]:
# Tagging every comment is slow, so the tagged frame is cached as a CSV. It is
# computed and written only when the cache file is missing; the frame is then
# always reloaded from the cache, so a fresh "Run All" works without edits.
pos_tagged_filename = os.path.join(DATA_DIR, "joined_uis_all_of_march_lang_pos.csv")

if not os.path.exists(pos_tagged_filename):
    # Non-English comments get an empty tag list instead of being tagged.
    df['pos_tag'] = df[['Q3_pii_removed', 'is_en']].progress_apply(
        lambda row: part_of_speech_tag(row['Q3_pii_removed']) if row['is_en'] else [],
        axis=1)
    df['lemmas'] = df['pos_tag'].progress_map(lambda x: [token[2] for sent in x for token in sent])
    df['words'] = df['pos_tag'].progress_map(lambda x: [token[0] for sent in x for token in sent])
    df.to_csv(pos_tagged_filename, index=False)

df = pd.read_csv(pos_tagged_filename)
# The CSV stores pos_tag as the text of a nested list; parse it back into Python objects.
df['pos_tag'] = df['pos_tag'].map(literal_eval)

## Extract noun and verb phrases

In [None]:
# Show the (text, tag, lemma) tuple of the first token in the first sentence of the first comment.
part_of_speech_tag(df.Q3_pii_removed.iloc[0])[0][0]

In [None]:
# Chunk grammar handed to nltk's RegexpParser. Each label (cc, pronoun,
# noun_verb, verb, prep_noun, noun, adjective) is followed by one or more
# {...} tag patterns written over the POS tags of the tagged tokens.
# Kept as a raw string so "\$" reaches the parser unchanged.
grammar = r"""
    cc:
    {<CC>}
    pronoun:
    {<DT><IN><PRP>}
    {<IN>?<PRP>}
    noun_verb:
    {<IN>?<JJ.*>*<NN.*>+<HYPH>?<VBD|VBN|VBG><NN.*>*}
    verb:
    {<IN>*<VB.*><IN>}
    {<WRB><TO><VB.*>}
    {<TO><VB.*><IN|RP>?<WRB>}
    {<VB.*><TO><VB.*><RB>?<TO>?}
    {<IN><EX><VB.*>}
    {<RB><TO><VB.*>+}
    {<TO>?<VB.*><IN|WDT|WP>}
    {<WP><VB.*>}
    {<VB.*><RB><VB.*>*}
    {<WDT>?<TO>?<MD|VB.*>?<RB>?<TO|IN>?<V.*>+<CC>?<V.*>*<IN|RP>?<IN>?}
    {<MD><RB>*<VB.*>*}
    {<VB.*><IN|TO><IN>?}
    {<TO><VB.*><IN>+}
    prep_noun:
    {<IN><NN.*><HYPH>?<NN.*>*}
    {<IN>+<PRP\$>?<NN><CD>?}
    {<IN><CD><.*>}
    {<RP>?<IN>+<JJ.*>*<NN.*>+}
    {<IN><DT><NN.*><JJ.*>*<NN><HYPH>?<NN>}
    {<IN><NN.*>(<HYPH>?<NN.*>)?}
    {<JJ.*>*<IN><DT>?<NN.*>+<CD>?<NN.*>?}
    {<IN>+<DT>*<JJ>?<CD>?<NN.*>+<CD>?<NN.*>?}
    noun:
    {<CD><NN.*>}
    {<DT><NN.*>}
    {<JJ.*><NN.*>*<CD>}
    {<NN.*><CD><JJ.*>?}
    {<JJ.*|NN.*><IN|TO><PRP>}
    {<CD><NN.*><JJ.*>}
    {<WRB><RB><JJ.*>*<NN.*>*}
    {<DT><JJ.*>*<NN.*>+}
    {<NN.*><CD>?<JJ.*>*<NN.*>*}
    {<IN>+<CD>*<POS>*<IN>*<NN.*>}
    {<IN><PRP\$>?<JJ.*>*<NN.*>}
    {<NN.*><HYPH><NN.*>}
    {<DT>?<CD>?<JJ.*>?<CC>?<JJ.*>?<NN.*>+}
    {<NN.*><HYPH>?<NN.*|JJ.*|VB.*>*}
    {(<NN|NNS>|<NNP|NNPS>)<NNP|NN|NNS|NNPS>+}
    {(<NN|NNS>+|<NNP|NNPS>+)<IN|CC>(<PRP\$|DT><NN|NNS>+|<NNP|NNPS>+)}
    {<JJ|RB|CD>*<NNP|NN|NNS|NNPS>+}
    {<NNP|NN|NNS|NNPS>+}
    adjective:
    {<RB>*<JJ.*><CD>?}
    """

class Chunk:
    """A labelled run of tagged tokens produced by the chunk parser.

    Attributes:
        label: chunk label, e.g. ``"noun"`` or ``"verb"``.
        tokens: list of ``(word, pos, lemma)`` tuples.
        indices: positions of the tokens within their sentence.
        text: the words joined by single spaces.
        lemma: the lemmas joined by single spaces.
        important_word: words whose POS tag contains NN, VB, JJ or CD, space-joined.
        important_lemma: lemmas of those same tokens, space-joined.

    The derived strings are computed once in ``__init__``. They used to be
    produced by methods with the same names as the attributes, which the
    attributes then shadowed on every instance; a private helper is used instead.
    """

    # POS tags whose words are looked up in WordNet (nouns and verbs).
    _TAGABLE_POS = re.compile(r"(NN)|(VB)")
    # POS tags regarded as content-bearing for the ``important_*`` strings.
    _IMPORTANT_POS = re.compile(r"(NN)|(VB)|(JJ)|(CD)")

    def __init__(self, label, tokens, indices):
        self.label = label
        self.tokens = tokens
        self.indices = indices
        self.text = self._join(0)
        self.lemma = self._join(2)
        self.important_lemma = self._join(2, self._IMPORTANT_POS)
        self.important_word = self._join(0, self._IMPORTANT_POS)

    def _join(self, field, pos_filter=None):
        """Space-join tuple field ``field`` of every token whose POS matches ``pos_filter``.

        ``pos_filter`` is a compiled regex searched in the POS tag; ``None`` keeps all tokens.
        """
        return " ".join(token[field] for token in self.tokens
                        if pos_filter is None or pos_filter.search(token[1]))

    def tagable_words(self):
        """Return ``(word, pos)`` pairs for tokens whose POS tag contains NN or VB."""
        return [(w, pos) for w, pos, _ in self.tokens if self._TAGABLE_POS.search(pos)]
    
# Chunk parser built from the grammar string above; used by chunk_text.
parser = RegexpParser(grammar)

def chunk_text(tagged):
    """Parse one tagged sentence and return its chunks in order.

    ``tagged`` is passed to the module-level ``parser``. Every subtree in the
    result becomes a ``Chunk`` carrying the subtree's label, its leaves and
    the sentence positions those leaves occupy. Elements that are not
    subtrees are skipped but still advance the position by one.
    """
    parsed = parser.parse(tagged)
    found = []
    cursor = 0
    for node in parsed:
        if type(node) is not tree.Tree:
            cursor += 1
            continue
        leaves = node.leaves()
        span_end = cursor + len(leaves)
        found.append(Chunk(node.label(), leaves, list(range(cursor, span_end))))
        cursor = span_end
    return found

def extract_phrase(sentences, merge_inplace=False):
    """Chunk every tagged sentence of a comment.

    Args:
        sentences: list of tagged sentences, each passed to ``chunk_text``.
        merge_inplace: when true, each sentence's chunks are additionally
            passed through ``merge_adjacent_chunks``.

    Returns:
        A list with one list of chunks per sentence.
    """
    per_sentence = [chunk_text(tagged) for tagged in sentences]
    if not merge_inplace:
        return per_sentence
    return [merge_adjacent_chunks(found) for found in per_sentence]

def merge_adjacent_chunks(chunks):
    """Fuse consecutive chunks that share a label into a single chunk.

    A chunk is merged into its predecessor when both carry the same label,
    unless that label is ``"prep_noun"``, which is never merged. Merged chunks
    concatenate the tokens and indices of their parts. "Consecutive" refers to
    position in ``chunks``; the token indices are not compared.
    """
    result = []
    last_label = ""
    for current in chunks:
        repeats_label = current.label == last_label
        if repeats_label and current.label != "prep_noun":
            tail = result[-1]
            result[-1] = Chunk(current.label,
                               tail.tokens + current.tokens,
                               tail.indices + current.indices)
        else:
            result.append(current)
        last_label = current.label
    return result

def compute_combinations(sentences, n):
    """Return every window of ``n`` consecutive chunks, sentence by sentence.

    Windows never cross sentence boundaries; sentences with fewer than ``n``
    chunks contribute nothing.
    """
    windows = []
    for chunks in sentences:
        window_count = len(chunks) - (n - 1)
        for start in range(window_count):
            windows.append(chunks[start:start + n])
    return windows
  

In [None]:
def extract_wordnet_cats(chunk):
    """Return ``(word, tag, category)`` for each tagable word of ``chunk``.

    The words come from ``chunk.tagable_words()`` and the category from
    ``wordnet_category(word, tag)``.
    """
    categorised = []
    for word, tag in chunk.tagable_words():
        categorised.append((word, tag, wordnet_category(word, tag)))
    return categorised

## Compute linguistic pattern combinations

In [None]:
def compute_wordnet_tags_per_chunk(sentences, wordnet_tags):
    """Add the chunks' tagable words to ``wordnet_tags`` in place.

    Args:
        sentences: list of sentences, each a list of chunks.
        wordnet_tags: dict mapping a WordNet category to a ``Counter`` of
            lower-cased words; it is updated and nothing is returned.
    """
    for sent in sentences:
        for chunk in sent:
            for word, _pos, category in extract_wordnet_cats(chunk):
                per_category = wordnet_tags.setdefault(category, Counter())
                per_category[word.lower()] += 1

def compute_linguistic_patterns(df_series, n):
    """Count label patterns of ``n`` consecutive chunks over a series of tagged comments.

    Args:
        df_series: pandas Series whose values are tagged comments (lists of
            tagged sentences), e.g. the ``pos_tag`` column.
        n: number of consecutive chunks per pattern.

    Returns:
        A tuple ``(pattern_dictionary, wordnet_tags)``:
        ``pattern_dictionary`` maps a tuple of chunk labels to a ``Counter``
        keyed by the tuple of lower-cased chunk texts; ``wordnet_tags`` maps a
        WordNet category to a ``Counter`` of lower-cased words.
    """
    patterns = {}
    tags_by_category = {}

    for tagged_sentences in tqdm_notebook(df_series.values):
        merged = extract_phrase(tagged_sentences, True)
        compute_wordnet_tags_per_chunk(merged, tags_by_category)

        for window in compute_combinations(merged, n):
            labels = tuple([c.label for c in window])
            texts = tuple([c.text.lower() for c in window])
            patterns.setdefault(labels, Counter())[texts] += 1

    return patterns, tags_by_category

## Wordnet categorization of individual arguments

In [None]:
def get_wordnet_pos(pos):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with NN, VB, RB or JJ map to ``wn.NOUN``, ``wn.VERB``,
    ``wn.ADV`` and ``wn.ADJ`` respectively; any other tag yields ``None``.
    """
    prefix_to_attribute = (("NN", "NOUN"), ("VB", "VERB"), ("RB", "ADV"), ("JJ", "ADJ"))
    for prefix, attribute in prefix_to_attribute:
        if pos.startswith(prefix):
            return getattr(wn, attribute)
    return None
    
def wordnet_category(word, pos):
    """Return the WordNet lexicographer category of ``word``.

    Args:
        word: a word or space-separated phrase; for a phrase only the last
            word is looked up.
        pos: POS tag, converted with ``get_wordnet_pos``.

    Returns:
        ``"noun.state"`` for corona/covid mentions, otherwise the ``lexname()``
        of the first synset found, or ``"?"`` when WordNet has no synset.
    """
    # The previous character class "[cCo]" also accepted a bare "o" prefix, so
    # words such as "provide" ("o" + "vid") were labelled noun.state.
    if re.search(r"[cC]o(rona|vid)", word):
        return "noun.state"
    head = word.split(" ")[-1]
    syns = wn.synsets(head, get_wordnet_pos(pos))
    if syns:
        return syns[0].lexname()
    return "?"

def bulk_compute_categories(argument_list):
    """Sum argument counts per WordNet noun category.

    Args:
        argument_list: mapping of argument text to its count.

    Returns:
        A ``Counter`` keyed by category. Every argument is looked up as a noun
        with ``wordnet_category``. When the category contains ``"Tops"`` it is
        replaced by ``"noun.<last word of the argument, lower-cased>"``.
    """
    totals = Counter()
    for argument, counts in argument_list.items():
        category = wordnet_category(argument, "NN")
        if "Tops" in category:
            last_word = argument.lower().split(' ')[-1]
            category = f"noun.{last_word}"
        totals[category] += counts
    return totals

In [None]:
def wordnet_categories(word, pos):
    """Return the WordNet lexicographer categories of every synset of ``word``.

    Args:
        word: a word or space-separated phrase; for a phrase only the last
            word is looked up.
        pos: POS tag, converted with ``get_wordnet_pos``.

    Returns:
        A list with the ``lexname()`` of each synset. Two special cases return
        a plain string instead of a list: ``"noun.state"`` for corona/covid
        mentions and ``"?"`` when WordNet has no synset.
    """
    # The previous character class "[cCo]" also accepted a bare "o" prefix, so
    # words such as "provide" ("o" + "vid") were labelled noun.state.
    if re.search(r"[cC]o(rona|vid)", word):
        return "noun.state"
    head = word.split(" ")[-1]
    syns = wn.synsets(head, get_wordnet_pos(pos))
    if syns:
        return [syn.lexname() for syn in syns]
    return "?"

## Regular expression matches for themes of interest.
Focus on tagging verbs and on tagging the second argument of each verb.

In [None]:
# Ordered (pattern, theme) rules for regex_for_theme. The first rule whose
# pattern is found in the lower-cased text wins, so the order is significant:
# negated rules ("no-...") sit before the broader rules they overlap with.
# Every fragment is a raw string so "\s" and "\w" reach the regex engine without
# relying on Python tolerating invalid escape sequences. Patterns are compiled once.
# NOTE(review): in the health-problem rule the alternation binds looser than the
# space after "(heart)", so "issue", "condition", etc. match on their own —
# confirm that is intended.
_THEME_RULES = [(re.compile(pattern), theme) for pattern, theme in (
    (r"self\s?(-|\s)\s?employ", "self-employ"),
    (r"(deliver(y|(ies)|(ed)))|(slot)|(online shopping)", "delivery"),
    (r"vulnerable", "vulnerable"),
    (r"disab((led)|(ility))", "disabled"),
    (r"no symptom", "no-symptoms"),
    (r"((corona)?(virus))|(covid)", "covid-mention"),
    (r"((health)|(heart) (problem)|(issue)|(condition)|(attack)|(disease)|(failure))|( ms)|"
     r"(copd)|(asthma)|((type)\s?[12])|(diabet)|"
     r"(cancer)|(dementia)|(stroke)|(illness)|(a type$)|(cough)|(leukaemia)", "health-problem"),
    (r"symptom", "symptoms"),
    (r"((at)?(\s(very\s)?high)?\srisk)|(risk list)", "at-risk"),
    (r"((((a|'|’)m( (in|at)( my)?)?)|aged) (over(-|\s))?"
     r"(([789][0-9]($|s|\s))|(old)|(elderly)))|((over(-|\s))?[789][0-9] y)", "elderly"),
    (r"(carer)|(care home)", "carer"),
    (r"(key\s?(\s|-)?\s?worker)|(nurse($|\s))|(essential worker)", "key-worker"),
    (r"can\s?(no|'|’)?t work", "cannot-work"),
    (r"no ((work)|(income)|(money)|(wage)|(salar))", "no-income"),
    (r"(furlough)|(fired)|(80 %)", "laid-off"),
    (r"(((can\s?(no|'|’)?t (get|buy|(shop for)))|"
     r"((do not)?ha(ve|d) )(no|any|(not enough))?) (food|groceries))", "cannot-get-food"),
    (r"can\s?(no|'|’)?t get ((med)|(prescription))", "cannot-get-med"),
    (r"(^med)|(prescription)", "get-med"),
    (r"(travel(\s(advi[sc]e)|(status))?)|(flight)|(destination)", "travel"),
    (r"(no\s)(\w*\s)?((info)|(clarification)|(advi[sc]e)|((contact )?((details)|(number)))|"
     r"(answer)|(update)|(clarity)|(guid(e|(ance)))|(list)|(definition)|"
     r"(address)|(link)|(form)|(contact)|(mention))", "no-information"),
    (r"(info)|(clarification)|(advi[sc]e)|((contact )?((details)|(number)))|"
     r"(answer)|(update)|(clarity)|(guid(e|(ance)))|(list)|(definition)|"
     r"(address)|(link)|(form)|(contact)", "information"),
    (r"(no)\s((letter)|(t(e)?xt)|(message)|(e(\s|(\s?-\s?))?mail)|"
     r"(alert)|(notice)|(communication))", "no-correspondence"),
    (r"(letter)|(t(e)?xt)|(message)|(e(\s|(\s?-\s?))?mail)|(alert)|(notice)", "correspondence"),
    (r"(no\s?((family)|(one)))|(nothing)|(nobody)", "no-one"),
    (r"no ((support)|(aid)|(help)|(assistance)|(access)|(priority))", "no-support"),
    (r"(support)|(aid)|(help)|(assistance)|(access)|(priority)", "support"),
    (r"(child)|((^|\s)son)|(daughter)", "child"),
    (r"(parent)|(husband)|(wife)|(partner)|"
     r"((mo|fa)ther)|(famil(y|(ies)))|(m[uo]m)|(dad)", "family"),
    (r"(rule)|(restriction)|(measure)|(rights)", "rules"),
    (r"((no)|(a(ny)?)) ((way)|(option)|(choice)|(means)|(idea))", "uncertainty"),
    (r"work ((for)|(in)|(at)|(on))", "work"),
    (r"((self\s|-)?isolat((ion)|(e)|(ing)))|(lock\s?(\s|-)?\s?down)", "self-isolation"),
    (r"(driv(ing|ers)\s)?licen[sc]e", "license"),
    (r"passport", "passport"),
    (r"pension", "pension"),
    (r"(^|\s)h((ome)|(ouse))", "home-mention"),
    (r"(employ)|(work)|(job)|(business)|(company)", "work-mention"),
    (r"(benefit)|(universal credit)|(eligible)|(esa)|(ssp)|(pip)|(allowance)", "benefit"),
    (r"(school)|(student)", "school"),
    (r"(food)|(supplies)|(shopping)|(groceries)", "goods"),
    (r"(money)|(grant)|(fund)|(relief)", "given-money"),
    (r"(bill)|(tax)|(mortgage)|(rent)|(loan)|(debt)|(fine)|(fee)|(insurance)", "bills-to-pay"),
    (r"scheme", "scheme"),
    (r"(^|\s)visa($|\s)", "visa"),
    (r"(data)|(cases)|(situation)|(stat(istic)?s?$)|(status)|(news)|(progress)", "data"),
    (r"dea((th)|d)", "death"),
)]


def regex_for_theme(text):
    """Return the theme label of the first rule that matches ``text``.

    Args:
        text: phrase or comment to classify; it is lower-cased before matching.

    Returns:
        The theme of the first matching rule in ``_THEME_RULES``, or
        ``"unknown"`` when no rule matches.
    """
    lowered = text.lower()
    for pattern, theme in _THEME_RULES:
        if pattern.search(lowered):
            return theme
    return "unknown"


In [None]:
# Quick check of the theme assigned to a single word.
regex_for_theme("stats")

In [None]:
# Ordered (pattern, group) rules for regex_group_verbs; the first match wins.
# Every fragment is a raw string so "\s" reaches the regex engine without
# relying on Python tolerating invalid escape sequences. Patterns are compiled once.
_VERB_GROUP_RULES = [(re.compile(pattern), group) for pattern, group in (
    (r"(f(i|(ou))nd)|(look)|(search)|(clarify)|(ask)|(read)|([ei]nquire)|"
     r"(obtain)|(seek)|(know)|((^|\s)see($|\s))|(understand)", "find-smthg"),
    (r"(access)|(check)|(complete)|(cancel)|(book)|(confirm)", "access-smthg"),
    (r"(get)|(take)|(claim)|(receive)|(sent)|(collect)", "acquire-smthg"),
    (r"(renew)|(change)|(update)|(inform$)|(notify)", "change-smthg"),
    (r"(appl(y|(ied)))|(register)|(qualify)", "apply-smthg"),
    (r"pa(y|(id)|(yed))", "pay-smthg"),
    (r"(contact)|(report)", "contact-smthg"),
    (r"(work)|(employ)", "work-smwhr"),
    (r"(need)|(want)|(require)|(request)|(would like)|(order)", "need-smthg"),
    (r"(have)|((a|'|’|^)m($|\s))|(feel($|\s))", "my-situation"),
    (r"(has)|(((a|we)|'|’|^)re($|\s))", "others-situation"),
    (r"(had)|((i|'|’|^)s($|\s))|(was)", "unclear-situation"),
    (r"travel", "travel"),
    (r"(liv(e|(ing)))|(stay)", "living"),
    (r"(do)|(make)", "do-smthng"),
    (r"go($|\s)", "go-smwhr"),
    (r"(give)|(provide)", "give-smthng"),
    (r"(help)|(protect)|(support)", "help"),
)]


def regex_group_verbs(verb):
    """Return the verb group of the first rule that matches ``verb``.

    Args:
        verb: verb phrase to classify. It is matched as given; no case
            folding is applied here.

    Returns:
        The group of the first matching rule in ``_VERB_GROUP_RULES``, or
        ``"unknown"`` when no rule matches.
    """
    for pattern, group in _VERB_GROUP_RULES:
        if pattern.search(verb):
            return group
    return "unknown"

## Test run code.

In [None]:
# Worked example: the first comment that contains "shopping advice".
# (An earlier `example = df.iloc[7]` was overwritten immediately and has been removed.)
example = df[df.Q3.str.contains("shopping advice")].iloc[0]
print(f"Thematic category for entire comment: {regex_for_theme(example.Q3)}")

print(example.Q3)
print()
print(example.pos_tag)
print()
# For each sentence, list its merged chunks and then every pair of consecutive chunks.
for sent in extract_phrase(example.pos_tag, True):
    for chunk in sent:
        print("{0:10} {1:20} {2} {3}".format(chunk.label.upper(), chunk.text, chunk.indices, 
                                             extract_wordnet_cats(chunk)))
    print()
    for combo in compute_combinations([sent], 2):
        print(f"{combo[0].text}, {combo[1].text}")
        
#     for combo in compute_combinations([sent], 3):
#         print(f"{combo[0].text}, {combo[1].text}, {combo[2].text}")
    print("=====")

## Inspect arg1-arg2 grammatical patterns

In [None]:
# Count patterns of two consecutive chunks, plus word frequencies per WordNet category,
# over every tagged comment.
pattern_d, wordnet_tags = compute_linguistic_patterns(df.pos_tag, 2)
pattern_d.keys(), len(pattern_d)

In [None]:
# WordNet categories ordered by total word count; for each, its 20 most frequent words.
for key,value in sorted(wordnet_tags.items(), 
                        key = lambda x: sum(x[1].values()), 
                        reverse=True):
    print(key, sum(value.values()))
    for i,(k,v) in enumerate(value.most_common(20),1):
        print(f"{i}. {k}: {v}")
    print()

In [None]:
# for i, (k,v) in enumerate(sorted(pattern_d.items(), 
#                                  key = lambda x: sum(x[1].values()), 
#                                  reverse=True), 
#                           1):
#     print(f"{i}. {k}: {sum(v.values())}")

In [None]:
# Label patterns ordered by number of distinct text realisations; for each,
# its 50 most frequent realisations with every chunk shown in square brackets.
for i, (k,v) in enumerate(sorted(pattern_d.items(), 
                                 key = lambda x: len(x[1]), 
                                 reverse=True), 
                          1):
    print(f"{i}. {' - '.join([ks.upper() for ks in k])} : {len(v)}\n-------------")
    for j, (kk,vv) in enumerate(pattern_d[k].most_common(50), 1):
        print(f"{j}. \'{' '.join([f'[{kks}]' for kks in kk])}\' : {vv}")
    print()
    print("=======\n")

In [None]:
# Chunk-label pairs singled out for closer inspection in the next cell.
patterns_of_interest = [('verb', 'noun'),
('noun', 'prep_noun'),
('prep_noun', 'prep_noun'),
('verb', 'noun_verb'),
('verb', 'prep_noun'),
('noun', 'noun_verb'),
('noun_verb', 'prep_noun')
]

In [None]:
# For each pattern of interest, look at its 10 most frequent realisations and
# print those whose joined text is themed "no-income".
# counter = 0
for i, (k,v) in enumerate(sorted(pattern_d.items(), 
                                 key = lambda x: len(x[1]), 
                                 reverse=True), 
                          1):
    if k in patterns_of_interest:
#         print(f"{i}. {' - '.join([ks.upper() for ks in k])} : {len(v)}\n-------------")
        for j, (kk,vv) in enumerate(pattern_d[k].most_common(10), 1):
            if regex_for_theme(" ".join(kk)) == "no-income":
                print(regex_for_theme(" ".join(kk)))
                print(f"{j}. \'{' '.join([f'[{kks}]' for kks in kk])}\' : {vv}")
#                 counter+=vv
        # The separator is printed for every pattern of interest, matched or not.
        print()
        print("=======\n")
# counter

## Compute `arg1` - `arg2` co-occurrence db - couples

In [None]:
# pattern_db[(label1, label2)][arg1][arg2] counts how often the chunk text arg2
# directly follows the chunk text arg1 (both lower-cased) for that label pair.
pattern_db = {}

for vals in tqdm_notebook(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        arg1 = combo[0].text.lower()
        arg2 = combo[1].text.lower()
#         arg2 = " ".join([w.lower() for w,_ in combo[1].tagable_words()])
        
        if key not in pattern_db.keys():
            pattern_db[key] = {}
        if arg1 not in pattern_db[key].keys():
            pattern_db[key][arg1] = Counter()
            
        pattern_db[key][arg1][arg2]+=1

print(f"There are {len(pattern_db)} possible grammatical combos.")
# The 15 label pairs with the most distinct first arguments.
for i, (k,v) in enumerate(sorted(pattern_db.items(),
                         key = lambda x: len(x[1].values()),
                         reverse= True)[0:15],
                                 1):
    print(k, len(v))

In [None]:
# The 100 verb chunks that most often precede a noun chunk.
top_100_verbs = [key.lower() for key, value in sorted(pattern_db[('verb', 'noun')].items(), 
                         key = lambda x: sum(x[1].values()), 
                         reverse= True)[0:100]]
# List those that regex_group_verbs cannot place in any group.
counter = 0
for verb in top_100_verbs:
    if regex_group_verbs(verb)== "unknown":
        counter+=1
        print(counter, verb)
        

In [None]:
def update_argument_theme_dictionary(dict_new, dict_old):
    """Add the per-theme counts of ``dict_new`` into ``dict_old`` in place.

    Args:
        dict_new: mapping of theme to a mapping of value to count.
        dict_old: mapping of theme to ``Counter``; themes missing from it are
            created. It is modified in place and nothing is returned.
    """
    for theme, counts in dict_new.items():
        accumulated = dict_old.setdefault(theme, Counter())
        for val, count in counts.items():
            accumulated[val] += count

In [None]:
# NOTE(review): `key` is assigned here but not read anywhere in this cell.
key = ('verb', 'prep_noun')
# verbs = []patterns_of_interest
# verb_themes[VERB_GROUP][verb text] -> number of times the verb precedes an argument.
verb_themes = {}
# verb_argument_themes[VERB_GROUP][ARGUMENT_THEME][argument text] -> count.
verb_argument_themes = {}

for pattern in [('verb', 'noun'), ('verb', 'prep_noun')]:
    print(f"There are {len(pattern_db[pattern])} {pattern[0]}s, accompanied by {pattern[1]}s.")
    # Here arg1 is the verb text and arg2 is the Counter of the arguments that follow it.
    for i, (arg1, arg2) in enumerate(sorted(pattern_db[pattern].items(),
                             key = lambda x: sum(x[1].values()),
                             reverse= True),
                                     1):
        verb_theme = f"{regex_group_verbs(arg1)}".upper()

        if verb_theme not in verb_themes.keys():
            verb_themes[verb_theme] = Counter()
        
        verb_themes[verb_theme][arg1] += sum(arg2.values())  
        
#         print(f"{i}. {arg1} :: {sum(arg2.values())} [{verb_theme}] \n-----------")
        
        if verb_theme not in verb_argument_themes.keys():
            verb_argument_themes[verb_theme] = {}

        # Theme every argument of this verb, then fold the result into the verb group's totals.
        local_themes = {}
        
        for j, (arg2_val, arg2_counts) in enumerate(arg2.items(), 1):
            theme = f"{regex_for_theme(arg2_val)}".upper()
            if theme not in local_themes.keys():
                local_themes[theme] = Counter()
            local_themes[theme][arg2_val]+=arg2_counts   
            
        update_argument_theme_dictionary(local_themes, verb_argument_themes[verb_theme])
#             print(f"{j}. {arg2_val} : {arg2_counts} [{regex_for_theme(arg1 +' '+arg2_val)}]")
#         for l, (key,value) in enumerate(sorted(local_themes.items(),
#                              key = lambda x: sum(x[1].values()),
#                              reverse= True)[0:10],
#                                      1):
#             print(f"{l}. {key}:: {sum(value.values())}")
#             for argument, count in value.most_common(5):
#                 print(f"{argument}: {count}")
#             print("")
#         print("=======")
        

In [None]:
# Verb groups ordered by total count: rank, group, total occurrences, number of distinct verbs.
for i, (verb_type, verb_values) in enumerate(sorted(verb_themes.items(),
                             key = lambda x: sum(x[1].values()),
                             reverse= True),
                                     1):
    print(i, verb_type, sum(verb_values.values()), len(verb_values))
    
#     for verb_value, count in verb_values.most_common(10):
#         print(verb_value, count)

In [None]:
# Verb groups ordered by the total count of their arguments; for each group,
# the ten argument themes with the highest counts.
# for i, (key,value) in enumerate(sorted(verb_argument_themes.items(),
#                              key = lambda x: sum(x[1].values()),
#                              reverse= True),
#                                      1):
for i, (key,value) in enumerate(sorted(verb_argument_themes.items(),
                                       key = lambda x: sum([sum(counter.values()) for counter in x[1].values()]),
                                      reverse=True),1):
    print(f"{i}. {key} {sum([sum(counter.values()) for counter in value.values()])} \n======")
#     if k == "UNKNOWN"
    for j, (argument, counter) in enumerate(sorted([(k,v) for k,v in value.items() ],
                                                   key = lambda x: sum(x[1].values()),
                                                   reverse=True
                                                  )[0:10]
                                            , 1):

        print(f"{j}. {argument}: {sum(counter.values())}")
#         for arg_theme, vals in counter.most_common(5):
#             print(f"{arg_theme}: {vals}")
#         print("---")
    print()
            

## Assign themes to actions and things people are talking about 
### Tag response comments (Q3) with appropriate themes

In [None]:
# One list per comment, holding (label pair, "verb argument" text, "verb group - argument theme")
# for every verb chunk directly followed by a noun or prep_noun chunk.
phrase_mentions = []
for vals in tqdm_notebook(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    phrase_mentions.append([])
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        arg1 = combo[0].text.lower()
        arg2 = combo[1].text.lower()
        
        if key in [('verb', 'noun'), ('verb', 'prep_noun')]:
            mention_theme = f"{regex_group_verbs(arg1)} - {regex_for_theme(arg2)}"
            phrase_mentions[-1].append((key, f"{arg1} {arg2}", mention_theme))
            
# Entries are in df.pos_tag order, so they line up with the rows of df.
df['theme_mentions'] = phrase_mentions       

In [None]:
# Keep only the "verb group - argument theme" string of every mention.
df['theme_mentions_list'] = df['theme_mentions'].map(lambda x: [mention for key,_, mention in x])

In [None]:
# Show the mentions and themes extracted for the first comment.
example = df.iloc[0]
print("text:", example.Q3)
print("identified verb-based themed mentions:", example.theme_mentions)
print("identified themes:", example.theme_mentions_list)

In [None]:
# Take the part of 'Ended' before the first space, split it on "/", reverse the
# pieces and concatenate them.
# NOTE(review): presumably turns "dd/mm/yyyy ..." into a sortable "yyyymmdd" — confirm the input format.
df['ended_date'] = df['Ended'].map(lambda x: "".join(x.split(" ")[0].split("/")[::-1]))
# Sorted distinct dates; used as the row index of the plotted frame.
index = sorted(df['ended_date'].unique())

## Build the graph

In [None]:
def fix_index_label(index):
    """Build an axis label: the last two characters of ``index``, a slash, then its first two."""
    leading = index[:2]
    trailing = index[-2:]
    return "{}/{}".format(trailing, leading)

In [None]:
# mention_date[theme][date] -> number of mentions of that theme on that date.
mention_date = {}
for date, mentions in df[['ended_date', 'theme_mentions_list']].values:
    for mention in mentions:
        if mention not in mention_date.keys():
            mention_date[mention] = Counter()
        mention_date[mention][date] += 1

# Rank the themes that do not contain "unknown" by total mentions and keep
# ranks 3 to 10 (the slice [2:10] skips the two most frequent themes).
column_dict = {}
for mention,date_counts in sorted([(k,v) for k,v in mention_date.items() if "unknown" not in k],
                                 key = lambda x: sum(x[1].values()),
                                 reverse=True)[2:10]:
#     if all([exclude not in mention for exclude in ["find-smthg", "information"]]):
#         if "situation" in mention:
    print(mention)
    # One count per date in `index`, 0 where the theme was not mentioned.
    column_dict[mention] = [date_counts.get(date, 0) for date in index]

# Largest single-day count among the selected themes.
maximum_value = max([v for date_counter in [v for k,v in mention_date.items() 
                                            if k in column_dict.keys()] 
                     for v in date_counter.values()])
# Rows are dates, columns are the selected themes.
df2 = pd.DataFrame(column_dict, index=index)
# lines = df2.plot.line(figsize=(20,10))
df2.shape, maximum_value

In [None]:
import random
def generate_dash(n):
    """Return ``n`` dash patterns of the form ``(r, 0.5, 1, 0.5)``.

    ``r`` is drawn with ``random.randint(2, 8)``, so the patterns differ
    between runs unless the ``random`` module is seeded.
    """
    patterns = []
    for _ in range(n):
        first_segment = random.randint(2, 8)
        patterns.append((first_segment, 0.5, 1, 0.5))
    return patterns

In [None]:
# Larger fonts and a wide canvas for the figure.
sns.set_context("talk", font_scale=1.2)
plt.figure(figsize=(30, 15))

# ax = sns.lineplot(index, 375, color='black', alpha=0.5)
# ax.lines[0].set_linestyle("--")
# plt.axvline(20200324)

# One colour and one randomly generated dash pattern per column of df2.
palette = sns.color_palette("colorblind", df2.shape[1])
ax = sns.lineplot(data=df2, palette=palette, dashes = generate_dash(df2.shape[1]))

plt.ylabel(r'# of mentions')
plt.xlabel('')

# plt.yticks(np.arange(0, maximum_value, step=50))

# Rebuild the legend from a label -> handle dict (one entry per label) and place it
# outside the axes on the right.
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), 
           bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

sns.despine(offset=10)

# Tick labels: strip "2020" from each date string and reformat the rest with fix_index_label.
labels = [fix_index_label(ind.replace("2020", "")) for ind in index]
ax.set_xticklabels(labels, rotation=25)
plt.savefig(os.path.join(DATA_DIR, "exclude_info.png"), bbox_inches='tight')
plt.show()

## Fuzzy matching (not very helpful so far) [OLD]

In [None]:
# Collect the distinct verb texts that precede a noun or prep_noun chunk.
verbs = []
for k,v in pattern_db.items():
    if k in [("verb", "noun"), ("verb", "prep_noun")]:
        verbs.extend(list(v.keys()))
verbs = list(set(verbs))

In [None]:
## Compare two example verb phrases with both fuzzy scorers
t1 = "can claim"
t2 = "to claim"
print(fuzz.partial_ratio(t1, t2))
print(fuzz.token_sort_ratio(t1, t2))

# For every verb, find the other verbs that score at least 80 with token_sort_ratio.
# Each iteration copies the full verb list, so the loop is quadratic in the number of verbs.
verb_associations = {}
for verb in tqdm_notebook(sorted(verbs,
                   key = lambda x: len(x.split(" ")))):
    excluded = verbs[:]
    excluded.remove(verb)
#     if verb == "can claim":
#         print(process.extractBests(verb, 
#                                    excluded, 
#                                    scorer=fuzz.token_sort_ratio))
    verb_associations[verb] = process.extractBests(verb, 
                                                   excluded, 
                                                   scorer=fuzz.token_sort_ratio, 
                                                   score_cutoff=80)
# verb_associations
# verb_associations

In [None]:
# Group verbs greedily, shortest key first: each unvisited verb takes its own
# matches plus the matches of those matches; absorbed verbs are skipped later.
merged = {}
skip = []
for k,v in sorted(verb_associations.items(),
                  key = lambda x: len(x[0])):
    if k not in skip:
        merged[k] = v[:]
        for vs in v:
            if vs[0] in verb_associations.keys():
                skip.append(vs[0])
                for vv in verb_associations[vs[0]]:
                    if vv[0] not in [val for val,_ in merged[k]] and vv[0] != k:
                        merged[k].append(vv)
                        skip.append(vv[0])

## if values intersect, merge
# NOTE(review): `intersection` is never filled; overlapping groups are only printed, not merged.
intersection = []
for k,v in merged.items():
    for m,n in merged.items():
        if k!=m:
            if len(set([phrase for phrase,_ in v]).intersection(set([phrase2 for phrase2,_ in n]))):
                print(f"{k}, {m}")
# merged

In [None]:
# For every label pattern, list the pairs whose second text contains "delivery"
# or whose first text contains "deliver"; numbering restarts for each pattern.
i = 1
# NOTE(review): `rowlist` is created but never filled.
rowlist = []
for key in [key for key in pattern_d.keys() ]:
#     print(key)
    for k,v in pattern_d[key].most_common():
        if "delivery" in k[1] or "deliver" in k[0]:
#             rowlist.append({f""})
            print(f"{i}. [{k[0]}] {k[1]} : {v}")
            i+=1
    print()
    i=1

## Compute triples

In [None]:
# compute_linguistic_patterns returns (pattern_dictionary, wordnet_tags). Only the
# pattern dictionary is wanted here; previously the whole tuple was bound to
# pd_triples, so pd_triples.items() raised AttributeError.
pd_triples, triple_wordnet_tags = compute_linguistic_patterns(df.pos_tag, 3)

# Label triples ordered by number of distinct realisations; for each, its 10 most frequent.
for i, (k,v) in enumerate(sorted(pd_triples.items(), 
                                 key = lambda x: len(x[1]), 
                                 reverse=True), 
                          1):
    print(f"{i}. {' - '.join([ks.upper() for ks in k])} : {len(v)}\n-------------")
    for j, (args, counts) in enumerate(pd_triples[k].most_common(10), 1):
        print(f"{j}. {args}: {counts}")
    print()
    print("=======\n")