In [None]:
# !pip3 install wordcloud
# !pip3 install polyglot
# !pip3 install pyicu
# !pip3 install pycld2
# !pip3 install morfessor
# !pip3 install polyglot
# !pip3 install fuzzywuzzy

In [None]:
import os
import pandas as pd
import sys
import numpy as np 
import spacy
import nltk
from nltk import word_tokenize, sent_tokenize, RegexpParser, tree
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from tqdm import tqdm_notebook, tqdm
from collections import Counter
import re
import operator
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import string 

## https://markhneedham.com/blog/2017/11/28/python-polyglot-modulenotfounderror-no-module-named-icu/
from polyglot.detect import Detector

tqdm.pandas()

## Load in the survey data

In [None]:
# Data directory, given relative to the notebook's working directory.
DATA_DIR = "../../data"

# Read the joined survey export into the working DataFrame `df`,
# which the rest of the notebook filters and extends.
survey_filename = os.path.join(DATA_DIR, "joined_uis_all_of_march.csv")
df = pd.read_csv(survey_filename)

In [None]:
### Some row duplication present

In [None]:
# Compare the frame's shape with the number of distinct client ids,
# primary keys and session ids.
print(df.shape, df.intents_clientID.nunique(), df.primary_key.nunique(), df.session_id.nunique())

print(df.columns)

# Shape of the subset of rows that have no session id (last expression, so it is displayed).
df[df.session_id.isna()].shape

In [None]:
# Keep the first row for every primary_key and renumber the index from 0.
# The result is rebound to `df` in one chained expression instead of mutating
# the frame with inplace=True, so the step reads as a single transformation.
df = df.drop_duplicates(subset="primary_key").reset_index(drop=True)

## Functions for sentence tokenization, part of speech tagging, PII placeholder stripping, ngram computation

In [None]:
# spaCy English pipeline; part_of_speech_tag() calls it to get tags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Names of the PII placeholders handled by this notebook.
pii_filtered = ["DATE_OF_BIRTH", "EMAIL_ADDRESS", "PASSPORT", "PERSON_NAME", 
                "PHONE_NUMBER", "STREET_ADDRESS", "UK_NATIONAL_INSURANCE_NUMBER", "UK_PASSPORT"]
# Alternation that matches each name wrapped in literal square brackets,
# e.g. "[EMAIL_ADDRESS]"; used by replace_pii_regex().
pii_regex = "|".join([f"\\[{p}\\]" for p in pii_filtered])
pii_regex

In [None]:
# Tokens that remove_stopwords_punctation() filters out: NLTK's English stop
# words, ASCII punctuation plus the typographic apostrophe, and the PII
# placeholder names.
stop_words = list(stopwords.words('english'))
punctuation = list(string.punctuation) + ['’']
token_blacklist = stop_words + punctuation + pii_filtered

def split_sentences(comment):
    """Split a comment into sentences using NLTK's sentence tokenizer.

    Args:
        comment: Text to split.

    Returns:
        The result of ``nltk.sent_tokenize(comment)``.
    """
    sentences = nltk.sent_tokenize(comment)
    return sentences

def remove_stopwords_punctation(sentences):
    """Drop blacklisted tokens from POS-tagged sentences.

    Args:
        sentences: List of sentences, each a list of tokens whose first three
            fields are ``(text, tag, lemma)``.

    Returns:
        A new list of sentences containing ``(text, tag, lemma)`` tuples for
        every token whose text is not in ``token_blacklist``. The comparison
        is case-insensitive.
    """
    # token_blacklist mixes lower-case stop words with upper-case PII
    # placeholder names. Tokens are lower-cased before the lookup, so the
    # blacklist must be lower-cased too, otherwise the placeholder names can
    # never match. A set also makes each lookup O(1).
    blacklist = {entry.lower() for entry in token_blacklist}
    return [
        [(t[0], t[1], t[2]) for t in sent if t[0].lower() not in blacklist]
        for sent in sentences
    ]

def replace_pii_regex(text):
    """Return ``text`` with every match of ``pii_regex`` removed."""
    placeholder_pattern = re.compile(pii_regex)
    return placeholder_pattern.sub("", text)

def compute_ngrams(processed_comment, n, stemming=False, filtering=False):
    """Build lower-cased n-grams from an already POS-tagged comment.

    Args:
        processed_comment: List of sentences, each a list of
            ``(text, tag, lemma)`` tuples.
        n: Number of tokens per n-gram.
        stemming: When true, use the lemma (field 2) instead of the surface
            text (field 0).
        filtering: When true, run ``remove_stopwords_punctation`` first.

    Returns:
        List of space-joined, lower-cased n-grams. Sentences are flattened
        first, so an n-gram may span a sentence boundary.
    """
    sentences = processed_comment
    if filtering:
        sentences = remove_stopwords_punctation(sentences)

    field = 2 if stemming else 0
    flat_tokens = []
    for sentence in sentences:
        for token in sentence:
            flat_tokens.append(token[field])

    # Zipping n progressively shifted copies of the stream yields every
    # window of n consecutive tokens.
    shifted_streams = [flat_tokens[offset:] for offset in range(n)]
    return [" ".join(window).lower() for window in zip(*shifted_streams)]


def part_of_speech_tag(comment):
    """Tag a comment sentence by sentence.

    Args:
        comment: Raw comment text.

    Returns:
        One list per sentence from ``split_sentences``, each holding a
        ``(token.text, token.tag_, token.lemma_)`` tuple for every token that
        ``nlp`` produces for that sentence.
    """
    tagged_sentences = []
    for sentence in split_sentences(comment):
        doc = nlp(sentence)
        tagged_sentences.append([(token.text, token.tag_, token.lemma_) for token in doc])
    return tagged_sentences

In [None]:
# Smoke test: tag a two-sentence string (the first sentence ends with a
# typographic apostrophe) and list its bigrams after stop-word/punctuation
# filtering.
t = "This is a test with punctuation’. this is another sentence."
processed_t = part_of_speech_tag(t)
compute_ngrams(processed_t, 2, stemming=False, filtering=True)

## Detect feedback language
Some responses contain foreign-language spam. Detect comments that are not (primarily) English so that they can be dropped.

In [None]:
def detect_language(text):
    """Return the code of the most confident language detected in ``text``.

    Args:
        text: Comment text. The literal ``"-"`` is passed through without
            running detection.

    Returns:
        The code of the language with the highest confidence, ``"-"`` when
        ``text`` is ``"-"``, or ``"[ERROR] <text>"`` when detection fails.
    """
    if text == "-":
        return "-"
    try:
        # Keyed by confidence: when two languages share a confidence value,
        # the later one in Detector(...).languages wins.
        langs = {language.confidence: language.code
                 for language in Detector(text, quiet=True).languages}
        return langs[max(langs.keys())]
    except Exception:
        # Detection is best-effort, so failures are reported in-band. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit still stop a long progress_map run.
        return f"[ERROR] {text}"

In [None]:
# Strip the PII placeholders from the free-text answer.
df['Q3_pii_removed'] = df['Q3'].progress_map(replace_pii_regex)
# Keep only comments shorter than 4000 characters. The .copy() makes the
# filtered frame independent of the original, so adding the 'language' column
# below writes to a real frame instead of a slice (no SettingWithCopyWarning).
df = df[(df.Q3_pii_removed.str.len()<4000)].copy()
df['language'] = df['Q3_pii_removed'].progress_map(detect_language)

In [None]:
# Language code -> number of comments, ordered by frequency (value_counts order).
lang_dist = df['language'].value_counts().to_dict()
total_comments = sum(lang_dist.values())
print(f"Number of unique languages: {len(lang_dist)}")
# .get(..., 0) reports 0% instead of raising KeyError when a code is absent.
print(f"English: {round((lang_dist.get('en', 0)*100)/total_comments, 2)}%")
print(f"-: {round((lang_dist.get('-', 0)*100)/total_comments, 2)}%")
list(lang_dist.items())[0:10]

In [None]:
# Flag the rows that are POS-tagged later: besides "en", the codes "un", "-"
# and "sco" are also accepted.
# NOTE(review): presumably "un" is the detector's undetermined result and
# "sco" is Scots — confirm against the polyglot language codes.
df['is_en'] = df['language'].isin(["en", "un", "-", "sco"])

### Part of speech tag

In [None]:
# Tag only the rows flagged as English; every other row gets an empty list.
# Row fields are read by column label: positional access such as row[0] on a
# label-indexed row relies on a deprecated fallback of Series.__getitem__.
df['pos_tag'] = df[['Q3_pii_removed', 'is_en']].progress_apply(
    lambda row: part_of_speech_tag(row['Q3_pii_removed']) if row['is_en'] else [],
    axis=1)
# Flatten the per-sentence token tuples into one lemma list and one word list per comment.
df['lemmas'] = df['pos_tag'].progress_map(lambda x: [token[2] for sent in x for token in sent])

df['words'] = df['pos_tag'].progress_map(lambda x: [token[0] for sent in x for token in sent])

df.to_csv(os.path.join(DATA_DIR, "joined_uis_all_of_march_lang_pos.csv"), index=False)

## Extract noun and verb phrases

In [None]:
# Inspect the tagger output for the first comment (displayed as the cell result).
part_of_speech_tag(df.Q3_pii_removed.iloc[0])

In [None]:
# Chunk grammar handed to nltk.RegexpParser below. Each "label:" line opens a
# group and every following {...} line is a tag pattern chunked under that
# label. The chunk labels used elsewhere in this notebook ("verb", "noun",
# "prep_noun", ...) come from here. The tags are the fine-grained token.tag_
# values that part_of_speech_tag() emits.
grammar = r"""
    cc:
    {<CC>}
    pronoun:
    {<DT><IN><PRP>}
    {<IN>?<PRP>}
    noun_verb:
    {<IN>?<JJ.*>*<NN.*>+<HYPH>?<VBD|VBN|VBG><NN.*>*}
    verb:
    {<TO><VB><IN|RP><WRB>}
    {<IN><EX><VB.*>}
    {<RB><TO><VB.*>+}
    {<TO>?<VB.*><IN|WDT|WP>}
    {<WP><VB.*>}
    {<VB.*><RB><VB.*>*}
    {<WDT>?<TO>?<MD|VB.*>?<RB>?<TO|IN>?<V.*>+<CC>?<V.*>*<IN|RP>?<IN>?}
    {<MD><RB>*<VB.*>*}
    {<VB.*><IN|TO><IN>?}
    {<TO><VB.*><IN>+}
    prep_noun:
    {<IN>+<PRP\$>?<NN><CD>?}
    {<IN><CD><.*>}
    {<RP>?<IN>+<JJ.*>*<NN.*>+}
    {<IN><DT><NN.*><JJ.*>*<NN><HYPH>?<NN>}
    {<IN><NN.*>(<HYPH>?<NN.*>)?}
    {<JJ.*>*<IN><DT>?<NN.*>+<CD>?<NN.*>?}
    {<IN><DT>*<JJ>?<CD>?<NN.*>+<CD>?<NN.*>?}
    noun:
    {<CD><NN.*>}
    {<DT><NN.*>}
    {<JJ.*><NN.*><CD>}
    {<NN.*><CD><JJ.*>?}
    {<JJ.*|NN.*><IN|TO><PRP>}
    {<CD><NN.*><JJ.*>}
    {<RB>*<JJ.*>*<NN.*>*}
    {<DT><JJ.*>*<NN.*>+}
    {<NN.*><CD>?<JJ.*>*<NN.*>*}
    {<IN>+<CD>*<POS>*<IN>*<NN.*>}
    {<IN><PRP\$>?<JJ.*>*<NN.*>}
    {<NN.*><HYPH><NN.*>}
    {<DT>?<CD>?<JJ.*>?<CC>?<JJ.*>?<NN.*>+}
    {<NN.*><HYPH>?<NN.*|JJ.*|VB.*>*}
    {(<NN|NNS>|<NNP|NNPS>)<NNP|NN|NNS|NNPS>+}
    {(<NN|NNS>+|<NNP|NNPS>+)<IN|CC>(<PRP\$|DT><NN|NNS>+|<NNP|NNPS>+)}
    {<JJ|RB|CD>*<NNP|NN|NNS|NNPS>+}
    {<NNP|NN|NNS|NNPS>+}
    adjective:
    {<RB>*<JJ.*><CD>?}
    """

class Chunk:
    """A labelled run of consecutive tokens produced by the chunk parser.

    Attributes:
        label: Grammar label of the chunk (e.g. ``"noun"``, ``"verb"``).
        tokens: List of ``(word, pos_tag, lemma)`` tuples.
        indices: Positions of the tokens within their sentence.
        text: All words joined by single spaces.
        lemma: All lemmas joined by single spaces.
        important_word: Space-joined words whose tag belongs to one of the
            NN / VB / JJ / DT / RB / CD families.
        important_lemma: Same selection as ``important_word``, using lemmas.
    """

    # Tag families that count as "important". Only letters may precede the
    # family name, so "WRB", "WDT" and "PDT" still match as they did with the
    # previous unanchored search, while the bracket tags "-LRB-" / "-RRB-"
    # (which merely contain "RB") are no longer mistaken for adverbs.
    _IMPORTANT_TAG = re.compile(r"^[A-Z]*(?:NN|VB|JJ|DT|RB|CD)")

    def __init__(self, label, tokens, indices):
        self.label = label
        self.tokens = tokens
        self.indices = indices
        # The derived strings are computed once, by a helper whose name
        # differs from the attribute names, so no method is overwritten by an
        # instance attribute.
        self.text = self._join(0)
        self.lemma = self._join(2)
        self.important_lemma = self._join(2, important_only=True)
        self.important_word = self._join(0, important_only=True)

    def _join(self, field, important_only=False):
        """Space-join one field of every token.

        Args:
            field: Index into each token tuple (0 = word, 2 = lemma).
            important_only: When true, keep only tokens whose POS tag matches
                ``_IMPORTANT_TAG``.

        Returns:
            The selected fields joined by single spaces.
        """
        return " ".join(
            token[field]
            for token in self.tokens
            if not important_only or self._IMPORTANT_TAG.search(token[1])
        )
    
# Build the chunk parser once from `grammar`; chunk_text() reuses it for every sentence.
parser = RegexpParser(grammar)

def chunk_text(tagged):
    """Parse one tagged sentence and return its top-level chunks.

    Args:
        tagged: A sentence as a list of ``(word, tag, lemma)`` tuples.

    Returns:
        List of ``Chunk`` objects, one per subtree of the parse result, in
        order. Each chunk's ``indices`` are the positions its leaves occupy in
        the sentence. Tokens outside any chunk are skipped but still advance
        the position counter.
    """
    segments = []
    position = 0
    for node in parser.parse(tagged):
        if type(node) != tree.Tree:
            # A bare token that no rule chunked: occupies one position.
            position += 1
            continue
        leaves = node.leaves()
        end = position + len(leaves)
        segments.append(Chunk(node.label(), leaves, list(range(position, end))))
        position = end
    return segments

def extract_phrase(sentences, merge_inplace=False):
    """Chunk every sentence of a tagged comment.

    Args:
        sentences: List of tagged sentences (see ``chunk_text``).
        merge_inplace: When true, pass each sentence's chunk list through
            ``merge_adjacent_chunks`` before returning it.

    Returns:
        One chunk list per input sentence.
    """
    chunked = [chunk_text(sentence) for sentence in sentences]
    if not merge_inplace:
        return chunked
    return [merge_adjacent_chunks(sentence_chunks) for sentence_chunks in chunked]

def merge_adjacent_chunks(chunks):
    """Fuse runs of neighbouring chunks that carry the same label.

    Args:
        chunks: Chunks of one sentence, in order.

    Returns:
        A new list in which each run of consecutive same-label chunks is
        replaced by one ``Chunk`` holding the concatenated tokens and indices.
        Chunks that are not merged are returned as the original objects.
    """
    merged = []
    last_label = ""
    for current in chunks:
        label = current.label
        if label != last_label:
            merged.append(current)
        else:
            tail = merged[-1]
            merged[-1] = Chunk(label,
                               tail.tokens + current.tokens,
                               tail.indices + current.indices)
        last_label = label
    return merged

def compute_combinations(sentences, n):
    """List every window of ``n`` consecutive chunks within each sentence.

    Args:
        sentences: List of chunk lists, one per sentence.
        n: Window length.

    Returns:
        A flat list of length-``n`` slices. Windows never cross a sentence
        boundary, and sentences shorter than ``n`` contribute nothing.
    """
    windows = []
    for chunks in sentences:
        window_count = len(chunks) - n + 1
        for start in range(window_count):
            windows.append(chunks[start:start + n])
    return windows
  

In [None]:
# Worked example: the first comment whose text contains "age 80".
# (The earlier `example = df.iloc[7]` was overwritten immediately and has been dropped.)
example = df[df.Q3.str.contains("age 80")].iloc[0]

print(example.Q3)
print()
print(example.pos_tag)
print()
# For every sentence, list the merged chunks and then each pair of adjacent chunks.
for sent in extract_phrase(example.pos_tag, True):
    for chunk in sent:
        print("{0:10} {1:20} {2}".format(chunk.label.upper(), chunk.text, chunk.indices))
    print()
    for combo in compute_combinations([sent], 2):
        print(f"{combo[0].text}, {combo[1].text}")
    
#     for combo in compute_combinations([sent], 3):
#         print(f"{combo[0].text}, {combo[1].text}, {combo[2].text}")
    print("=====")

In [None]:
## Compute linguistic pattern combinations

In [None]:
def compute_linguistic_patterns(df_series, n):
    """Count chunk-text tuples for every sequence of ``n`` adjacent chunk labels.

    Args:
        df_series: Series whose values are tagged comments (lists of tagged
            sentences), e.g. ``df.pos_tag``.
        n: Number of adjacent chunks per combination.

    Returns:
        Dict mapping a tuple of ``n`` chunk labels to a ``Counter`` keyed by
        the tuple of the chunks' lower-cased texts.
    """
    pattern_dictionary = {}

    for tagged_comment in tqdm_notebook(df_series.values):
        merged_sentences = extract_phrase(tagged_comment, True)

        for window in compute_combinations(merged_sentences, n):
            labels = tuple(chunk.label for chunk in window)
            texts = tuple(chunk.text.lower() for chunk in window)
            # setdefault creates the Counter the first time a label sequence is seen.
            pattern_dictionary.setdefault(labels, Counter())[texts] += 1

    return pattern_dictionary

In [None]:
# Count every pair of adjacent chunks over all tagged comments.
pattern_d = compute_linguistic_patterns(df.pos_tag, 2)
# NOTE: this bare tuple is not the last statement of the cell, so the notebook does not display it.
pattern_d.keys(), len(pattern_d)

# Report the label pairs ordered by how many distinct text pairs they cover,
# with the 50 most frequent text pairs listed under each.
for i, (k,v) in enumerate(sorted(pattern_d.items(), 
                                 key = lambda x: len(x[1]), 
                                 reverse=True), 
                          1):
    print(f"{i}. {' - '.join([ks.upper() for ks in k])} : {len(v)}\n-------------")
    for j, (kk,vv) in enumerate(pattern_d[k].most_common(50), 1):
        print(f"{j}. \'{' '.join([f'[{kks}]' for kks in kk])}\' : {vv}")
    print()
    print("=======\n")

## Find specific mentions
### Find information

In [None]:
# Quick check of the information-seeking keyword regex: "look" matches inside "looking".
bool(re.search("info|advice|guidance|look|seek|obtain", "i am looking for fo", re.IGNORECASE))

In [None]:
# Compare fuzzywuzzy's full-string and partial similarity scores on two sample phrases.
# NOTE(review): the same cell appears again further down, under the Fuzzywuzzy heading.
str1 = "a delivery slot"
str2 = "online slot"
ratio = fuzz.ratio(str1.lower(), str2.lower())
partial_ratio = fuzz.partial_ratio(str1.lower(), str2.lower())
ratio, partial_ratio

In [None]:
### Self-identifying am/is/are

In [None]:
# Tally what follows a first-person "am": for verb->noun and verb->prep_noun
# pairs, add the pair count to the second chunk's text whenever the verb chunk
# is exactly "am", "'m" or "’m" (the regex also accepts a bare "m" and "a'm").
# NOTE(review): the heading mentions am/is/are, but only "am" forms match here.
identifier = Counter()
for pattern in [("verb", "noun"), ("verb", "prep_noun")]:
    print(pattern)
    for k,v in pattern_d[pattern].items():
        if re.search(r"(^a?((\’|\')?m))$", k[0]):
            identifier[k[1]] +=v
identifier.most_common(100)

In [None]:
### Play around with Fuzzywuzzy for imperfect matches (to be aggregated)

In [None]:
# Compare fuzzywuzzy's full-string and partial similarity scores on two sample phrases.
str1 = "a delivery slot"
str2 = "online slot"
ratio = fuzz.ratio(str1.lower(), str2.lower())
partial_ratio = fuzz.partial_ratio(str1.lower(), str2.lower())
ratio, partial_ratio

In [None]:
# Compare every ordered pair of the 100 most common self-identifications and
# print those whose partial ratio is at least 80, with the token-set ratio in brackets.
# Both (a, b) and (b, a) are visited, so each unordered pair is evaluated twice.
for identity,_ in identifier.most_common(100):
    for identity2,_ in identifier.most_common(100):
        if identity != identity2:
            pr = fuzz.partial_ratio(identity.lower(), identity2.lower())
            if pr >= 80:
                token = fuzz.token_set_ratio(identity, identity2)
                print(f"{identity}, {identity2} : {pr} [{token}]")

In [None]:
### Get help

def has_info_req(text):
    """Classify a list of strings as a help request or an information request.

    Args:
        text: Iterable of strings to scan.

    Returns:
        ``"help"`` if any string contains help/assist/support,
        otherwise ``"info"`` if any string contains
        info/advice/guidance/look/seek/obtain, otherwise ``None``.
        Matching is case-insensitive and works on substrings.
    """
    # "help" is checked over all strings first, so it takes precedence.
    keyword_groups = (
        ("help", "help|assist|support"),
        ("info", "info|advice|guidance|look|seek|obtain"),
    )
    for request_kind, keywords in keyword_groups:
        hits = [bool(re.search(keywords, word, re.IGNORECASE)) for word in text]
        if any(hits):
            return request_kind
    return None

In [None]:
# Count the verb chunks that precede a noun chunk mentioning help/assist/support.
# This must be a Counter: the plain dict used before has no .most_common(), so
# the last line raised AttributeError, and nothing was ever added to it.
info_seeking_verbs = Counter()
for k,v in pattern_d[("verb", "noun")].items():
    # Classify the noun chunk once per pair instead of three times.
    request_kind = has_info_req([k[1]])
    if request_kind == "help":
        print(request_kind, k, v)
        info_seeking_verbs[k[0]] += v
info_seeking_verbs.most_common(20)

In [None]:
# For every label pair, list the chunk pairs that mention delivery: the second
# chunk contains "delivery" or the first contains "deliver". The numbering
# restarts at 1 for each label pair.
# NOTE(review): `rowlist` is initialised but never filled in this cell.
i = 1
rowlist = []
for key in [key for key in pattern_d.keys() ]:
#     print(key)
    for k,v in pattern_d[key].most_common():
        if "delivery" in k[1] or "deliver" in k[0]:
#             rowlist.append({f""})
            print(f"{i}. [{k[0]}] {k[1]} : {v}")
            i+=1
    print()
    i=1

## Compute triples

In [None]:
# Same counting as for pairs, but over three adjacent chunks.
pd_triples = compute_linguistic_patterns(df.pos_tag, 3)

# Report the label triples ordered by how many distinct text triples they
# cover, with the 10 most frequent text triples listed under each.
for i, (k,v) in enumerate(sorted(pd_triples.items(), 
                                 key = lambda x: len(x[1]), 
                                 reverse=True), 
                          1):
    print(f"{i}. {' - '.join([ks.upper() for ks in k])} : {len(v)}\n-------------")
    for j, (args, counts) in enumerate(pd_triples[k].most_common(10), 1):
        print(f"{j}. {args}: {counts}")
    print()
    print("=======\n")

## Compute `arg1` - `arg2` co-occurrence db - couples

In [None]:
# Nested co-occurrence index:
#   (label of chunk 1, label of chunk 2)
#       -> lower-cased important words of chunk 1
#           -> Counter of the lower-cased text of chunk 2
pattern_db = {}

for vals in tqdm_notebook(df.pos_tag.values):
    sents = extract_phrase(vals, True)
    for combo in compute_combinations(sents, 2):
        key = (combo[0].label, combo[1].label)
        arg1 = combo[0].important_word.lower()
        arg2 = combo[1].text.lower()

        # setdefault creates each missing level the first time it is needed.
        pattern_db.setdefault(key, {}).setdefault(arg1, Counter())[arg2] += 1

print(f"There are {len(pattern_db)} possible grammatical combos.")
# Show the 15 label pairs with the most distinct first arguments.
for i, (k,v) in enumerate(
        sorted(pattern_db.items(), key=lambda item: len(item[1]), reverse=True)[:15],
        1):
    print(k, len(v))

## Wordnet categorization of individual arguments

In [None]:
def get_wordnet_pos(pos):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Args:
        pos: POS tag string.

    Returns:
        ``wn.NOUN`` for tags starting with "NN", ``wn.VERB`` for tags starting
        with "VB", and ``None`` for anything else.
    """
    for tag_prefix, constant_name in (("NN", "NOUN"), ("VB", "VERB")):
        if pos.startswith(tag_prefix):
            return getattr(wn, constant_name)
    return None
    
def wordnet_category(word, pos):
    """Look up the WordNet lexicographer category of a word or phrase.

    Args:
        word: A single word or a space-separated phrase; only the last word of
            a phrase is looked up.
        pos: POS tag, translated with ``get_wordnet_pos``.

    Returns:
        ``lexname()`` of the first synset found, or ``"?"`` when WordNet
        returns no synset.
    """
    wn_pos = get_wordnet_pos(pos)
    pieces = word.split(" ")
    head = pieces[-1] if len(pieces) > 1 else word
    synsets = wn.synsets(head, wn_pos)
    if len(synsets) == 0:
        return "?"
    return synsets[0].lexname()

In [None]:
# Spot check on a two-word phrase: only the last word ("one") is looked up.
wordnet_category("no one", "NN")

In [None]:
def bulk_compute_categories(argument_list):
    """Aggregate argument counts by WordNet noun category.

    Args:
        argument_list: Mapping of argument text to its count (e.g. a
            ``Counter``).

    Returns:
        ``Counter`` mapping each category name to the summed counts of the
        arguments in it. When ``wordnet_category`` returns a name containing
        "Tops", the category becomes ``"noun.<last word of the argument,
        lower-cased>"`` instead.
    """
    totals = Counter()
    for argument, counts in argument_list.items():
        category = wordnet_category(argument, "NN")
        if "Tops" in category:
            last_word = argument.lower().split(' ')[-1]
            category = f"noun.{last_word}"
        totals[category] += counts
    return totals

In [None]:
# Report for verb->noun pairs: the 50 verbs with the highest total count. For
# each one, print the 10 most common WordNet categories of its noun arguments,
# then its 20 most frequent noun arguments with their individual category.
key = ('verb', 'noun')
print(f"There are {len(pattern_db[key])} {key[0]}s, accompanied by {key[1]}s.")
for i, (arg1, arg2) in enumerate(sorted(pattern_db[key].items(),
                         key = lambda x: sum(x[1].values()),
                         reverse= True)[:50],
                                 1):
    print(f"{i}. {arg1} :: {sum(arg2.values())} \n-----------")
    print(bulk_compute_categories(arg2).most_common(10))
    for j, (arg2_val, arg2_counts) in enumerate(arg2.most_common(20), 1):
        print(f"{j}. {arg2_val} : {arg2_counts}")
        print(wordnet_category(arg2_val, "NN"))
    print("=======")