In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy

import re
import os

import ast

from tqdm import tqdm

from nltk.corpus import stopwords
import gensim

In [7]:
# Load the spaCy 'en_core_web_lg' pipeline once; it is used as the global
# `nlp` parser by the functions defined later in this notebook.
nlp = spacy.load('en_core_web_lg')

In [8]:
df = pd.read_csv('example.csv')

# These columns hold Python literals serialized as strings (dicts for
# ability / ability_filtered, lists for aspects); parse them back into objects.
for literal_column in ('ability', 'ability_filtered', 'aspects'):
    df[literal_column] = df[literal_column].apply(ast.literal_eval)

# DataFrame.info() prints its summary itself and returns None, so it is called
# bare rather than wrapped in print() (which would emit a stray "None").
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   index             91 non-null     int64 
 1   review            91 non-null     object
 2   ability           91 non-null     object
 3   ability_filtered  91 non-null     object
 4   aspects           91 non-null     object
dtypes: int64(1), object(4)
memory usage: 3.7+ KB
None


Unnamed: 0,index,review,ability,ability_filtered,aspects
0,0,"The staff were incredibly helpful and patient,...","{0: [('staff', 'were helpful'), ('staff', 'wer...","{0: [('staff', 'were helpful'), ('staff', 'wer...",[staff]
1,1,I had a great experience purchasing my phone h...,"{0: [('process', 'was smooth'), ('process', 'w...","{0: [('process', 'was quick')]}",[process]
2,2,"Their selection of phones is amazing, and the ...","{0: [('selection', 'is amazing'), ('price', 'a...","{0: [('selection', 'is amazing')]}","[price, selection]"
3,3,I appreciate how the staff walked me through s...,"{0: [('I', 'appreciate walked through setting ...","{0: [('I', 'appreciate walked me')]}",[new device]
4,4,"Great customer service, I left with the phone ...","{0: [('question', 'answered'), ('I', 'left wit...","{0: [('question', 'answered'), ('I', 'left wit...",[question]


In [9]:
def contraction(x):
    """
    Collapse a dict of tuple lists into a single string of short sentences.

    parameters
    -----------
    x: dict whose values are lists of string tuples,
       e.g. {0: [('staff', 'were helpful'), ('staff', 'were patient')]}

    return: string in which every tuple is joined with spaces, the resulting
            phrases are separated by '. ' and terminated by a final '.';
            an empty string when there are no tuples at all
    """
    phrases = [' '.join(pair) for group in x.values() for pair in group]

    if not phrases:
        return ''
    return '. '.join(phrases) + '.'
    
# Turn every row's 'ability' dict into one string of short sentences; the
# result is a NumPy array with one string per row.
corpus = df['ability'].apply(contraction).values

corpus

array(['staff were helpful. staff were patient.',
       'process was smooth. process was quick.',
       'selection is amazing. price are competitive.',
       'I appreciate walked through setting new device. I appreciate walked me.',
       'question answered. I left with phone.',
       'I could not resist upgrading. I could not resist offer amazing deals on phones.',
       'technician fixed issue. I expected. technician fixed faster.',
       'experience really know stuff.',
       'variety was impressive. I found perfect case.',
       'staff was knowledgeable.', 'price were reasonable.',
       'staff really went mile.',
       'service service be Excellent. they helped find.',
       'deal was friendly. staff was friendly.',
       'I love store. buying buying be experience. fixing buying be experience.',
       'I got good deal on old phone.',
       'service are quick. service are reliable.',
       'staff was helpful in setting phone.', 'I m satisfied.',
       'staff was pa

In [10]:
def get_all_token_dep_right(token, dep):
    """
    Collect the children of a token that carry a given dependency label and
    sit to the right of it.

    parameters
    -----------
    token: token-like object exposing `children` and `i`
    dep: string/list of dependency labels to accept

    return: list of the children whose `dep_` is one of `dep` and whose
            index `i` is greater than `token.i`, in iteration order
    """
    labels = [dep] if type(dep) is str else dep
    return [
        child
        for child in token.children
        if child.dep_ in labels and child.i > token.i
    ]

# Get the first child token that has a specific dependency label
def get_token_dep(token, dep):
    """
    Find the first child of a token that carries a given dependency label.

    parameters
    -----------
    token: token-like object exposing `children`
    dep: string/list of dependency labels to accept

    return: the first child whose `dep_` is one of `dep`, or None when no
            child matches
    """
    labels = [dep] if type(dep) is str else dep
    return next((child for child in token.children if child.dep_ in labels), None)

def cross_product_tuple(first, second):
    """
    Pair every element of `first` with every element of `second`.

    A bare string argument is treated as a single element rather than being
    iterated character by character.

    parameters
    -----------
    first: list/string
    second: list/string

    return: list of (first_item, second_item) tuples, ordered by `first`
            and then by `second`
    """
    left = [first] if type(first) is str else first
    right = [second] if type(second) is str else second
    return [(a, b) for a in left for b in right]

In [67]:
# Label each sentence with (class, verb/adjective) pairs
# NOTE:
# - TODO: add sentiment, e.g. {'adj': {'positive': [], 'negative': [], 'neutral': []}}

def meta_data_swot(corpus):
    """
    Group the verbs and adjectives found in a corpus by grammatical subject.

    Every text is parsed with the module-level `nlp` pipeline. Within each
    sentence, tokens whose dependency label is 'nsubj' or 'nsubjpass' are
    treated as subjects:

    - a non-pronoun subject is recorded by its lemma;
    - the pronoun 'they' is recorded as 'staff', 'i' and 'this' as 'store',
      and any other pronoun is not recorded as a subject;
    - when the subject's head is a VERB whose lemma is neither 'be' nor 'do',
      that lemma is recorded as a verb of the sentence;
    - when the subject's head is an AUX with an 'acomp' child longer than
      three characters, that child's lemma is recorded as an adjective.

    Every subject of a sentence is then paired with every verb and every
    adjective recorded for that same sentence.

    parameters
    -----------
    corpus: iterable of strings to parse

    return: dict of the form
            {subject: {'verb': [(lemma, sentence_text), ...],
                       'adj':  [(lemma, sentence_text), ...]}}
            where the 'verb' / 'adj' key is present only if at least one
            entry was recorded for that subject
    """
    data = {}

    # TODO: sentiment buckets (positive / neutral / negative) per category are
    # planned but not implemented yet.

    for text in corpus:
        doc = nlp(text)

        for sent in doc.sents:
            subject = []
            verb = []
            adjv = []
            for token in sent:
                if token.dep_ not in ('nsubj', 'nsubjpass'):
                    continue

                if token.pos_ != 'PRON':
                    subject.append(token.lemma_)
                else:
                    # Map pronouns onto the entity they are taken to stand for
                    pronoun = token.text.lower()
                    if pronoun == 'they':
                        subject.append('staff')
                    elif pronoun in ('i', 'this'):
                        subject.append('store')

                head = token.head
                # The original test `!= 'be' or != 'do'` was always true, so
                # these two lemmas were never actually excluded.
                if head.pos_ == 'VERB' and head.lemma_ not in ('be', 'do'):
                    verb.append(head.lemma_)

                if head.pos_ == 'AUX':
                    adj = get_token_dep(head, 'acomp')
                    # Complements of three characters or fewer are skipped
                    if adj and len(adj.text) > 3:
                        adjv.append(adj.lemma_)

            # An empty verb / adjective list yields no pairs, so no guard is needed
            for s, v in cross_product_tuple(subject, verb):
                data.setdefault(s, {}).setdefault('verb', []).append((v, sent.text))

            for s, a in cross_product_tuple(subject, adjv):
                data.setdefault(s, {}).setdefault('adj', []).append((a, sent.text))

    return data

# Build the subject -> {'verb': [...], 'adj': [...]} index for the whole corpus
test = meta_data_swot(corpus)
test

{'staff': {'adj': [('helpful', 'staff were helpful.'),
   ('patient', 'staff were patient.'),
   ('knowledgeable', 'staff was knowledgeable.'),
   ('friendly', 'staff was friendly.'),
   ('helpful', 'staff was helpful in setting phone.'),
   ('patient', 'staff was patient with questions.'),
   ('informative', 'staff was informative.'),
   ('quick', 'they were quick in setting phone.'),
   ('ready', 'staff are ready.'),
   ('able', 'they were able.'),
   ('apologetic', 'staff was not apologetic.'),
   ('unhelpful', 'staff was unhelpful.'),
   ('rude', 'staff was rude.'),
   ('unprofessional', 'staff was unprofessional.')],
  'verb': [('go', 'staff really went mile.'),
   ('help', 'they helped find.'),
   ('help', 'they helped choose within budget.'),
   ('help', 'they helped choose phone.'),
   ('make', 'staff made.'),
   ('help', 'they helped get great deal.'),
   ('help', 'they helped save lot.'),
   ('resolve', 'they always resolve quickly.'),
   ('resolve', 'they always resolve issu

In [44]:
# Maps each category label to the list of keywords grouped under it.
# NOTE(review): not referenced by any code visible in this part of the
# notebook; presumably the keywords are the subject keys produced by
# meta_data_swot — confirm against the code that consumes this mapping.
mapper_classification = {
    'customer service' : ['staff', 'selection', 'support', 'deal', 'representative'],
    'phone service' : ['service', 'process', 'technician', 'experience', 'repair', ],
    'store atmosphere' : ['phone', 'store', 'price', 'variety', 'warranty', 'place', 'policy'],
}

In [61]:
def sentence_per_verb(data, class_):
    """
    Count how often each verb occurs for one subject and gather its sentences.

    parameters
    -----------
    data: dict of the form {subject: {'verb': [(verb, sentence), ...], ...}}
    class_: subject key to look up in `data`

    return: dict mapping each verb to a tuple (count, sentences), where
            `sentences` is every sentence recorded for that verb joined by a
            single space in order of appearance; an empty dict when the
            subject has no 'verb' entries

    raises: KeyError when `class_` is not a key of `data`
    """
    # A subject recorded only with adjectives has no 'verb' key at all;
    # treat that as "no verbs" instead of failing with a KeyError.
    verb_entries = data[class_].get('verb', [])

    count_words = {}
    for key, sentence in verb_entries:
        if key in count_words:
            count, text = count_words[key]
            count_words[key] = (count + 1, text + ' ' + sentence)
        else:
            count_words[key] = (1, sentence)

    return count_words

# Verb counts and their joined sentences for the 'staff' subject
example = sentence_per_verb(test, 'staff')
example

{'go': (1, 'staff really went mile.'),
 'help': (8,
  'they helped find. they helped choose within budget. they helped choose phone. they helped get great deal. they helped save lot. they even helped data. they even helped transfer without extra charge. they even helped transfer contacts.'),
 'make': (1, 'staff made.'),
 'resolve': (4,
  'they always resolve quickly. they always resolve issues. they resolved issue. they resolved very quickly professionally.'),
 'fail': (3,
  'they not fail recommend to friends. they not fail recommend to family. they not fail recommend store.'),
 'take': (2, 'staff took time. they did not take responsibility.'),
 'accommodate': (1, 'staff was accommodating.'),
 'give': (3,
  'they even gave discount on repair. staff gave incorrect information about plan. they twice gave refund.'),
 'fix': (2, 'they fixed screen. they fixed perfectly.'),
 'offer': (2, 'they offer discounts. they offer fantastic promotions.'),
 'attend': (1, 'they fast attend to customer

In [65]:
def get_threshold(data, threshold=75):
    """
    Compute a percentile of the counts stored in a {key: (count, text)} dict.

    parameters
    -----------
    data: dict whose values are (count, text) tuples
    threshold: percentile to compute, passed to np.percentile (default 75)

    return: the `threshold`-th percentile of all counts
    """
    counts = [count for count, _ in data.values()]
    return np.percentile(counts, threshold)

# 75th percentile (the default threshold) of the verb counts in `example`
get_threshold(example)

2.0