In [250]:
# Utility: Preparation text

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Matching is case-insensitive. The first character of the matched text is
    kept in the replacement, so "Don't" becomes "Do not".

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary mapping each contraction to its
        expanded form.

    Returns:
    str: The sentence with every known contraction expanded. Text that matches
        the pattern but has no usable expansion is left untouched.
    """
    # Empty keys (or an empty mapping) would produce a pattern that matches the
    # empty string at every position, so they are ignored.
    # Longest keys come first: regex alternation takes the first alternative
    # that matches, so "can't've" has to be tried before its prefix "can't".
    alternatives = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
    if not alternatives:
        return sentence

    # Keys are literal text, not regex fragments, so they are escaped.
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE | re.DOTALL,
    )

    # Lower-cased view of the mapping so a lookup succeeds whatever the casing
    # of the keys or of the matched text.
    lowered_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    def expanded_match(contraction):
        """
        Build the replacement for one regex match.

        Parameters:
        contraction (re.Match): The match object for the found contraction.

        Returns:
        str: The expanded contraction, or the matched text when no expansion
            is available.
        """
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match) or lowered_mapping.get(match.lower())
        if not expanded_contraction:
            return match
        # Keep the original first character to preserve capitalisation
        return match[0] + expanded_contraction[1:]

    return contractions_pattern.sub(expanded_match, sentence)


def remove_extra_spaces(sentence):
    """
    Collapse every run of whitespace into one space and trim both ends.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with single spaces only and no leading/trailing space.
    """
    collapsed = re.sub(r'\s+', ' ', sentence)
    return collapsed.strip()


def remove_non_ascii(text):
    """
    Drop every character whose code point is outside the ASCII range.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The text restricted to characters with code point below 128.
    """
    ascii_only = (symbol for symbol in text if ord(symbol) <= 127)
    return ''.join(ascii_only)

In [251]:
# Helper get specific token and handling token


##=========== EXTRACT ASPECT ============##
# Cross product two lists
def cross_product_str(first, second):
    """
    Join every element of `first` with every element of `second`.

    parameters
    -----------
    first: list/string (a plain string is treated as a one-element list)
    second: list/string (a plain string is treated as a one-element list)

    return: list of string, each being "<first item> <second item>" with
            surrounding whitespace stripped
    """
    left_items = [first] if type(first) == str else first
    right_items = [second] if type(second) == str else second
    return [
        (left + ' ' + right).strip()
        for left in left_items
        for right in right_items
    ]

def cross_product_tuple(first, second):
    """
    Pair every element of `first` with every element of `second`.

    parameters
    -----------
    first: list/string (a plain string is treated as a one-element list)
    second: list/string (a plain string is treated as a one-element list)

    return: list of tuple (first item, second item)
    """
    left_items = [first] if type(first) == str else first
    right_items = [second] if type(second) == str else second
    return [(left, right) for left in left_items for right in right_items]

# Cross product flatten
def cross_product_flatten(input_1, input_2):
    """
    Prepend every element of `input_1` to every sequence in `input_2`.

    parameters
    -----------
    input_1: list/string (a plain string is treated as a one-element list)
    input_2: list of tuples or list of lists

    return: list of flat tuples (name, *item)

    raises: ValueError when `input_2` is not a list of tuples/lists
    """
    is_valid = isinstance(input_2, list) and all(
        isinstance(entry, (tuple, list)) for entry in input_2
    )
    if not is_valid:
        raise ValueError("input_2 must be a list of tuples or a list of lists.")

    names = [input_1] if type(input_1) == str else input_1
    return [(name, *entry) for name in names for entry in input_2]

def cross_product_flatten_append(input_1, input_2):
    """
    Append every element of `input_1` to every sequence in `input_2`.

    parameters
    -----------
    input_1: list/string (a plain string is treated as a one-element list)
    input_2: list of tuples or list of lists

    return: list of flat tuples (*item, name), grouped by name

    raises: ValueError when `input_2` is not a list of tuples/lists
    """
    is_valid = isinstance(input_2, list) and all(
        isinstance(entry, (tuple, list)) for entry in input_2
    )
    if not is_valid:
        raise ValueError("input_2 must be a list of tuples or a list of lists.")

    names = [input_1] if type(input_1) == str else input_1
    return [(*entry, name) for name in names for entry in input_2]
    
# Get negation text
def get_neglect(token):
    """
    Return 'not' when the token has a negating child, otherwise ''.

    A child negates when its dependency label is 'neg', or when it is a
    determiner ('det') whose text is "no" (case-insensitive). A falsy token
    (e.g. None) yields ''.
    """
    if not token:
        return ''
    negated = any(
        child.dep_ == 'neg' or (child.dep_ == 'det' and child.text.lower() == 'no')
        for child in token.children
    )
    return 'not' if negated else ''

# Get token specific pos tag
def get_token_pos(token, pos):
    """
    Return the first child of `token` whose `pos_` is in `pos`, or None.

    `pos` may be a single tag (string) or a collection of tags.
    """
    wanted = [pos] if type(pos) == str else pos
    return next((child for child in token.children if child.pos_ in wanted), None)

def get_token_pos_left(token, pos):
    """
    Return the first child with `pos_` in `pos` and an index lower than the
    token's own index (`child.i < token.i`), or None.

    `pos` may be a single tag (string) or a collection of tags.
    """
    wanted = [pos] if type(pos) == str else pos
    candidates = (
        child for child in token.children
        if child.pos_ in wanted and child.i < token.i
    )
    return next(candidates, None)

def get_token_pos_right(token, pos):
    """
    Return the first child with `pos_` in `pos` and an index greater than the
    token's own index (`child.i > token.i`), or None.

    `pos` may be a single tag (string) or a collection of tags.
    """
    wanted = [pos] if type(pos) == str else pos
    candidates = (
        child for child in token.children
        if child.pos_ in wanted and child.i > token.i
    )
    return next(candidates, None)
    
# Get token with a specific dependency
def get_token_dep(token, dep):
    """
    Return the first child of `token` whose `dep_` is in `dep`, or None.

    `dep` may be a single label (string) or a collection of labels.
    """
    wanted = [dep] if type(dep) == str else dep
    return next((child for child in token.children if child.dep_ in wanted), None)

def get_token_dep_left(token, dep):
    """
    Return the first child with `dep_` in `dep` and an index lower than the
    token's own index (`child.i < token.i`), or None.

    `dep` may be a single label (string) or a collection of labels.
    """
    wanted = [dep] if type(dep) == str else dep
    candidates = (
        child for child in token.children
        if child.dep_ in wanted and child.i < token.i
    )
    return next(candidates, None)

def get_token_dep_right(token, dep):
    """
    Return the first child with `dep_` in `dep` and an index greater than the
    token's own index (`child.i > token.i`), or None.

    `dep` may be a single label (string) or a collection of labels.
    """
    wanted = [dep] if type(dep) == str else dep
    candidates = (
        child for child in token.children
        if child.dep_ in wanted and child.i > token.i
    )
    return next(candidates, None)

def get_all_token_dep(token, dep):
    """
    Return every child of `token` whose `dep_` is in `dep`, in child order.

    `dep` may be a single label (string) or a collection of labels.
    """
    wanted = [dep] if type(dep) == str else dep
    return [child for child in token.children if child.dep_ in wanted]

def get_all_token_dep_right(token, dep):
    """
    Return every child with `dep_` in `dep` and an index greater than the
    token's own index (`child.i > token.i`), in child order.

    `dep` may be a single label (string) or a collection of labels.
    """
    wanted = [dep] if type(dep) == str else dep
    return [
        child for child in token.children
        if child.dep_ in wanted and child.i > token.i
    ]

def get_all_token_dep_left(token, dep):
    """
    Return every child with `dep_` in `dep` and an index lower than the
    token's own index (`child.i < token.i`), in child order.

    `dep` may be a single label (string) or a collection of labels.
    """
    wanted = [dep] if type(dep) == str else dep
    return [
        child for child in token.children
        if child.dep_ in wanted and child.i < token.i
    ]

# Crawl all possible conjuncts
def extract_conj(token, neglect=False, lemma=False):
    """
    Follow the chain of 'conj' children starting at `token` and collect text.

    Each step takes the first 'conj' child of the current token, so only one
    chain is walked.

    parameters
    -----------
    token: starting token (not included in the result)
    neglect: when True, each entry is prefixed with the output of
             `get_neglect` for that conjunct (and stripped)
    lemma: only used when `neglect` is True; use `lemma_` instead of `text`

    return: list of string
    """
    collected = []
    node = get_token_dep(token, dep='conj')
    while node:
        if not neglect:
            collected.append(node.text)
        else:
            word = node.lemma_ if lemma else node.text
            collected.append((get_neglect(node) + ' ' + word).strip())
        node = get_token_dep(node, dep='conj')

    return collected

def get_all_token_conj(token):
    """
    Follow the chain of 'conj' children starting at `token` and return the
    conjunct tokens themselves (the starting token is not included).
    """
    chain = []
    node = token
    while True:
        node = get_token_dep(node, dep='conj')
        if not node:
            return chain
        chain.append(node)

# Get sentences that include coordinating conjunction and its conjunct
def get_text_conj(token):
    """
    Build one string from a token and the chain of its conjuncts.

    The previous implementation called `extract_conj(token, all_token=True)`,
    a keyword `extract_conj` does not accept, so every call raised TypeError.
    The conjunct tokens are now obtained from `get_all_token_conj`.

    parameters
    -----------
    token: head token of the coordination

    return: string with the texts joined by ', ' (a token whose dependency is
            'cc' is followed by a single space instead)
    """
    # Head token first, then every conjunct token in chain order
    tokens = [token] + get_all_token_conj(token)

    pieces = []
    last = len(tokens) - 1
    for position, item in enumerate(tokens):
        pieces.append(item.text)
        if position < last:
            # NOTE(review): get_all_token_conj only follows 'conj' children, so
            # the 'cc' case is kept for callers that may pass a 'cc' head token.
            pieces.append(' ' if item.dep_ == 'cc' else ', ')

    return ''.join(pieces)

# Crawl all possible pre-modifiers of an object
def extract_pre_amod(token, lemma=False):
    """
    Collect the 'amod' and 'compound' children located before `token`.

    parameters
    -----------
    token: token whose children are inspected
    lemma: when True return `lemma_`, otherwise `text`

    return: list of string ordered by token index
    """
    anchor = token.i
    modifiers = [
        child for child in token.children
        if child.dep_ in ['amod', 'compound'] and child.i < anchor
    ]
    # Order by position in the document
    modifiers.sort(key=lambda child: child.i)
    return [child.lemma_ if lemma else child.text for child in modifiers]

# Crawling all possible post modifier object
def extract_post_amod(token, lemma=False):
    """
    Collect the 'amod' children located after `token`.

    parameters
    -----------
    token: token whose children are inspected
    lemma: when True return `lemma_`, otherwise `text`

    return: list of string ordered by token index
    """
    anchor = token.i
    modifiers = [
        child for child in token.children
        if child.dep_ == 'amod' and child.i > anchor
    ]
    # Order by position in the document
    modifiers.sort(key=lambda child: child.i)
    return [child.lemma_ if lemma else child.text for child in modifiers]

# Crawling all possible adverb
def extract_adv(token, lemma=True):
    """
    Collect the adverb children of `token`, split by side.

    Children with `pos_ == 'ADV'` are kept unless their lower-cased lemma is
    one of the conjunction-like words listed below.

    parameters
    -----------
    token: token whose children are inspected
    lemma: when True return `lemma_`, otherwise `text`

    return: tuple (adverbs before the token, adverbs after the token), each a
            list of string ordered by token index
    """
    conjunctions = [
        # Coordinating conjunctions
        "for", "and", "nor", "but", "or", "yet", "so",

        # Subordinating conjunctions
        "although", "because", "since", "if", "when", "while", "before", "after", "unless", "though",

        # Correlative conjunctions (listed as single strings)
        "either", "neither", "both", "also", "whether", "as",

        # Conjunctive adverbs
        "however", "therefore", "moreover", "consequently", "nevertheless", "thus", "furthermore"
    ]

    anchor = token.i
    before, after = [], []
    for child in token.children:
        if child.pos_ != 'ADV' or child.lemma_.lower() in conjunctions:
            continue
        label = child.lemma_ if lemma else child.text
        if child.i < anchor:
            before.append((child.i, label))
        elif child.i > anchor:
            after.append((child.i, label))

    # Order each side by position in the document
    before.sort(key=lambda pair: pair[0])
    after.sort(key=lambda pair: pair[0])

    return [label for _, label in before], [label for _, label in after]

# Crawl the prepositional phrases after a particular token
def crawling_after_token_prep_phrase(token, neglect=False):
    """
    Build the prepositional phrases that follow `token`.

    For every 'prep' child that is not located before the token, the chain of
    'pcomp' / 'dobj' / 'pobj' children is followed. Complements extend the
    phrase prefix; objects (with their pre-modifiers and conjuncts) become the
    phrase endings.

    parameters
    -----------
    token: token whose 'prep' children are inspected
    neglect: forwarded to `extract_conj` for the conjunct objects

    return: list of string (prefix combined with every collected object)
    """
    phrases = []
    anchor = token.i
    chain_deps = ['pcomp', 'dobj', 'pobj']

    for preposition in get_all_token_dep(token, dep='prep'):
        # Ignore prepositions located to the left of the base token
        if anchor > preposition.i:
            continue

        objects = []
        complement = [preposition.text]
        node = get_token_dep(preposition, dep=chain_deps)
        while node:
            if node.dep_ in ['dobj', 'pobj']:
                # Object: keep its pre-modifiers, then add its conjuncts
                modifiers = ' '.join(extract_pre_amod(node))
                objects += cross_product_str(modifiers, node.text)
                objects += extract_conj(node, neglect=neglect)
            else:
                # Complement: extend the phrase prefix
                complement = cross_product_str(complement, node.text)
            node = get_token_dep(node, dep=chain_deps)

        phrases += cross_product_str(complement, objects)

    return phrases

def get_sentence_location(mapper, position):
    """
    Return the key of the first interval in `mapper` that contains `position`.

    parameters
    -----------
    mapper: dict mapping a key to an interval (start, end); start is
            inclusive and end is exclusive
    position: value to locate

    return: the matching key, or None when no interval contains `position`
    """
    for sentence_id, bounds in mapper.items():
        if bounds[0] <= position < bounds[1]:
            return sentence_id
    return None



In [252]:
# Coreference Resolution

# Generate mapper pronouns-antecedents (subject only)
def get_mapper_pron_ant(doc):
    """
    Map pronoun token indices to the index of a subject antecedent.

    Antecedents are NOUN/PROPN tokens with dependency 'nsubj'. Pronouns are
    PRON tokens with dependency 'nsubj' or 'poss', excluding a small list of
    first/second person forms. For each pronoun the search starts in its own
    sentence and moves backwards sentence by sentence; the first antecedent
    located before the pronoun is used.

    return: dict {pronoun token index: antecedent token index}
    """

    def locate_subject_ant_pron(_doc):
        """Return (antecedents, pronouns) as lists of (token, index, sentence number)."""
        prohibit_pronouns = ['i', 'you', 'me', 'my', 'mine']

        # Sentence number -> (start, end) token interval
        sentence_points = {
            number: (span.start, span.end) for number, span in enumerate(_doc.sents)
        }

        antecedents, pron = [], []
        for token in _doc:
            # Potential antecedent: a noun acting as subject
            if token.pos_ in ['NOUN', 'PROPN'] and token.dep_ == 'nsubj':
                antecedents.append(
                    (token, token.i, get_sentence_location(sentence_points, token.i))
                )

            # Potential pronoun: subject or possessive, not in the prohibited list
            is_candidate = token.pos_ == 'PRON' and token.text.lower() not in prohibit_pronouns
            if is_candidate and token.dep_ in ('nsubj', 'poss'):
                pron.append(
                    (token, token.i, get_sentence_location(sentence_points, token.i))
                )

        return (antecedents, pron)

    def filter_sentence(_list, location):
        """Keep the entries whose last field equals `location`."""
        return [entry for entry in _list if entry[-1] == location]

    mapper = {}
    antecedents, pronouns = locate_subject_ant_pron(doc)

    for token_pron, index_pron, sent_pron in pronouns:
        pron_role_ok = 'subj' in token_pron.dep_ or 'poss' in token_pron.dep_

        # Walk from the pronoun's sentence back to the first sentence
        sentence_no = sent_pron
        resolved = False
        while sentence_no > -1 and not resolved:
            for token_ant, index_ant, _ in filter_sentence(antecedents, sentence_no):
                # Antecedent must be a subject located before the pronoun
                if 'subj' in token_ant.dep_ and pron_role_ok and index_ant < index_pron:
                    mapper[index_pron] = index_ant
                    resolved = True
                    break
            sentence_no -= 1

    return mapper

In [335]:
# Main aspect extraction

# Extract all raw aspects
def get_raw_aspects(doc):
    """
    Collect candidate aspect phrases from a parsed document.

    Tokens whose text is only digits/non-word characters, or shorter than
    three characters, are skipped. For the remaining tokens the first
    matching rule applies:

    1. NOUN directly preceded by an ADJ: "adj noun" when the adjective is not
       in `bing_liu_opinion_words`, otherwise the noun alone.
    2. NOUN directly preceded by a NOUN: "noun noun".
    3. NOUN that is a direct object ('dobj') or subject ('nsubj').
    4. NOUN that is a conjunct ('conj') of another NOUN.
    5. 'advmod' token whose head is a NOUN: "head adverb".

    Reads the module-level `bing_liu_opinion_words` collection.

    return: list of tuple (aspect, start, end), de-duplicated and sorted by
            start index, then by text
    """

    def is_abnormal_noun(text):
        """
        True when `text` contains only special characters/digits, or is
        shorter than 3 characters.
        """
        # Uses the argument itself; the earlier version read the enclosing
        # loop variable and silently ignored its parameter.
        return bool(re.match(r'^[0-9\W]+$', text)) or len(text) < 3

    storage = []

    for idx, token in enumerate(doc):
        # Make sure the text is not abnormal
        if is_abnormal_noun(token.text):
            continue

        is_noun = token.pos_ == 'NOUN'
        previous = doc[idx - 1] if idx != 0 else None

        # Noun preceded by an adjective
        if is_noun and previous is not None and previous.pos_ == 'ADJ':
            if previous.text not in bing_liu_opinion_words:
                # Adjective is NOT an opinion word: keep it as part of the aspect
                storage.append((previous.text + ' ' + token.text, idx - 1, idx + 1))
            else:
                # Adjective is an opinion word: the aspect is the noun only
                storage.append((token.text, idx, idx + 1))
            continue

        # Noun preceded by another noun
        if is_noun and previous is not None and previous.pos_ == 'NOUN':
            storage.append((previous.text + ' ' + token.text, idx - 1, idx + 1))
            continue

        # Noun that is a direct object or the subject
        if is_noun and token.dep_ in ('dobj', 'nsubj'):
            storage.append((token.text, idx, idx + 1))
            continue

        # Noun that is a conjunct of another noun
        if is_noun and token.dep_ == 'conj' and token.head.pos_ == 'NOUN':
            storage.append((token.text, idx, idx + 1))
            continue

        # Adverb modifier attached to a noun
        # NOTE(review): the span is (head index, adverb index + 1); when the
        # adverb precedes its head the start is greater than the end - confirm
        # that downstream code tolerates this.
        if token.dep_ == 'advmod' and token.head.pos_ == 'NOUN':
            storage.append((token.head.text + ' ' + token.text, token.head.i, idx + 1))

    # Remove duplicates, then order by start index and text
    return sorted(set(storage), key=lambda x: (x[1], x[0]))

# Pruning raw aspects
def prunning_aspect(list_, doc):
    """
    Merge overlapping/adjacent raw aspects and map each aspect to sentences.

    Two consecutive items are merged into the later one when
    - the current item's last token is the next item's first token
      (`end - 1 == next start`): the shared word is kept once; or
    - the next item starts right where the current one ends
      (`end == next start`): the texts are concatenated.

    The input list is left unmodified; merging happens on a copy.

    parameters
    -----------
    list_: list of tuple (aspect, start, end), expected in start order
    doc: parsed document exposing `sents` (each with start, end, text)

    return: dict {lower-cased aspect: list of sentence texts containing it}
    """
    # Copy so the caller's list is not rewritten by the merge step
    items = list(list_)

    # Materialise the sentences once instead of once per aspect
    sentences = list(doc.sents)
    sentence_points = {i: (s.start, s.end) for i, s in enumerate(sentences)}

    drop_idx = set()
    for idx in range(len(items) - 1):
        # Read the current item fresh: it may be the result of a previous merge
        item = items[idx]
        next_item = items[idx + 1]

        if item[-1] - 1 == next_item[1]:
            # Overlap on one token: skip the next item's first word
            tail = ' '.join(next_item[0].split()[1:])
            # strip() avoids a trailing space when the next item is one word
            merged_text = (item[0] + ' ' + tail).strip()
            items[idx + 1] = (merged_text, item[1], next_item[-1])
            drop_idx.add(idx)
        elif item[-1] == next_item[1]:
            # Directly adjacent: plain concatenation
            items[idx + 1] = (item[0] + ' ' + next_item[0], item[1], next_item[-1])
            drop_idx.add(idx)

    kept = [entry for i, entry in enumerate(items) if i not in drop_idx]

    # Aspect (lower-cased) -> sentences in which it was found
    storage = {}
    for aspect, start, end in kept:
        sentence_location = get_sentence_location(sentence_points, start)
        sentence = sentences[sentence_location].text
        storage.setdefault(aspect.lower(), []).append(sentence)
    return storage

In [254]:
# Utility: Subject active rules (Conjunct Handling)


##==================== CONJUNCT HANDLING ==============================##

def ability_obj_conjunct(obj, base, set_rep=False):
    """
    Build one phrase per conjunct of an object token.

    Each phrase follows the pattern:
        base + not (optional) + adj (optional) + conjunct object

    parameters
    -----------
    obj: object token whose conjunct chain is expanded
    base: string/list of string placed in front of every phrase
    set_rep: when True each phrase is paired with a one-element tuple holding
             the conjunct lemma (via `cross_product_tuple`)

    return: list of string, or list of (string, (lemma,)) when `set_rep`
    """
    result = []
    for conjunct in get_all_token_conj(obj):
        # The negation may hang on the conjunct itself or on its left adjective
        leading_amod = get_token_dep_left(conjunct, dep='amod')
        negation = get_neglect(conjunct) or get_neglect(leading_amod)
        modifiers = ' '.join(extract_pre_amod(conjunct))

        # base + not (optional)
        phrases = cross_product_str(base, negation)
        # base + not (optional) + adj (optional)
        phrases = cross_product_str(phrases, modifiers)
        # base + not (optional) + adj (optional) + conjunct object
        phrases = cross_product_str(phrases, conjunct.text)

        if set_rep:
            # Attach the representative (lemma) of the conjunct
            phrases = cross_product_tuple(phrases, [(conjunct.lemma_,)])
        result += phrases
    return result

def ability_adj_conjunct(adj, base, set_rep=False):
    """
    Build one phrase per conjunct of an adjective token.

    Each phrase follows the pattern:
        base + not (optional) + conjunct adjective

    The negation of each conjunct was previously computed but never added to
    the phrase, so "tasty and not cheap" produced "... cheap". It is now
    inserted before the adjective, as `ability_adv_conjunct` does for adverbs.

    parameters
    -----------
    adj: adjective token whose conjunct chain is expanded
    base: string/list of string placed in front of every phrase
    set_rep: when True each phrase is paired with a one-element tuple holding
             the conjunct lemma (via `cross_product_tuple`)

    return: list of string, or list of (string, (lemma,)) when `set_rep`
    """
    result = []
    for conjunct in get_all_token_conj(adj):
        # Negation attached to this conjunct ('' when there is none)
        neg = get_neglect(conjunct)

        # base + not (optional)
        ability = cross_product_str(base, neg)
        # base + not (optional) + adj
        ability = cross_product_str(ability, conjunct.text)

        if set_rep:
            # Attach the representative (lemma) of the conjunct
            result += cross_product_tuple(ability, [(conjunct.lemma_,)])
        else:
            result += ability
    return result


def ability_adv_conjunct(advmod, base, set_rep=False):
    """
    Build one phrase per conjunct of an adverb token.

    Each phrase follows the pattern:
        base + not (optional) + pre-adv (optional) + conjunct adverb

    parameters
    -----------
    advmod: adverb token whose conjunct chain is expanded
    base: string/list of string placed in front of every phrase
    set_rep: when True each phrase is paired with a one-element tuple holding
             the conjunct lemma (via `cross_product_tuple`)

    return: list of string, or list of (string, (lemma,)) when `set_rep`
    """
    result = []

    for conjunct in get_all_token_conj(advmod):
        # Adverb modifier standing before the conjunct, with its own pre-adverbs
        modifier = get_token_dep_left(conjunct, dep=['advmod', 'npadvmod'])
        if not modifier:
            pre_adv = ' '
        else:
            leading, _ = extract_adv(modifier)
            pre_adv = ' '.join([' '.join(leading), modifier.text]).strip()

        # base + not (optional)
        phrases = cross_product_str(base, get_neglect(conjunct))
        # base + not (optional) + pre-adv (optional)
        phrases = cross_product_str(phrases, pre_adv)
        # base + not (optional) + pre-adv (optional) + adv
        phrases = cross_product_str(phrases, conjunct.text)

        if set_rep:
            # Attach the representative (lemma) of the conjunct
            phrases = cross_product_tuple(phrases, [(conjunct.lemma_,)])
        result += phrases

    return result

In [255]:
# Utility: subject active rules (Head is verb) components

def get_components_verb(verb):
    """
    Look up the main dependents of a verb token.

    return: dict with keys
        'obj'    - first 'dobj' child
        'advmod' - first 'advmod'/'npadvmod' child located after the verb
        'prep'   - first 'prep' child
        'acomp'  - first 'acomp' child
    Each value is None when no such child exists.
    """
    components = {}
    components['obj'] = get_token_dep(verb, dep='dobj')
    components['advmod'] = get_token_dep_right(verb, dep=['advmod', 'npadvmod'])
    components['prep'] = get_token_dep(verb, dep='prep')
    components['acomp'] = get_token_dep(verb, dep='acomp')
    return components

def base_sentence(main_aux, main_verb, neg):
    """
    Build the base phrase of a clause.

    Pattern: aux (optional) + not (optional) + adv (optional) + verb

    parameters
    -----------
    main_aux: auxiliary token or a falsy value
    main_verb: verb token or a falsy value
    neg: negation text ('' when there is none)

    return: list of string; [' '] when neither verb nor auxiliary is given
    """
    if not (main_verb or main_aux):
        return [' ']

    # not (optional)
    ability = cross_product_str(neg, ' ')

    if main_verb:
        # Adverb modifier before the verb, together with its own adverbs
        modifier = get_token_dep_left(main_verb, dep=['advmod', 'npadvmod'])
        if modifier:
            before, after = extract_adv(modifier)
            pre_adv = ' '.join([' '.join(before), modifier.text, ' '.join(after)]).strip()
        else:
            pre_adv = ' '

        # not (optional) + adv (optional)
        ability = cross_product_str(ability, pre_adv)
        # not (optional) + adv (optional) + verb
        ability = cross_product_str(ability, main_verb.text)

    if main_aux:
        # aux + not (optional) + adv (optional) + verb
        ability = cross_product_str(main_aux.text, ability)

    return ability


def ability_advmod(advmod, base):
    """
    Build the phrases for an adverb modifier.

    Pattern: base + pre-adv (optional) + adv + preposition phrase (optional)
    The phrases of the adverb's conjuncts (built on `base`) are appended.

    parameters
    -----------
    advmod: adverb token
    base: string/list of string placed in front of every phrase

    return: list of string
    """
    # Adverb modifier before the adverb, together with its own pre-adverbs
    modifier = get_token_dep_left(advmod, dep=['advmod', 'npadvmod'])
    if modifier:
        leading, _ = extract_adv(modifier)
        pre_adv = ' '.join([' '.join(leading), modifier.text]).strip()
    else:
        pre_adv = ' '

    # base + pre-adv (optional) + adv
    phrases = cross_product_str(base, cross_product_str(pre_adv, advmod.text))

    # Optional preposition phrase after the adverb
    trailing_prep = crawling_after_token_prep_phrase(advmod)
    if trailing_prep:
        phrases = cross_product_str(phrases, trailing_prep)

    # Conjunct adverb handling
    return phrases + ability_adv_conjunct(advmod, base, set_rep=False)

def ability_dobj(obj, base):
    """
    Build the phrases for a direct object.

    Core pattern: base + not (optional) + adj (optional) + direct object

    When the object has a 'prep' child the core is extended with the phrase
    from `ability_prep`; when it has an 'acl' child the core is extended with
    not (optional) + aux-acl (optional) + acl and handed to
    `ability_adnominal_clause`. Without either, the core itself is returned.
    The phrases of the object's conjuncts (built on `base`) are appended.

    parameters
    -----------
    obj: direct object token
    base: string/list of string placed in front of every phrase

    return: list of string
    """
    # The negation may hang on the object itself or on its left adjective
    leading_amod = get_token_dep_left(obj, dep='amod')
    negation = get_neglect(obj) or get_neglect(leading_amod)
    modifiers = ' '.join(extract_pre_amod(obj))

    # adj (optional) + direct object
    phrase = cross_product_str(modifiers, obj.text)
    # not (optional) + adj (optional) + direct object
    phrase = cross_product_str(negation, phrase)
    # base + not (optional) + adj (optional) + direct object
    phrase = cross_product_str(base, phrase)

    # Preposition after the object and adnominal clause
    prep = get_token_dep(obj, dep='prep')
    acl = get_token_dep(obj, dep='acl')

    result = []
    if not prep and not acl:
        result += phrase
    else:
        if prep:
            # core + preposition phrase
            result += cross_product_str(phrase, ability_prep(prep))
        if acl:
            aux_acl = get_token_dep(acl, dep='aux')
            # core + not (optional)
            clause_base = cross_product_str(phrase, get_neglect(acl))
            if aux_acl:
                # core + not (optional) + aux-acl
                clause_base = cross_product_str(clause_base, aux_acl.text)
            # core + not (optional) + aux-acl (optional) + acl
            clause_base = cross_product_str(clause_base, acl.text)
            result += ability_adnominal_clause(acl=acl, base=clause_base)

    # Conjunct object handling
    result += ability_obj_conjunct(obj, base=base)

    return result

def ability_prep(prep, base=None):
    """
    Build the prepositional-phrase abilities rooted at a preposition token.

    Parameters:
    prep: The preposition token.
    base (optional): Text (or list of texts) placed in front of the phrase.

    Returns:
    The phrases following the pattern
    [base +] preposition + preposition-complement (optional) + not (optional)
    + pre-adj (optional) + object, together with the object-conjunct variants.
    An empty list is returned when the preposition has neither an object nor
    a preposition complement.
    """
    # The object is looked up on the preposition first and, failing that,
    # on its preposition complement (pcomp).
    complement = None
    target = get_token_dep_right(prep, dep=['dobj', 'pobj'])
    if not target:
        complement = get_token_dep_right(prep, dep='pcomp')
        if complement:
            target = get_token_dep_right(complement, dep=['dobj', 'pobj'])

    # Phrase so far: preposition
    phrase = cross_product_str(prep.text, ' ')
    if not (complement or target):
        return []

    if complement:
        # Phrase so far: preposition + preposition-complement
        phrase = cross_product_str(phrase, complement.text)

    if target:
        # A negation may hang on the object itself or on its left-most adjective modifier.
        leading_amod = get_token_dep_left(target, dep='amod')
        negation = get_neglect(target) or get_neglect(leading_amod)
        modifiers = ' '.join(extract_pre_amod(target))

        # preposition + complement (optional) + not (optional) + pre-adj (optional) + object
        negated = cross_product_str(phrase, negation)
        modified = cross_product_str(negated, modifiers)
        variants = []
        variants += cross_product_str(modified, target.text)
        # Conjunct objects are attached to the phrase *without* negation/modifiers.
        variants += ability_obj_conjunct(target, base=phrase)
        phrase = variants

    # Without a base only the phrase itself is returned.
    return phrase if not base else cross_product_str(base, phrase)


def ability_acomp(acomp, base):
    """
    Attach an adjectival complement (and its negation, if any) to a base.

    Parameters:
    acomp: The adjectival-complement token.
    base: Text (or list of texts) that precedes the complement.

    Returns:
    The result of ``cross_product_str`` for the pattern
    base + not (optional) + acomp.
    """
    # not (optional) + acomp
    negated_complement = cross_product_str(get_neglect(acomp), acomp.text)
    # base + not (optional) + acomp
    return cross_product_str(base, negated_complement)


def ability_adnominal_clause(acl, base):
    """
    Build the abilities contributed by an adnominal clause (acl) verb.

    Parameters:
    acl: The verb token heading the adnominal clause.
    base: Text (or list of texts) built by the caller. The caller in this file
          passes a base that already ends with
          not (optional) + aux-acl (optional) + acl.

    Returns:
    list: The abilities produced by ``ability_relative_verb`` for the clause verb.
          When there are none and the verb is mostly intransitive (rate > 0.5)
          and is not 'be' / 'do' / 'have', the base itself is returned as the ability.
          Otherwise an empty list.
    """
    # Abilities derived from the clause verb's own dependents.
    abilities = ability_relative_verb(acl, base=base)
    if len(abilities) > 0:
        return list(abilities)

    # Intransitive rate score; a verb missing from the mapper is assumed transitive (0).
    int_rate_acl = map_verb_intrans.get(acl.text) or map_verb_intrans.get(acl.lemma_) or 0

    # The previous fallback referenced `comp` and `aux_acl`, which do not exist in this
    # scope (NameError). The check belongs to the clause verb itself, and since `base`
    # already carries aux-acl + acl, it is emitted as is.
    if int_rate_acl > 0.5 and acl.lemma_.lower() not in ['be', 'do', 'have']:
        # EXPECTED PATTERN: base (ending with aux-acl (optional) + acl)
        return [base] if isinstance(base, str) else list(base)

    return []

In [256]:
# Utility: Case Head is verb

def ability_relative_verb(verb, base):
    """
    Collect every ability that hangs off a particular verb.

    Parameters:
    verb: The verb token whose components are inspected.
    base: Text (or list of texts) that precedes each generated phrase.

    Returns:
    list: Abilities in the order adverb modifier, preposition, direct object,
          adjectival complement. Empty when the verb has none of them.
    """
    # NOTE: relies on get_components_verb returning its values in the order
    # (obj, advmod, prep, acomp).
    obj, advmod, prep, acomp = get_components_verb(verb).values()

    # (component, builder) pairs, listed in the order their results are appended.
    builders = (
        (advmod, ability_advmod),   # base + adv + prepositional phrase (optional)
        (prep, ability_prep),       # base + preposition phrase
        (obj, ability_dobj),        # base + direct object
        (acomp, ability_acomp),     # base + adjectival complement
    )

    collected = []
    for component, build in builders:
        if component:
            collected += build(component, base=base)
    return collected

def ability_verb(main_verb):
    """
    Extract the abilities expressed by a main verb and its clausal complement.

    Parameters:
    main_verb: The verb token that heads the clause.

    Returns:
    The result of ``cross_product_flatten_append('VERB', ...)`` over the collected
    (phrase, representative lemmas...) tuples. The representatives are the main
    verb lemma and, when a verbal complement exists, the complement lemma.
    """
    result = []
    reps = [main_verb.lemma_]  # Representative ability.

    ###============ DEFINE VARIABLES ================###
    # Main auxiliary and negation of the main verb.
    aux = get_token_dep(main_verb, dep='aux')
    neg = get_neglect(main_verb)
    # Intransitive rate score; a verb missing from the mapper is assumed transitive (0).
    int_rate = map_verb_intrans.get(main_verb.text) or map_verb_intrans.get(main_verb.lemma_) or 0

    #==================== COMPLIMENT ========================#
    # Only a verbal complement that is not itself passive is kept.
    # NOTE: a single verb having both a ccomp and an xcomp dependency simultaneously
    #       is rare, so only one complement is considered.
    comp = get_token_dep(main_verb, dep=['xcomp', 'ccomp'])
    if comp and (comp.pos_ not in ['VERB', 'AUX'] or get_token_dep(comp, dep='auxpass')):
        comp = None

    ###===================== CONDITION =====================###
    components = {'main_aux': aux, 'main_verb': main_verb, 'neg': neg}
    base = base_sentence(**components)

    # Abilities attached directly to the main verb.
    temp = ability_relative_verb(main_verb, base=base)
    result += cross_product_tuple(temp, [tuple(reps)])

    if comp:
        # The intransitive rate of the complement must be looked up for the complement
        # itself (it was previously looked up for the main verb).
        int_rate_comp = map_verb_intrans.get(comp.text) or map_verb_intrans.get(comp.lemma_) or 0

        # Add compliment verb as representative
        reps.append(comp.lemma_)
        # Update base: base + aux-comp (optional) + compliment
        aux_comp = get_token_dep(comp, dep='aux')
        if aux_comp:
            base = cross_product_str(base, aux_comp.text)
        base = cross_product_str(base, comp.text)

        temp = ability_relative_verb(comp, base=base)
        if len(temp) > 0:
            result += cross_product_tuple(temp, [tuple(reps)])
        elif int_rate_comp > 0.5 and comp.lemma_.lower() not in ['be', 'do', 'have']:
            # Nothing hangs off the complement, but it is intransitive.
            # EXPECTED PATTERN: aux (optional) + not (optional) + adv (optional) + verb
            #                   + aux-comp (optional) + compliment
            result += cross_product_tuple(base, [tuple(reps)])

    if len(result) == 0:
        # Nothing was found at all, but the main verb is intransitive.
        if int_rate > 0.5 and main_verb.lemma_.lower() not in ['be', 'do', 'have']:
            ability = base_sentence(**components)
            # EXPECTED PATTERN: aux (optional) + not (optional) + adv (optional) + verb
            result += cross_product_tuple(ability, [tuple(reps)])

    # Labeling VERB
    result = cross_product_flatten_append('VERB', result)
    return result

In [257]:
###============ AUXILIARY ================###

def ability_aux(aux):
    """
    Extract the abilities of a clause whose head is an auxiliary.

    Parameters:
    aux: The auxiliary token heading the clause.

    Returns:
    list: Labelled abilities, in this order:
          'ADJ'   for aux + not (optional) + adj,
          'OTHER' for aux + not (optional) + noun,
          'OTHER' for aux + not (optional) + prepositional phrase.
    """
    # Base: aux + not (optional)
    base = base_sentence(main_aux=aux, neg=get_neglect(aux), main_verb=None)

    # NOTE: if 'AUX' is root, only have one adjective with dependency acomp.
    adjective = get_token_dep(aux, dep='acomp')

    # A noun to the right that is actually a subject is not a complement; ignore it.
    noun = get_token_pos_right(aux, pos=['NOUN', 'PROPN'])
    if noun and noun.dep_ in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        noun = None

    preposition = get_token_dep(aux, dep='prep')

    # (component, builder, label) triples, in output order.
    plan = (
        (adjective, ability_aux_adj, 'ADJ'),
        (noun, ability_aux_noun, 'OTHER'),
        (preposition, ability_aux_prep, 'OTHER'),
    )

    collected = []
    for component, build, label in plan:
        if component:
            collected += cross_product_flatten_append(label, build(component, base=base))
    return collected

def ability_aux_adj(adj, base):
    """
    Build the abilities for an adjective complement of an auxiliary.

    Parameters:
    adj: The adjective token.
    base: Text (or list of texts) that precedes the adjective.

    Returns:
    list: (phrase, adjective lemma) tuples for the pattern
          base + adj + preposition phrase (optional), followed by whatever
          ``ability_adj_conjunct`` returns for the adjective's conjuncts.
    """
    # base + adj
    phrase = cross_product_str(base, adj.text)

    # base + adj + preposition phrase, when a preposition follows the adjective
    following_prep = get_token_dep(adj, dep='prep')
    if following_prep:
        phrase = cross_product_str(phrase, ability_prep(following_prep))

    # Pair every phrase with the adjective lemma as its representative.
    collected = []
    collected += cross_product_tuple(phrase, [(adj.lemma_,)])

    # Conjunct adjectives bring their own representatives.
    conjuncts = ability_adj_conjunct(adj, base=base, set_rep=True)
    if len(conjuncts) > 0:
        collected += conjuncts
    return collected

def ability_aux_noun(noun, base):
    """
    Build the abilities for a noun complement of an auxiliary.

    Parameters:
    noun: The noun token.
    base: Text (or list of texts) that precedes the noun phrase.

    Returns:
    list: (phrase, noun lemma) tuples for the pattern
          base + pre-modifier adjective (optional) + noun.
    """
    reps = [noun.lemma_]  # Representative ability

    # Get pre-modifier adjective of noun
    pre_adj = ' '.join(extract_pre_amod(noun))
    # Concatenate components into: base + pre-modifier adjective (optional)
    ability = cross_product_str(base, pre_adj)
    # Concatenate components into: base + pre-modifier adjective (optional) + noun.
    # This must extend `ability`, not `base`; extending `base` again discarded the
    # pre-modifier adjectives computed above.
    ability = cross_product_str(ability, noun.text)

    return list(cross_product_tuple(ability, [tuple(reps)]))

def ability_aux_prep(prep, base):
    """
    Build the abilities for a prepositional phrase attached to an auxiliary.

    Parameters:
    prep: The preposition token.
    base: Text (or list of texts) that precedes the prepositional phrase.

    Returns:
    list: One entry per generated phrase (base + prepositional phrase, object
          conjuncts included), each paired with the last whitespace-separated
          word of the phrase as its representative.
    """
    # base + prepositional phrase
    phrases = cross_product_str(base, ability_prep(prep))

    collected = []
    for phrase in phrases:
        # The final word of the phrase serves as the representative word.
        representative = phrase.split()[-1]
        collected += cross_product_tuple(phrase, [(representative,)])
    return collected

In [258]:
def ability_pass_agent(agent, base):
    """
    Build the abilities for the agent ("by ...") of a passive clause.

    Parameters:
    agent: The agent token of the passive verb.
    base: Text (or list of texts) for
          aux (optional) + neg (optional) + auxpass + verb.

    Returns:
    list: Phrases for the pattern base + agent + adj (optional) + object,
          followed by the object-conjunct variants. Empty when the agent has
          no object.
    """
    # Get object agent
    obj_agent = get_token_dep(agent, dep=['pobj', 'dobj'])
    if not obj_agent:
        return []

    # Update base: aux (optional) + neg (optional) + auxpass + verb + agent
    base = cross_product_str(base, agent.text)

    # Pre adjective modifier(s) of the agent object.
    # (The negation of the object was computed here before but never used; it is
    #  not part of the expected pattern, so the dead lookup is removed.)
    pre_adj = ' '.join(extract_pre_amod(obj_agent))

    # base + adj (optional)
    ability = cross_product_str(base, pre_adj)
    # base + adj (optional) + object. This must extend `ability`; extending `base`
    # again discarded the adjective modifiers.
    ability = cross_product_str(ability, obj_agent.text)

    # EXPECTED PATTERN: aux (optional) + neg (optional) + auxpass + verb + agent + adj (optional) + object
    result = []
    result += ability
    # Handling object conjuncts
    result += ability_obj_conjunct(obj_agent, base=base, set_rep=False)
    return result

In [259]:
# Utility: Helper subject active rules

def is_contain_question(token):
    """
    Tell whether a token has a question word among its 'advmod' / 'attr' dependents.

    Parameters:
    token: The token whose dependents are inspected.

    Returns:
    bool: True if any such dependent is a question word (case-insensitive).
    """
    question_words = ['what', 'who', 'why', 'whom', 'when', 'which', 'where', 'whose', 'how']
    dependents = get_all_token_dep(token, dep=['advmod', 'attr'])
    return any(dependent.text.lower() in question_words for dependent in dependents)

def is_perfect_sentence(sent):
    """
    Tell whether a sentence contains a subject.

    Parameters:
    sent: An iterable of tokens exposing ``dep_``.

    Returns:
    bool: True if at least one token is a nominal or clausal subject
          (active or passive), False otherwise.
    """
    subject_deps = ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']
    return any(token.dep_ in subject_deps for token in sent)

def imperfect_sentence_rules(token):
    """
    Derive abilities from a subject-less ("imperfect") sentence.

    Only a token whose head carries the same text as the token itself is
    considered; each of its left 'compound' / 'amod' dependents becomes an
    ability of the form "be <modifier>".

    Parameters:
    token: The candidate token.

    Returns:
    list: Labelled abilities ('ADJ' when the modifier is an adjective,
          'OTHER' otherwise); empty when nothing qualifies.
    """
    if token.head.text != token.text:
        return []

    # Get compound or amod
    modifiers = get_all_token_dep_left(token, dep=['compound', 'amod'])
    if len(modifiers) == 0:
        return []

    labelled = []
    for modifier in modifiers:
        phrase = cross_product_str('be', modifier.text.lower())
        label = 'ADJ' if modifier.pos_ == 'ADJ' else 'OTHER'
        # The modifier lemma is the representative of the ability.
        with_rep = cross_product_tuple(phrase, [(modifier.lemma_,)])
        labelled += cross_product_flatten_append(label, with_rep)
    return labelled

In [260]:
def subject_active_rules(token, subject):
    """
    Extract the abilities of an active-voice subject from its head.

    Parameters:
    token: The subject token; its head decides which rule applies.
    subject: The token whose text is compared with the subjects of the
             candidate verbs.

    Returns:
    list: Abilities from ``ability_aux`` (auxiliary head) or ``ability_verb``
          (verb head). Empty for any other head, and for a verb head that has
          a question word.
    """
    abilities = []
    head = token.head

    # Head is an auxiliary.
    if head.pos_ == 'AUX':
        abilities += ability_aux(head)
        return abilities

    # Any other non-verb head, or a question, yields nothing.
    if head.pos_ != 'VERB' or is_contain_question(head):
        return abilities

    if head.head.text == head.text:
        # The verb is its own head: handle it and every conjunct verb that either
        # has no subject of its own or shares this subject.
        candidates = [head]
        candidates += get_all_token_conj(head)
        for verb in candidates:
            own_subject = get_token_dep(verb, dep=['nsubj', 'nsubjpass'])
            if not own_subject or (own_subject.text == subject.text):
                abilities += ability_verb(verb)

    elif head.dep_ in ['conj', 'advcl']:
        # Conjunct / adverbial-clause verb: only when its own subject is this subject.
        own_subject = get_token_dep(head, dep=['nsubj'])
        if own_subject and (subject.text == own_subject.text):
            candidates = [head]
            if head.dep_ == 'advcl':
                candidates += get_all_token_conj(head)
            for verb in candidates:
                abilities += ability_verb(verb)

    return abilities

In [261]:
def base_sentence_passive(main_auxpass, main_verb, main_aux, neg):
    """
    Build the base text of a passive clause.

    Parameters:
    main_auxpass: The passive auxiliary token.
    main_verb: The main verb token.
    main_aux: The auxiliary token, or a falsy value when absent.
    neg: The negation text (may be empty).

    Returns:
    The result of ``cross_product_str`` for the pattern
    aux (optional) + neg (optional) + auxpass + adv (optional) + verb.
    When there is no auxiliary the negation follows the auxpass instead.
    """
    # Adverb in front of the verb, together with its own surrounding adverbs.
    adverb = get_token_dep_left(main_verb, dep=['advmod', 'npadvmod'])
    if adverb:
        before, after = extract_adv(adverb)
        adverb_text = ' '.join([' '.join(before), adverb.text, ' '.join(after)]).strip()
    else:
        adverb_text = ' '

    if main_aux:
        # aux + neg (optional) + auxpass
        opening = cross_product_str(main_aux.text, neg)
        opening = cross_product_str(opening, main_auxpass.text)
    else:
        # auxpass + neg (optional)
        opening = cross_product_str(main_auxpass.text, neg)

    # ... + adv (optional) + verb
    with_adverb = cross_product_str(opening, adverb_text)
    return cross_product_str(with_adverb, main_verb.text)

In [262]:
def subject_passive_rules(token):
    """
    Extract the abilities of a passive-voice subject.

    Parameters:
    token: The passive subject token; its head must be a verb with an auxpass.

    Returns:
    The result of ``cross_product_flatten_append('VERB', ...)`` over the collected
    (phrase, representative lemmas...) tuples, or an empty list when the head is
    not a verb or has no passive auxiliary.
    """
    # Get the token head (verb). Since passive form at least form: Subject + auxpass + verb
    head = token.head
    reps = [head.lemma_]
    if head.pos_ != 'VERB':
        return []

    ##================= GET ALL POSSIBLE COMPONENTS ===============##
    # 1. Tokens on the left side: negation, auxiliary, passive auxiliary.
    neg = get_neglect(head)
    aux = get_token_dep(head, dep='aux')
    auxpass = get_token_dep(head, dep='auxpass')
    if not auxpass:
        # Since passive sentence must have auxpass in its component.
        return []

    # 2. Tokens on the right side: agent, adverb modifier, preposition, complement.
    agent = get_token_dep_right(head, dep='agent')
    advmod = get_token_dep_right(head, dep=['advmod', 'npadvmod'])
    prep = get_token_dep(head, dep='prep')
    comp = get_token_dep(head, dep=['xcomp', 'ccomp'])

    ##================= STORING ABILITIES ===============##
    # Base: aux (optional) + neg (optional) + auxpass + adv (optional) + verb
    base = base_sentence_passive(main_auxpass=auxpass,
                                 main_verb=head,
                                 main_aux=aux,
                                 neg=neg)

    abilities = []
    if advmod:
        # EXPECTED PATTERN: base + adverb modifier
        abilities += ability_advmod(advmod, base=base)
    if agent:
        # EXPECTED PATTERN: base + agent + object
        abilities += ability_pass_agent(agent, base=base)
    if prep:
        # EXPECTED PATTERN: base + preposition phrase
        abilities += ability_prep(prep, base=base)

    result = []
    if len(abilities) > 0:
        result += cross_product_tuple(abilities, [tuple(reps)])

    if comp:
        # Add compliment verb as representative
        reps.append(comp.lemma_)
        # Update base: base + aux-comp (optional) + compliment
        aux_comp = get_token_dep(comp, dep='aux')
        if aux_comp:
            base = cross_product_str(base, aux_comp.text)
        base = cross_product_str(base, comp.text)

        temp = ability_relative_verb(comp, base=base)
        if len(temp) > 0:
            result += cross_product_tuple(temp, [tuple(reps)])
        else:
            # `int_rate_comp` was never defined in this function (NameError).
            # Intransitive rate of the complement; unknown verbs are assumed transitive (0).
            int_rate_comp = map_verb_intrans.get(comp.text) or map_verb_intrans.get(comp.lemma_) or 0
            if int_rate_comp > 0.5 and comp.lemma_.lower() not in ['be', 'do', 'have']:
                # EXPECTED PATTERN: base + aux-comp (optional) + compliment
                result += cross_product_tuple(base, [tuple(reps)])

    # Labeling VERB
    result = cross_product_flatten_append('VERB', result)
    return result

In [263]:
def get_raw_abilities(doc, ant_first_pron='the user'):
    """
    Collect the raw abilities of every sentence in a parsed document.

    Parameters:
    doc: The parsed document. It is iterated token by token and must expose ``sents``.
    ant_first_pron (str): Replacement subject for first-person pronouns. Only the
        currently disabled pronoun-resolution step uses it.

    Returns:
    dict: Maps the sentence index (position in ``doc.sents``) to the list of unique
          tuples produced by ``cross_product_flatten(subjects, abilities)``.
          Duplicates are removed while keeping first-seen order, so the output is
          reproducible between runs.
    """
    # Define local variable.
    storage = {}
    # NOTE: the two pronoun lists are only referenced by the disabled
    #       pronoun-resolution step further down.
    first_person_pronouns = ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    pronouns = [
        "he", "she", "they", "it",  # Personal Pronouns (Subjective)
        "him", "her", "them", "it", "you",  # Personal Pronouns (Objective)
        "his", "hers", "theirs", "its", "mine", "yours", "ours",  # Possessive Pronouns
        "her", "their", "its",  # Possessive Adjectives
        # Lower-cased: the lookup below compares against token.text.lower()
        "himself", "herself", "themself", "themselves", "itself",  # Reflexive Pronouns
        "this", "that", "these", "those",  # Demonstrative Pronouns
        "who", "whom", "whose", "which", "that"  # Relative Pronouns
    ]

    # Get sentence mapper, prepare storage, and type of sentence
    sentence_points = {}
    type_sentence = {}
    for i, s in enumerate(doc.sents):
        sentence_points[i] = (s.start, s.end)
        storage[i] = []
        # A sentence with a subject is 'perfect'; anything else is 'imperfect'.
        type_sentence[i] = 'perfect' if is_perfect_sentence(s) else 'imperfect'

    # Get mapper pronoun and antecedents (consumed by the disabled step below)
    mapper_pron_ant = get_mapper_pron_ant(doc)

    for idx, token in enumerate(doc):
        abilities = []

        ## ==================== SUBJECT ACTIVE SENTENCE =========================== ##
        if token.dep_ == 'nsubj':
            abilities += subject_active_rules(token, subject=token)

        ## ==================== SUBJECT PASSIVE SENTENCE =========================== ##
        if token.dep_ == 'nsubjpass':
            abilities += subject_passive_rules(token)

        ## ==================== IF SENTENCE IS IMPERFECT ========================== ##
        sentence_location = get_sentence_location(sentence_points, idx)
        if (type_sentence[sentence_location] == 'imperfect') and token.pos_ in ['NOUN', 'PROPN', 'PRON']:
            abilities += imperfect_sentence_rules(token)

        # Nothing found for this token.
        if len(abilities) == 0:
            continue

        # Subject handling
        subject = token.lemma_

        # TODO: pronoun resolution is disabled for now.
        # If the subject is pronouns and first person pronouns
        # if token.pos_ == 'PRON' and token.text.lower() in first_person_pronouns:
        #     subject = ant_first_pron
        # # If subject is pronouns and its token location in mapper_pron_ant
        # elif token.pos_ == 'PRON' and idx in mapper_pron_ant.keys():
        #     # Get the antecedent index location
        #     idx_map = mapper_pron_ant[idx]
        #     # Change current token subject
        #     token = doc[idx_map]
        #     subject = token.lemma_
        # # If the current child is pronoun (but not in mapper_pron_ant keys)
        # elif token.pos_ == 'PRON' and token.text.lower() in pronouns:
        #     continue
        # # If token only contains special characters or numbers, or length text less than 3 (NOT PRONOUNS)
        # elif (re.match(r'^[0-9\W]+$', token.text)) or (len(token.text) < 3):
        #     continue

        # Get all conj subject + current subject
        subjects = [subject] + extract_conj(token)
        # The sentence location computed above is still valid: neither the token
        # nor its index has changed.
        storage[sentence_location] += cross_product_flatten(subjects, abilities)

    # Make storage unique only. dict.fromkeys keeps the first occurrence of every
    # entry in order, whereas list(set(...)) returned them in arbitrary order.
    for key, value in storage.items():
        storage[key] = list(dict.fromkeys(value))
    return storage

In [356]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import os
import json
from pprint import pprint

import gensim
from gensim import corpora
from gensim.models import TfidfModel

import spacy

In [357]:
# Module-level set of opinion words from Bing Liu's opinion lexicon.
# It starts empty and is extended by every call to load_opinion_words().
bing_liu_opinion_words = set()  # Filled by load_opinion_words()

# Function to load opinion words from Bing Liu lexicon
def load_opinion_words(filepath):
    """
    Add the words of one lexicon file to the global ``bing_liu_opinion_words`` set.

    Parameters:
    filepath (str): Path of the lexicon file. It is read with ``pd.read_table``
                    (no header row, ';' starts a comment) and the first column
                    is taken as the word list.

    Returns:
    None. The global set is rebound to the union of its previous content and
    the newly loaded words.
    """
    global bing_liu_opinion_words
    lexicon = pd.read_table(filepath, comment=';', header=None)
    loaded_words = set(lexicon[0].to_list())
    bing_liu_opinion_words = bing_liu_opinion_words | loaded_words


# Load the negative and the positive word lists into bing_liu_opinion_words.
# NOTE: the paths are resolved against the current working directory, so the
#       notebook has to be started from the directory that contains 'util/'.
current_dir = os.getcwd()
load_opinion_words(os.path.join(current_dir, 'util/opinion-lexicon-English/negative-words.txt'))
load_opinion_words(os.path.join(current_dir, 'util/opinion-lexicon-English/positive-words.txt'))

In [358]:
# Load the intransitive rate of verbs.
# 'verb_transitivity.tsv' is a tab-separated file read relative to the working
# directory; only its 'verb' and 'percent_intrans' columns are used.

data_verb = pd.read_csv('verb_transitivity.tsv', sep='\t')

# Lookup table: verb -> percent_intrans. The ability_* helpers compare this value
# against 0.5 to decide whether a verb counts as intransitive.
map_verb_intrans = data_verb[['verb', 'percent_intrans']].set_index('verb').to_dict()['percent_intrans']

In [359]:
# Load the spaCy pipeline used for all parsing in this notebook.
# The 'en_core_web_lg' model has to be installed in the kernel's environment.
nlp = spacy.load('en_core_web_lg')

In [360]:
# Load data

positive_reviews = [
    "The staff were incredibly helpful and patient, helping me find the perfect phone!",
    "I had a great experience purchasing my phone here, the process was smooth and quick.",
    "Their selection of phones is amazing, and the prices are very competitive!",
    "I appreciate how the staff walked me through setting up my new device.",
    "Great customer service, I left with the phone I wanted and all my questions answered.",
    "They offer amazing deals on phones, I couldn’t resist upgrading.",
    "The technician fixed my phone’s issue faster than I expected. Highly recommend!",
    "Fantastic experience, the staff really know their stuff!",
    "I found the perfect phone case here, and the variety was impressive.",
    "Upgrading my phone was a breeze thanks to their professional service.",
    "Staff was knowledgeable and made sure I knew everything about my new phone.",
    "Prices were reasonable and the staff very courteous!",
    "Very happy with my purchase, the staff really went the extra mile.",
    "Excellent service! They helped me find exactly what I was looking for.",
    "Great deals on accessories, and the staff was super friendly!",
    "I love this store! Always a smooth experience buying or fixing my phone.",
    "I got a really good trade-in deal on my old phone.",
    "Their repair services are quick and reliable.",
    "The staff was extremely helpful in setting up my phone and transferring all my data.",
    "Very professional and friendly service, I’m super satisfied!",
    "Great variety of phones, and the staff was very patient with my questions.",
    "The process was super simple, and I’m thrilled with my new phone.",
    "They helped me choose a phone within my budget, which I really appreciated.",
    "My phone was fixed in less than 30 minutes, such fast service!",
    "I’m a loyal customer because their customer service is always outstanding.",
    "Best phone store in town, hands down!",
    "The staff made sure I was completely comfortable with my purchase.",
    "I found exactly what I needed, and they helped me get a great deal.",
    "This store has a fantastic warranty service!",
    "The staff was very informative, I learned a lot about phone features I didn’t know about.",
    "Excellent store for buying phone accessories, so much variety!",
    "The phone I bought here is working perfectly, couldn’t be happier.",
    "They were super quick in setting up my phone, I was out of there in no time.",
    "Always come here for upgrades, they never disappoint!",
    "The store layout is easy to navigate and staff are always ready to help.",
    "Best pricing for phone plans, they helped me save a lot!",
    "I’ve been to many phone stores, but this one by far provides the best service.",
    "Customer service here is top-notch, they always resolve my issues quickly.",
    "I always recommend this store to friends and family, they never fail to impress.",
    "The staff took the time to show me all my options, no pressure sales.",
    "Amazing place to buy the latest phones at great prices!",
    "Their warranty plan is worth every penny, such a relief!",
    "I appreciate how they were able to fix my phone on the same day.",
    "Got a great deal on my new phone and an awesome case as well!",
    "The staff was very accommodating when I had questions about phone features.",
    "I had a great experience with their trade-in program.",
    "Service was quick and efficient, I was in and out within 15 minutes!",
    "They even helped me transfer all my contacts and data without extra charge.",
    "My phone has been working flawlessly since I bought it from here.",
    "They fixed my screen perfectly and even gave me a discount on the repair.",
    "This is my go-to store for any phone issues, always reliable.",
    "They offer fantastic promotions and discounts!",
    "Great phone selection and even better customer service.",
    "They resolved my issue very quickly and professionally.",
    "I love how organized the store is and how fast they attend to customers.",
    "Highly recommend this store if you’re looking for good deals on phones!",
    "I always leave this store feeling like I made the right purchase.",
    "I received excellent advice from the sales team, they really know their products.",
    "Very happy with the repair service here, my phone looks brand new!"
]


negative_reviews = [
    "I had to wait over an hour to be helped, and the staff wasn’t apologetic at all.",
    "Bought a phone here that stopped working within a week, very disappointing.",
    "Their prices are too high, and the selection is limited.",
    "Customer service is poor, no one seemed interested in helping me.",
    "I had a terrible experience, the phone they sold me was defective.",
    "The staff was rude and unhelpful, I’m never coming back.",
    "They charged me extra for services I didn’t need, felt like a scam.",
    "Phone repairs took way too long, I had to come back multiple times.",
    "I bought a phone, but they didn’t inform me of all the hidden fees.",
    "Staff seemed untrained and gave me incorrect information about the phone plan.",
    "Their warranty is useless, they refused to fix my phone under it.",
    "I had to return a faulty phone twice before they finally gave me a refund.",
    "Very disorganized, I waited forever just to get a simple issue resolved.",
    "The phone I purchased here was overpriced compared to other stores.",
    "They refused to honor the promotion I came in for, very misleading.",
    "I felt pressured to buy accessories I didn’t need.",
    "The repair was done poorly, and my phone broke again within a week.",
    "Customer service was extremely slow, they need to hire more staff.",
    "They didn’t even check if my phone was working after the repair.",
    "Terrible experience, my phone still has the same issue after getting it 'fixed'.",
    "They upsold me on a phone plan I didn’t need, very deceptive.",
    "The staff was unprofessional and seemed like they didn’t want to be there.",
    "Their return policy is awful, I couldn’t exchange my phone despite its defects.",
    "They didn’t apply the discount I was promised.",
    "The store was messy and understaffed.",
    "My phone broke down just after the warranty expired, very frustrating.",
    "They kept trying to sell me more expensive phones when I clearly stated my budget.",
    "The repair job was incomplete, and they refused to refund me.",
    "Their customer service representatives were extremely rude on the phone.",
    "I had to call multiple times just to get a response, very unprofessional.",
    "They didn’t explain anything clearly and rushed me through the purchase.",
    "I regret buying from here, their post-purchase support is non-existent.",
    "Phone stopped working just outside the return window, terrible quality.",
    "The store was chaotic, with long lines and unhelpful staff.",
    "They didn’t even have the phone I wanted in stock after promising me it was available.",
    "Terrible follow-up, they lost my repair order, and I had to start over.",
    "I felt overcharged for a simple screen repair.",
    "Bought a refurbished phone that had several issues they didn’t disclose.",
    "The technician damaged my phone during the repair, and they didn’t take responsibility.",
    "I’m extremely disappointed, will not be coming back here again."
]


# One row per review: the positive reviews first, followed by the negative ones.
df = pd.DataFrame({'review': positive_reviews + negative_reviews})
# NOTE(review): df.info() prints its summary itself and returns None, which is why
#               a stray "None" line shows up in the cell output.
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  99 non-null     object
dtypes: object(1)
memory usage: 924.0+ bytes
None


Unnamed: 0,review
0,"The staff were incredibly helpful and patient,..."
1,I had a great experience purchasing my phone h...
2,"Their selection of phones is amazing, and the ..."
3,I appreciate how the staff walked me through s...
4,"Great customer service, I left with the phone ..."


In [361]:
# TFIDF
from nltk.corpus import stopwords

def get_words(corpus, thres_tfidf=75, thres_idf=25):
    """
    Select vocabulary from a corpus using IDF and TF-IDF percentile cut-offs.

    Parameters:
    corpus (iterable of str): The documents to analyse.
    thres_tfidf (float): Percentile (0-100) of all TF-IDF scores; a word is kept
        for a document when its score in that document is at or above it.
    thres_idf (float): Percentile (0-100) of all IDF values; a word is kept
        corpus-wide when its IDF is at or below it.

    Returns:
    tuple: (set of words selected by IDF,
            dict mapping document position -> set of words selected by TF-IDF).
    """
    english_stop_words = set(stopwords.words('english'))

    def preprocessing(text):
        """Clean the text and return its lowercase lemmas, minus noise tokens."""
        cleaned = remove_non_ascii(expand_contractions(remove_extra_spaces(text)))
        lemmas = (token.lemma_.lower() for token in nlp(cleaned))
        # Drop tokens made only of digits/non-word characters, tokens shorter
        # than three characters, and stop words.
        return [
            lemma for lemma in lemmas
            if not (re.match(r'^[0-9\W]+$', lemma)
                    or len(lemma) < 3
                    or lemma in english_stop_words)
        ]

    ##========= GENERATE MODEL =========##
    texts = [preprocessing(document) for document in corpus]
    dictionary = corpora.Dictionary(texts)
    corpus_bow = [dictionary.doc2bow(text) for text in texts]
    tfidf_model = gensim.models.TfidfModel(corpus_bow)

    ##============ EXTRACT IMPORTANT VALUES =========##
    # tfidf_model.idfs maps word id -> idf value.
    idf_values = tfidf_model.idfs
    scores_idf = np.array(list(idf_values.values()))
    idf_by_word = {dictionary[word_id]: idf for word_id, idf in idf_values.items()}

    # Per-document word -> tf-idf score, keyed by document position.
    tfidf_by_doc = {
        doc_idx: {dictionary[word_id]: score for word_id, score in doc}
        for doc_idx, doc in enumerate(tfidf_model[corpus_bow])
    }
    scores_tfidf = [
        score
        for weights in tfidf_by_doc.values()
        for score in weights.values()
    ]

    ##=========== Get the threshold =========##
    threshold_idf = np.percentile(scores_idf, thres_idf)
    threshold_tfidf = np.percentile(scores_tfidf, thres_tfidf)

    ##========== Get Words =============##
    storage_idf = {word for word, idf in idf_by_word.items() if idf <= threshold_idf}
    storage_tfidf = {
        doc_idx: {word for word, score in weights.items() if score >= threshold_tfidf}
        for doc_idx, weights in tfidf_by_doc.items()
    }

    return storage_idf, storage_tfidf

In [362]:
def weighted_filter(data, id, mapper_idf=None, mapper_tfidf=None):
    """
    Keep only the candidates whose words appear in the important-word mappers.

    A candidate is kept when (a) it has no noun/proper-noun lemma, or at least
    one of them is in the mapper, AND (b) at least one of its adjective/verb
    lemmas is in the mapper.

    Parameters:
    data (dict): Maps a key to an iterable of candidates. Each candidate is
        joined with spaces before analysis, so its elements are assumed to be
        strings -- TODO confirm against the caller.
    id: Key of the current document inside mapper_tfidf.
    mapper_idf (set): Corpus-wide important words. Optional.
    mapper_tfidf (dict): Per-document important words, indexed by id. Optional.

    Returns:
    dict: Same keys as data, each mapped to the list of kept candidates.
        When both mappers are empty or None, data itself is returned unchanged.
    """
    # Nothing to filter against: hand the input back untouched.
    if not mapper_idf and not mapper_tfidf:
        return data

    # Either mapper may be missing on its own; build the union from whichever
    # was supplied instead of failing on `None | set` or `None[id]`.
    mapper = set(mapper_idf or ())
    if mapper_tfidf:
        mapper |= set(mapper_tfidf[id])

    # Loaded only once filtering is actually going to happen.
    stop_words = set(stopwords.words('english'))

    def preprocessing(text):
        """Return (noun/propn lemmas, adjective/verb lemmas) found in text."""
        text = remove_extra_spaces(text)
        text = expand_contractions(text)
        text = remove_non_ascii(text)

        result_obj = []
        result_verb_adj = []
        for token in nlp(text):
            t = token.lemma_.lower()
            # Skip tokens made only of digits/non-word characters, tokens
            # shorter than three characters, and stop words.
            if re.match(r'^[0-9\W]+$', t) or len(t) < 3 or t in stop_words:
                continue
            if token.pos_ in ['NOUN', 'PROPN']:
                result_obj.append(t)
            elif token.pos_ in ['ADJ', 'VERB']:
                result_verb_adj.append(t)

        return result_obj, result_verb_adj

    filtered = {}
    for idx, element in data.items():
        kept = []
        for d in element:
            compare_obj, compare_verb_adj = preprocessing(' '.join(d))

            # Object check: passes trivially when there is no noun at all.
            is_object_pass = not compare_obj or any(w in mapper for w in compare_obj)
            # Verb/adjective check: needs at least one important verb or adjective.
            is_verb_adj_pass = any(w in mapper for w in compare_verb_adj)

            # Both checks must pass for the candidate to be kept.
            if is_object_pass and is_verb_adj_pass:
                kept.append(d)
        filtered[idx] = kept

    return filtered



In [363]:
def weighted_filter_aspect(data, id, mapper_idf=None, mapper_tfidf=None):
    """
    Keep the aspect strings that contain at least one important word.

    Parameters:
    data (iterable of str): Candidate aspect strings.
    id: Key of the current document inside mapper_tfidf.
    mapper_idf (set): Corpus-wide important words. Optional.
    mapper_tfidf (dict): Per-document important words, indexed by id. Optional.

    Returns:
    list: The aspects with at least one lowercased token in the combined
        mapper. When both mappers are empty or None, data is returned unchanged.
    """
    if not mapper_idf and not mapper_tfidf:
        return data

    # Either mapper may be missing on its own; build the union from whichever
    # was supplied instead of failing on `None | set` or `None[id]`.
    mapper = set(mapper_idf or ())
    if mapper_tfidf:
        mapper |= set(mapper_tfidf[id])

    kept = []
    for d in data:
        # One matching token is enough to keep the whole aspect.
        if any(token.text.lower() in mapper for token in nlp(d)):
            kept.append(d)
    return kept

In [364]:
def important_words_aspect(dict_doc, idx_doc, mapper_1=None, mapper_2=None):
    """
    Filter the aspect strings of a document by the important-word mappers.

    Parameters:
    dict_doc (dict): Maps a key to a list of aspect strings (possibly multi-word).
    idx_doc: Key of the current document inside mapper_2.
    mapper_1 (set): Corpus-wide important words. Optional.
    mapper_2 (dict): Per-document important words, indexed by idx_doc. Optional.

    Returns:
    dict: A shallow copy of dict_doc in which every list keeps only the strings
        that have at least one whitespace-separated word in either mapper.
        When both mappers are empty or None, dict_doc itself is returned.
    """
    # If mapper_1 and mapper_2 are both missing, do not filter.
    if not mapper_1 and not mapper_2:
        return dict_doc

    # Either mapper may be missing on its own; fall back to an empty set so the
    # membership tests below never run against None.
    corpus_words = mapper_1 if mapper_1 else set()
    doc_words = mapper_2[idx_doc] if mapper_2 else set()

    # Work on a copy so the caller's dictionary is left untouched.
    dictionary = dict_doc.copy()
    for key, value in dict_doc.items():
        # An aspect may be several words; keep the full string as soon as one
        # of its words is important.
        dictionary[key] = [
            v1 for v1 in value
            if any(v in corpus_words or v in doc_words for v in v1.split())
        ]

    return dictionary

In [365]:
# Helper Function for preprocessing data

# Function to check if a specific word exists in text
def word_exists(word, text):
    """
    Tell whether word occurs in text as a whole word, ignoring case.

    Parameters:
    word (str): The word to look for; it is matched literally.
    text (str): The text to search.

    Returns:
    bool: True when the word is found between word boundaries.
    """
    whole_word = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
    return whole_word.search(text) is not None

def flatten_data(x):
    """
    Concatenate every value of a dictionary into one flat list.

    Parameters:
    x (dict): Maps keys to iterables of items.

    Returns:
    list: All items, in dictionary order.
    """
    flat = []
    for sublist in x.values():
        flat.extend(sublist)
    return flat

def contraction(x):
    """
    Render the extracted items of one document as short sentences.

    Parameters:
    x (dict): Maps keys to lists of items whose first two elements are strings.

    Returns:
    str: "first second" for every item, joined by '. ' and ended with '.',
        or an empty string when there are no items.
    """
    items = flatten_data(x)
    if not items:
        return ''

    sentences = [' '.join((item[0], item[1])) for item in items]
    return '. '.join(sentences) + '.'

In [366]:
# Apply extraction

def process_ability(x):
    """
    Clean one review and extract its raw abilities.

    Parameters:
    x (str): The review text.

    Returns:
    The value returned by get_raw_abilities for the parsed, cleaned text.
    """
    # Prepare sentence: each step must consume the previous step's output.
    # Feeding `x` to every step would discard all but the last cleaning step.
    texts = remove_extra_spaces(x)
    texts = expand_contractions(texts)
    texts = remove_non_ascii(texts)

    # Get aspect
    doc = nlp(texts)
    # NOTE(review): the pronoun/antecedent mapper is computed but not used
    # here; the call is kept in case the helper has side effects -- confirm
    # and remove if it does not.
    mapper_pron_ant = get_mapper_pron_ant(doc)

    return get_raw_abilities(doc)

# Run the ability extraction on every review and keep the result per row.
df['ability'] = df['review'].apply(process_ability)
df['ability']

0     {0: [('staff', 'were helpful', ('helpful',), '...
1     {0: [('process', 'was smooth', ('smooth',), 'A...
2     {0: [('selection', 'is amazing', ('amazing',),...
3     {0: [('I', 'appreciate walked me', ('appreciat...
4     {0: [('I', 'left with phone', ('leave',), 'VER...
5     {0: [('I', 'could not resist upgrading', ('res...
6     {0: [('I', 'expected', ('expect',), 'VERB'), (...
7     {0: [('experience', 'really know stuff', ('kno...
8     {0: [('variety', 'was impressive', ('impressiv...
9                                               {0: []}
10    {0: [('staff', 'was knowledgeable', ('knowledg...
11    {0: [('price', 'were reasonable', ('reasonable...
12    {0: [('staff', 'really went mile', ('go',), 'V...
13    {0: [('service', 'be excellent', ('excellent',...
14    {0: [('deal', 'was friendly', ('friendly',), '...
15    {0: [('I', 'love store', ('love',), 'VERB')], ...
16    {0: [('I', 'got good deal on old phone', ('get...
17    {0: [('service', 'are quick', ('quick',), 

In [367]:
# Turn each row's extracted abilities into one short-sentence string; the
# resulting array is the corpus passed to get_words below.
corpus = df['ability'].apply(contraction).values

corpus

array(['staff were helpful. staff were patient.',
       'process was smooth. process was quick.',
       'selection is amazing. price are competitive.',
       'I appreciate walked me. I appreciate walked through setting new device.',
       'I left with phone. question answered.',
       'I could not resist upgrading. I could not resist offer amazing deals on phones.',
       'I expected. technician fixed phones issue. technician fixed faster.',
       'experience really know stuff.',
       'variety was impressive. I found perfect phone case.', '',
       'staff was knowledgeable.', 'price were reasonable.',
       'staff really went mile.',
       'service be excellent. they helped find.',
       'deal was friendly. staff was friendly.',
       'I love store. buying be experience. fixing be experience.',
       'I got good deal on old phone.',
       'service are quick. service are reliable.',
       'staff was helpful in setting phone.', 'I m satisfied.',
       'staff was patient

In [368]:
documents = corpus

# mapper_1: words selected corpus-wide by IDF.
# mapper_2: per-document words selected by TF-IDF, keyed by document position.
mapper_1, mapper_2 = get_words(documents)

In [369]:
# Inspect the corpus-wide (IDF-selected) word set.
mapper_1

{'always',
 'amazing',
 'appreciate',
 'break',
 'buy',
 'charge',
 'could',
 'deal',
 'discount',
 'even',
 'excellent',
 'experience',
 'extra',
 'fantastic',
 'fast',
 'feel',
 'find',
 'fix',
 'get',
 'give',
 'good',
 'great',
 'help',
 'helpful',
 'issue',
 'know',
 'leave',
 'long',
 'look',
 'lot',
 'love',
 'make',
 'need',
 'new',
 'offer',
 'one',
 'patient',
 'phone',
 'plan',
 'price',
 'process',
 'promotion',
 'purchase',
 'question',
 'quick',
 'quickly',
 'really',
 'refund',
 'refuse',
 'reliable',
 'repair',
 'resolve',
 'return',
 'rude',
 'seem',
 'selection',
 'service',
 'set',
 'staff',
 'store',
 'take',
 'technician',
 'time',
 'unhelpful',
 'variety',
 'wait',
 'warranty',
 'within',
 'work'}

In [370]:
# Inspect the per-document (TF-IDF-selected) word sets.
mapper_2

{0: set(),
 1: {'process'},
 2: set(),
 3: {'walk'},
 4: {'answer'},
 5: {'resist'},
 6: {'technician'},
 7: set(),
 8: set(),
 9: set(),
 10: {'knowledgeable'},
 11: {'reasonable'},
 12: {'mile'},
 13: set(),
 14: {'friendly'},
 15: {'experience'},
 16: {'old'},
 17: {'service'},
 18: {'helpful', 'set'},
 19: {'satisfied'},
 20: {'patient', 'question'},
 21: {'simple', 'thrilled'},
 22: {'choose'},
 23: {'minute'},
 24: {'outstanding'},
 25: {'hand'},
 26: {'comfortable'},
 27: set(),
 28: {'fantastic'},
 29: {'informative', 'learn'},
 30: {'excellent'},
 31: set(),
 32: {'quick', 'set'},
 33: set(),
 34: set(),
 35: {'save'},
 36: set(),
 37: {'notch'},
 38: {'fail', 'recommend'},
 39: {'take', 'time'},
 40: {'place'},
 41: {'worth'},
 42: {'able'},
 43: set(),
 44: {'accommodate'},
 45: {'program'},
 46: {'efficient', 'service'},
 47: {'transfer'},
 48: {'buy'},
 49: set(),
 50: {'reliable'},
 51: {'offer'},
 52: {'great', 'selection'},
 53: {'resolve'},
 54: {'organize'},
 55: {'lo

**Important Sentence Extraction**

In [371]:
# NOTE: FILTER FOR SENTENCES ==> CHANGE WITH LSI SUMMARIZATION.

# df['ability_filtered'] = [weighted_filter(data, id=id, mapper_idf=mapper_1, mapper_tfidf=mapper_2) for id, data in enumerate(df['ability'])]

**Aspect Extraction**

In [372]:
# def find_matched_patterns(input_string, patterns):
#     # Create a regex pattern by joining the list of patterns using the '|' (OR) operator
#     regex_pattern = '|'.join(map(re.escape, patterns))
    
#     # Use re.findall to find all matches in the input string
#     matches = re.findall(regex_pattern, input_string)
    
#     return matches

# # Example usage
# input_string = "This is an apple, melon dragon, and orange"
# patterns = ['apple', 'an apple', 'melon dragon', 'melon', 'dragon', 'manggoo', 'pink']

# matched_patterns = find_matched_patterns(input_string, patterns)
# print(matched_patterns)


In [376]:
# Aspect extraction rules

def find_matched_patterns(input_string, patterns):
    """
    Find every literal occurrence of the given patterns in a string.

    Parameters:
    input_string (str): The text to search.
    patterns (iterable of str): Strings to look for; each is matched literally.
        At a given position the first listed pattern that matches wins.

    Returns:
    list: The matched substrings in order of appearance. Empty when no
        non-empty pattern is given.
    """
    # Empty patterns are dropped: an empty alternative matches at every
    # position and would fill the result with empty strings.
    literal_patterns = [re.escape(pattern) for pattern in patterns if pattern]
    if not literal_patterns:
        return []

    # Join the escaped patterns with the '|' (OR) operator and collect all matches.
    return re.findall('|'.join(literal_patterns), input_string)

def process_aspect_rules(data, id):
    """
    Group a document's extracted statements by aspect using the rule pipeline.

    Parameters:
    data (dict): Maps keys to lists of items shaped like
        (subject, statement, keywords, label), where label is one of
        'ADJ', 'VERB' or 'OTHER'.
    id: Position of the document, used to pick its entry in mapper_2.

    Returns:
    dict: Maps each aspect to {'ADJ': [...], 'VERB': [...], 'OTHER': [...]},
        where every list holds (keywords, sentence) pairs. Statements whose
        subject is a pronoun are skipped.

    NOTE(review): this notebook defines process_aspect_rules in two cells;
    whichever cell runs last is the definition in effect.
    """
    subject_pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they']

    # Data is converted into flatten
    data = flatten_data(data)
    # Get text and subject of data
    if len(data) > 0:
        text, subjects, keywords, labels = zip(*[(item[0] + ' ' + item[1] + '.', item[0], item[2], item[3]) for item in data])
    else:
        text, subjects, keywords, labels = ('', '', '', '')

    # Parse all sentences of the document as one text.
    doc = nlp('. '.join(text))

    # Extract from rules ==> Weighted pattern extraction
    result = get_raw_aspects(doc)
    result = prunning_aspect(result, doc)
    # The keys of the pruned result are the candidate aspects.
    aspects = list(result.keys())
    # Keep only candidates containing an important word for this document.
    aspects = weighted_filter_aspect(aspects, id, mapper_idf=mapper_1, mapper_tfidf=mapper_2)

    storage = {}
    for subject, t, k, l in zip(subjects, text, keywords, labels):
        s = subject.lower()

        # If subject is pron continue
        if s in subject_pronouns:
            continue

        # Aspects mentioned in this sentence, minus those that merely contain
        # the subject word; the subject itself is always an aspect.
        matched = []
        if len(aspects) > 0:
            matched += find_matched_patterns(t, aspects)
            matched = [a for a in matched if not word_exists(s, a)]
        matched.append(s)

        # Accumulate per aspect. Assigning a fresh dict on every sentence would
        # drop earlier sentences of the same aspect, and sharing one dict
        # between aspects would alias their lists. dict.fromkeys removes
        # repeated matches while keeping their order.
        for a in dict.fromkeys(matched):
            bucket = storage.setdefault(a, {'ADJ': [], 'VERB': [], 'OTHER': []})
            bucket[l].append((k, t))

    return storage

# Rule-based aspect extraction for every row; the enumerate position is the
# document id used to look up that row's entry in mapper_2.
aspects = [process_aspect_rules(data, id) for id, data in enumerate(df['ability'].values)]

print(len(aspects))
aspects

99


[{'staff': ['staff were helpful.', 'staff were patient.']},
 {'process': ['process was smooth.', 'process was quick.']},
 {'selection': ['selection is amazing.'], 'price': ['price are competitive.']},
 {},
 {'question': ['question answered.']},
 {},
 {'phones issue': ['technician fixed phones issue.'],
  'technician': ['technician fixed phones issue.',
   'technician fixed faster.']},
 {'experience': ['experience really know stuff.']},
 {'variety': ['variety was impressive.']},
 {},
 {'staff': ['staff was knowledgeable.']},
 {'price': ['price were reasonable.']},
 {'staff': ['staff really went mile.']},
 {'service': ['service be excellent.']},
 {'deal': ['deal was friendly.'], 'staff': ['staff was friendly.']},
 {'buying': ['buying be experience.'], 'fixing': ['fixing be experience.']},
 {},
 {'service': ['service are quick.', 'service are reliable.']},
 {'phone': ['staff was helpful in setting phone.'],
  'staff': ['staff was helpful in setting phone.']},
 {},
 {'staff': ['staff was p

In [377]:
df['aspects'] = aspects

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   review   99 non-null     object
 1   ability  99 non-null     object
 2   aspects  99 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB
None


Unnamed: 0,review,ability,aspects
0,"The staff were incredibly helpful and patient,...","{0: [('staff', 'were helpful', ('helpful',), '...","{'staff': ['staff were helpful.', 'staff were ..."
1,I had a great experience purchasing my phone h...,"{0: [('process', 'was smooth', ('smooth',), 'A...","{'process': ['process was smooth.', 'process w..."
2,"Their selection of phones is amazing, and the ...","{0: [('selection', 'is amazing', ('amazing',),...","{'selection': ['selection is amazing.'], 'pric..."
3,I appreciate how the staff walked me through s...,"{0: [('I', 'appreciate walked me', ('appreciat...",{}
4,"Great customer service, I left with the phone ...","{0: [('I', 'left with phone', ('leave',), 'VER...",{'question': ['question answered.']}


In [375]:
# Custom handling aspect

# Rows for which aspect extraction produced an empty result.
fail = df.loc[df['aspects'].map(len) == 0]
print(len(fail))
fail

41


Unnamed: 0,review,ability,aspects
3,I appreciate how the staff walked me through s...,"{0: [('I', 'appreciate walked me', ('appreciat...",{}
5,"They offer amazing deals on phones, I couldn’t...","{0: [('I', 'could not resist upgrading', ('res...",{}
9,Upgrading my phone was a breeze thanks to thei...,{0: []},{}
16,I got a really good trade-in deal on my old ph...,"{0: [('I', 'got good deal on old phone', ('get...",{}
19,"Very professional and friendly service, I’m su...","{0: [('I', 'm satisfied', ('m',), 'VERB')]}",{}
22,They helped me choose a phone within my budget...,"{0: [('they', 'helped choose within budget', (...",{}
27,"I found exactly what I needed, and they helped...","{0: [('I', 'found needed what', ('find', 'need...",{}
31,"The phone I bought here is working perfectly, ...",{0: []},{}
32,"They were super quick in setting up my phone, ...","{0: [('they', 'were quick in setting phone', (...",{}
33,"Always come here for upgrades, they never disa...",{0: []},{}


In [None]:
# Aspect extraction rules

def find_matched_patterns(input_string, patterns):
    """
    Find every literal occurrence of the given patterns in a string.

    Parameters:
    input_string (str): The text to search.
    patterns (iterable of str): Strings to look for; each is matched literally.
        At a given position the first listed pattern that matches wins.

    Returns:
    list: The matched substrings in order of appearance. Empty when no
        non-empty pattern is given.

    NOTE(review): this notebook defines find_matched_patterns in two cells;
    whichever cell runs last is the definition in effect.
    """
    # Empty patterns are dropped: an empty alternative matches at every
    # position and would fill the result with empty strings.
    literal_patterns = [re.escape(pattern) for pattern in patterns if pattern]
    if not literal_patterns:
        return []

    # Join the escaped patterns with the '|' (OR) operator and collect all matches.
    return re.findall('|'.join(literal_patterns), input_string)

def process_aspect_rules(data, id):
    """
    Group a document's extracted statements by aspect using the rule pipeline.

    Parameters:
    data (dict): Maps keys to lists of items shaped like
        (subject, statement, keywords, label), where label is one of
        'ADJ', 'VERB' or 'OTHER'.
    id: Position of the document, used to pick its entry in mapper_2.

    Returns:
    dict: Maps each aspect to {'ADJ': [...], 'VERB': [...], 'OTHER': [...]},
        where every list holds (keywords, sentence) pairs. Statements whose
        subject is a pronoun are skipped.

    NOTE(review): this notebook defines process_aspect_rules in two cells;
    whichever cell runs last is the definition in effect.
    """
    subject_pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they']

    # Data is converted into flatten
    data = flatten_data(data)
    # Get text and subject of data
    if len(data) > 0:
        text, subjects, keywords, labels = zip(*[(item[0] + ' ' + item[1] + '.', item[0], item[2], item[3]) for item in data])
    else:
        text, subjects, keywords, labels = ('', '', '', '')

    # Parse all sentences of the document as one text.
    doc = nlp('. '.join(text))

    # Extract from rules ==> Weighted pattern extraction
    result = get_raw_aspects(doc)
    result = prunning_aspect(result, doc)
    # The keys of the pruned result are the candidate aspects.
    aspects = list(result.keys())
    # Keep only candidates containing an important word for this document.
    aspects = weighted_filter_aspect(aspects, id, mapper_idf=mapper_1, mapper_tfidf=mapper_2)

    storage = {}
    for subject, t, k, l in zip(subjects, text, keywords, labels):
        s = subject.lower()

        # If subject is pron continue
        if s in subject_pronouns:
            continue

        # Aspects mentioned in this sentence, minus those that merely contain
        # the subject word; the subject itself is always an aspect.
        matched = []
        if len(aspects) > 0:
            matched += find_matched_patterns(t, aspects)
            matched = [a for a in matched if not word_exists(s, a)]
        matched.append(s)

        # Accumulate per aspect. Assigning a fresh dict on every sentence would
        # drop earlier sentences of the same aspect, and sharing one dict
        # between aspects would alias their lists. dict.fromkeys removes
        # repeated matches while keeping their order.
        for a in dict.fromkeys(matched):
            bucket = storage.setdefault(a, {'ADJ': [], 'VERB': [], 'OTHER': []})
            bucket[l].append((k, t))

    return storage

# Rule-based aspect extraction for every row; the enumerate position is the
# document id used to look up that row's entry in mapper_2.
aspects = [process_aspect_rules(data, id) for id, data in enumerate(df['ability'].values)]

print(len(aspects))
aspects

In [354]:
# Derived from manual pattern analysis. TODO: replace with proper coreference resolution.

# SET IT AS EMPTY STRING BY DEFAULT
# def process_aspect_custom(text):
#     return []
    
def process_aspect_custom(data):
    """
    Fallback aspect assignment for statements whose subject is a pronoun.

    Subjects 'i' and 'this' are filed under the aspect 'staff'; 'they' and
    'you' under 'store'. Every other subject is ignored.

    Parameters:
    data (dict): Maps keys to lists of items shaped like
        (subject, statement, keywords, label), where label is one of
        'ADJ', 'VERB' or 'OTHER'.

    Returns:
    dict: Maps 'staff' and/or 'store' to
        {'ADJ': [...], 'VERB': [...], 'OTHER': [...]}, where every list holds
        (keywords, sentence) pairs.
    """
    pronoun_to_aspect = {
        'i': 'staff',
        'this': 'staff',
        'they': 'store',
        'you': 'store',
    }

    storage = {}
    # Data is converted into flatten
    for item in flatten_data(data):
        s = item[0].lower()
        t = item[0] + ' ' + item[1] + '.'
        k, l = item[2], item[3]

        aspect = pronoun_to_aspect.get(s)
        if aspect is None:
            continue

        # Accumulate per aspect. Assigning a fresh dict on every sentence
        # would drop earlier sentences filed under the same aspect.
        bucket = storage.setdefault(aspect, {'ADJ': [], 'VERB': [], 'OTHER': []})
        bucket[l].append((k, t))

    return storage

def process_aspect(data, id):
    """
    Extract aspects with the rule pipeline, falling back to the custom mapping.

    Parameters:
    data (dict): The extracted items of one document.
    id: Position of the document, forwarded to process_aspect_rules.

    Returns:
    dict: The rule-based result when it is non-empty, otherwise the result of
        process_aspect_custom.
    """
    rule_based = process_aspect_rules(data, id)
    return rule_based if len(rule_based) != 0 else process_aspect_custom(data)



# aspects = [process_aspect(text, id) for id, text in enumerate(corpus)]
# Aspect extraction with fallback for every row; the enumerate position is
# the document id passed through to the rule-based step.
aspects = [process_aspect(data, id) for id, data in enumerate(df['ability'].values)]
df['aspects'] = aspects
df.head()

[('staff', 'were helpful', ('helpful',), 'ADJ'), ('staff', 'were patient', ('patient',), 'ADJ')]
[('process', 'was smooth', ('smooth',), 'ADJ'), ('process', 'was quick', ('quick',), 'ADJ')]
[('selection', 'is amazing', ('amazing',), 'ADJ'), ('price', 'are competitive', ('competitive',), 'ADJ')]
[('I', 'appreciate walked me', ('appreciate', 'walk'), 'VERB'), ('I', 'appreciate walked through setting new device', ('appreciate', 'walk'), 'VERB')]
[('I', 'left with phone', ('leave',), 'VERB'), ('question', 'answered', ('answer',), 'VERB')]
[('I', 'could not resist upgrading', ('resist',), 'VERB'), ('I', 'could not resist offer amazing deals on phones', ('resist', 'offer'), 'VERB')]
[('I', 'expected', ('expect',), 'VERB'), ('technician', 'fixed phones issue', ('fix',), 'VERB'), ('technician', 'fixed faster', ('fix',), 'VERB')]
[('experience', 'really know stuff', ('know',), 'VERB')]
[('variety', 'was impressive', ('impressive',), 'ADJ'), ('I', 'found perfect phone case', ('find',), 'VERB')]


Unnamed: 0,review,ability,aspects
0,"The staff were incredibly helpful and patient,...","{0: [('staff', 'were helpful', ('helpful',), '...","{'staff': ['staff were helpful.', 'staff were ..."
1,I had a great experience purchasing my phone h...,"{0: [('process', 'was smooth', ('smooth',), 'A...","{'process': ['process was smooth.', 'process w..."
2,"Their selection of phones is amazing, and the ...","{0: [('selection', 'is amazing', ('amazing',),...","{'selection': ['selection is amazing.'], 'pric..."
3,I appreciate how the staff walked me through s...,"{0: [('I', 'appreciate walked me', ('appreciat...","{'staff': {'ADJ': [], 'VERB': [(('appreciate',..."
4,"Great customer service, I left with the phone ...","{0: [('I', 'left with phone', ('leave',), 'VER...",{'question': ['question answered.']}


In [355]:
# Rows for which aspect extraction still produced an empty result.
fail = df.loc[df['aspects'].map(len) == 0]
print(len(fail))
fail

8


Unnamed: 0,review,ability,aspects
9,Upgrading my phone was a breeze thanks to thei...,{0: []},{}
31,"The phone I bought here is working perfectly, ...",{0: []},{}
33,"Always come here for upgrades, they never disa...",{0: []},{}
43,Got a great deal on my new phone and an awesom...,{0: []},{}
60,Bought a phone here that stopped working withi...,{0: []},{}
72,The phone I purchased here was overpriced comp...,{0: []},{}
96,Bought a refurbished phone that had several is...,{0: []},{}
98,"I’m extremely disappointed, will not be coming...",{0: []},{}


In [286]:
# Spot-check the extracted abilities of the third row.
df.iloc[2]['ability']

{0: [('selection', 'is amazing', ('amazing',), 'ADJ'),
  ('price', 'are competitive', ('competitive',), 'ADJ')]}

**Create the Metadata Pipeline for Aspect-Based Sentences**

In [287]:
# Generate code

def pipeline_meta_data(input_):
    """
    Build the per-document aspect metadata.

    Parameters:
    input_ (iterable): The extracted items of every document, in order.

    Returns:
    dict: Maps each document position to a dict of aspect -> data as returned
        by process_aspect (an empty dict when nothing was extracted).
    """
    result = {}
    for doc_id, abilities in enumerate(input_):
        found = process_aspect(abilities, doc_id)
        # Each aspect key occurs once in `found`, so copying the pairs into a
        # fresh dict is all that is needed per document.
        if len(found) > 0:
            result[doc_id] = {aspect: payload for aspect, payload in found.items()}
        else:
            result[doc_id] = {}

    return result

# Aspect metadata for every row, keyed by row position.
meta_data = pipeline_meta_data(df['ability'].values)
meta_data

{0: {'staff': ['staff were helpful', 'staff were patient']},
 1: {'process': ['process was smooth', 'process was quick']},
 2: {'selection': ['selection is amazing'],
  'price': ['price are competitive']},
 3: {'new device': ['I appreciate walked through setting new device']},
 4: {'question': ['question answered']},
 5: {'staff': ['I could not resist upgrading',
   'I could not resist offer amazing deals on phones']},
 6: {'phones issue': ['technician fixed phones issue.'],
  'technician': ['technician fixed phones issue', 'technician fixed faster']},
 7: {'experience': ['experience really know stuff']},
 8: {'phone case': ['I found perfect phone case'],
  'variety': ['variety was impressive']},
 9: {},
 10: {'staff': ['staff was knowledgeable']},
 11: {'price': ['price were reasonable']},
 12: {'staff': ['staff really went mile']},
 13: {'service': ['service be excellent']},
 14: {'deal': ['deal was friendly'], 'staff': ['staff was friendly']},
 15: {'store': ['I love store.'],
  'ex

In [288]:
# Keep only the rows with at least one extracted aspect. reset_index() moves
# the old row labels into a new 'index' column.
# NOTE(review): this overwrites df, so re-running the cell is not idempotent.
df = df[df.apply(lambda x: len(x['aspects']) != 0, axis=1)].copy().reset_index()

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   index    91 non-null     int64 
 1   review   91 non-null     object
 2   ability  91 non-null     object
 3   aspects  91 non-null     object
dtypes: int64(1), object(3)
memory usage: 3.0+ KB
None


Unnamed: 0,index,review,ability,aspects
0,0,"The staff were incredibly helpful and patient,...","{0: [('staff', 'were helpful', ('helpful',), '...","{'staff': ['staff were helpful', 'staff were p..."
1,1,I had a great experience purchasing my phone h...,"{0: [('process', 'was smooth', ('smooth',), 'A...","{'process': ['process was smooth', 'process wa..."
2,2,"Their selection of phones is amazing, and the ...","{0: [('selection', 'is amazing', ('amazing',),...","{'selection': ['selection is amazing'], 'price..."
3,3,I appreciate how the staff walked me through s...,"{0: [('I', 'appreciate walked me', ('appreciat...",{'new device': ['I appreciate walked through s...
4,4,"Great customer service, I left with the phone ...","{0: [('I', 'left with phone', ('leave',), 'VER...",{'question': ['question answered']}


In [289]:
# # Save dictionary to a JSON file
# with open('data-1.json', 'w') as json_file:
#     json.dump(meta_data, json_file, indent=4)  # 'indent=4' makes the JSON pretty-printed

In [290]:
# df.to_excel("example.xlsx", index=False)
# df.to_csv("example.csv", index=False)