In [301]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy

import re
import os

In [247]:
# Load the `en_core_web_lg` spaCy pipeline once; later cells call `nlp(...)` to parse text.
nlp = spacy.load('en_core_web_lg')

In [248]:
# Load the review sample and keep only the columns used in this notebook.
# NOTE(review): the garbled characters in the preview below suggest 'latin1'
# may not be the file's real encoding - confirm against the source file.
df = pd.read_csv('./sample-mcd.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']]

# The rating column holds text whose first space-separated field is the
# number; keep that number as an int.
df['rating'] = df['rating'].apply(lambda x: int(x.split(" ")[0]))

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.3+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4
2,3,5 days ago,Made a mobile order got to the speaker and che...,1
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1


In [249]:
# Opinion words of Bing Liu's lexicon; filled by load_opinion_words().
bing_liu_opinion_words = set()


def load_opinion_words(filepath, encoding=None):
    """
    Add the words of one lexicon file to ``bing_liu_opinion_words``.

    The file is read as one entry per line; lines starting with ';' are
    treated as comments and blank lines are skipped.

    Parameters:
    filepath (str): Path to the lexicon file.
    encoding (str): Optional text encoding forwarded to pandas
        (None keeps pandas' default).

    Returns:
    set: The words read from this file.
    """
    # dtype=str + keep_default_na=False: with pandas' defaults, entries such as
    # "null", "nan" or "NA" are parsed as missing values and a float NaN ends
    # up in the set instead of the word.
    words = pd.read_table(filepath, comment=';', header=None, dtype=str,
                          keep_default_na=False, encoding=encoding)[0]
    loaded = set(words)
    loaded.discard('')

    # Update in place (instead of rebinding the global) so every existing
    # reference to the set sees the new words.
    bing_liu_opinion_words.update(loaded)
    return loaded


# Load both polarity lists of the lexicon, resolved against the working directory
current_dir = os.getcwd()
for lexicon_file in (
    'util/opinion-lexicon-English/negative-words.txt',
    'util/opinion-lexicon-English/positive-words.txt',
):
    load_opinion_words(os.path.join(current_dir, lexicon_file))

In [250]:
# Contraction

from contractions import CONTRACTION_MAP

def expand_contractions(sentence, contraction_mapping=None):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Matching ignores case. The first character of the matched text is kept,
    so "Don't" becomes "Do not".

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary for mapping contractions.
        CONTRACTION_MAP is used when omitted.

    Returns:
    str: The expanded contraction sentence.
    """
    if contraction_mapping is None:
        contraction_mapping = CONTRACTION_MAP

    # Longest keys first so that e.g. "can't've" wins over its prefix "can't".
    # Empty keys are dropped: they would match the empty string everywhere.
    keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
    if not keys:
        return sentence

    # Keys are escaped so they are always matched literally.
    contractions_pattern = re.compile('({})'.format('|'.join(map(re.escape, keys))),
                                      flags=re.IGNORECASE)

    # The pattern ignores case, so a match such as "i'm" must still resolve
    # when only "I'm" is a key.
    lowered = {}
    for key, value in contraction_mapping.items():
        lowered.setdefault(key.lower(), value)

    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.

        Parameters:
        contraction (re.Match): The match object of one contraction.

        Returns:
        str: The expanded contraction, or the matched text unchanged when
            no non-empty expansion is found.
        """
        match = contraction.group(0)
        expansion = (contraction_mapping.get(match)
                     or contraction_mapping.get(match.lower())
                     or lowered.get(match.lower()))
        if not expansion:
            return match
        # Keep the casing of the first character as typed.
        return match[0] + expansion[1:]

    return contractions_pattern.sub(expanded_match, sentence)
    

In [323]:
# Pick a review from the data set ...
sample = df['review'].iloc[4]
# ... or one of the hand-written alternatives (kept commented out for quick switching).
# sample = 'It would McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. The staff and manager are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other not fast food places.'
# sample = 'The staff and manager are no friendly, accommodating and no smiling.'
# NOTE(review): this assignment overrides the review selected from `df` above.
sample = 'I had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.'
# For this sample the printed text below is identical to the input
# ("dont" and "Im" stay as they are).
sample = expand_contractions(sample)
print(sample)

I had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.


In [324]:
# NOTE(review): remove_extra_spaces is only defined in later cells of this
# file, so on a fresh top-to-bottom run the name does not exist yet here.
sample = remove_extra_spaces(sample)
# The previous cell already expanded contractions; the step is repeated here
# after whitespace normalisation.
sample = expand_contractions(sample)
doc = nlp(sample)

# Print the sentences of the parsed sample, numbered from 1.
for idx, sent in enumerate(doc.sents):
    print(f'{idx + 1}. {sent}')

1. I had a normal transaction, everyone was chill and polite, but now i dont want to eat this.
2. Im trying not to think about what this milky white/clear substance is all over my food, i d
3. *** sure am not coming back.


In [325]:
from spacy import displacy

# Draw the dependency parse of the current `doc`.
displacy.render(doc, style='dep')

In [280]:
# Table of per-token annotations of `doc`: column name -> token attribute
token_columns = {
    'text': 'text',
    'pos': 'pos_',
    'tag': 'tag_',
    'dep': 'dep_',
}

temp = {column: [] for column in token_columns}
for token in doc:
    for column, attribute in token_columns.items():
        temp[column].append(getattr(token, attribute))

pd.DataFrame.from_dict(temp)

Unnamed: 0,text,pos,tag,dep
0,It,PRON,PRP,nsubj
1,would,AUX,MD,ROOT
2,McDonalds,PROPN,NNP,attr
3,.,PUNCT,.,punct
4,It,PRON,PRP,nsubj
5,is,AUX,VBZ,ROOT
6,what,PRON,WP,attr
7,it,PRON,PRP,nsubj
8,is,AUX,VBZ,ccomp
9,as,ADV,RB,advmod


In [307]:
##========== PREPARATION TEXT ===========##
def remove_extra_spaces(sentence):
    """
    Collapse every run of whitespace into one space and trim both ends.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with single spaces only and no leading/trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', sentence)
    return collapsed.strip()

##=========== EXTRACT ASPECT ============##
def cross_product(first, second):
    """
    Pair every item of ``first`` with every item of ``second``.

    parameters
    -----------
    first: list/string
    second: list/string
        A bare string counts as a one-element list.

    return: list of strings "<first item> <second item>", ordered by the
        items of ``first``, then by the items of ``second``
    """
    # isinstance (not ``type(x) == str``) so that str subclasses are wrapped
    # as well instead of being iterated character by character.
    if isinstance(first, str):
        first = [first]
    if isinstance(second, str):
        second = [second]
    return [i + ' ' + j for i in first for j in second]

def get_neglect(token):
    """
    Return the negation word attached to ``token``.

    A child counts as a negation when its dependency label is 'neg', or when
    it is a determiner ('det') whose text is "no" in any casing.

    Returns:
    str: Text of the first such child, or '' when there is none.
    """
    negations = (
        child.text
        for child in token.children
        if child.dep_ == 'neg' or (child.dep_ == 'det' and child.text.lower() == 'no')
    )
    return next(negations, '')
    
def get_token_conj(token):
    """
    Return the first child of ``token`` whose dependency label is 'conj'.

    Returns:
    The child token, or None when ``token`` has no 'conj' child.
    """
    conjuncts = (child for child in token.children if child.dep_ == 'conj')
    return next(conjuncts, None)

def get_token_cc(token):
    """
    Return the first child of ``token`` whose dependency label is 'cc'
    (coordinating conjunction).

    Returns:
    The child token, or None when ``token`` has no 'cc' child.
    """
    coordinators = (child for child in token.children if child.dep_ == 'cc')
    return next(coordinators, None)

def extract_conj(token, neglect=False, lemma=False, all_token=False):
    """
    Walk the chain of 'conj' children that starts below ``token``.

    At every step the first 'conj' child (see get_token_conj) of the current
    token is taken, until a token without one is reached. ``token`` itself is
    not part of the result.

    Parameters:
    token: The token whose conjuncts are collected.
    neglect (bool): Prefix every conjunct with its own negation word, if any
        (see get_neglect).
    lemma (bool): Use the conjunct's lemma instead of its surface text.
    all_token (bool): Return token objects instead of strings, with each
        'cc' token placed before the conjunct it introduces. ``neglect`` and
        ``lemma`` are ignored in this mode.

    Returns:
    list: Tokens when ``all_token`` is True, otherwise strings.
    """
    result = []
    current = get_token_conj(token)
    coor = get_token_cc(token)
    while current is not None:
        if all_token:
            # The 'cc' found on the previous token precedes this conjunct.
            if coor is not None:
                result.append(coor)
            result.append(current)
            coor = get_token_cc(current)
        else:
            # `lemma` is applied on its own; it used to be honoured only in
            # combination with `neglect`.
            text = current.lemma_ if lemma else current.text
            if neglect:
                text = (get_neglect(current) + ' ' + text).strip()
            result.append(text)
        current = get_token_conj(current)

    return result

def get_text_conj(token):
    """
    Render ``token`` and its chain of conjuncts as one string.

    The chain is ``token`` followed by extract_conj(token, all_token=True).
    Every element except the last is followed by a separator: ' ' after a
    'cc' token, ', ' after anything else.

    Returns:
    str: The joined text.
    """
    chain = [token] + extract_conj(token, all_token=True)
    last = len(chain) - 1

    pieces = []
    for position, member in enumerate(chain):
        pieces.append(member.text)
        if position < last:
            pieces.append(' ' if member.dep_ == 'cc' else ', ')

    return ''.join(pieces)


# # Get the sentence point mapper
# def get_sentence_mapper():
#     sentence_point = {}
#     for i, s in enumerate(doc.sents):
#         sentence_point[i] = (s.start, s.end)
#     return sentence_point
    
# # Get location sentence
# sentence_mapper = get_sentence_mapper(doc)

def get_sentence_location(mapper, position):
    """
    Find the sentence whose interval contains ``position``.

    Parameters:
    mapper (dict): sentence key -> (start, end) interval, end exclusive.
    position (int): The position to look up.

    Returns:
    The key of the first matching interval, or None when nothing matches.
    """
    for sentence_id, bounds in mapper.items():
        if bounds[0] <= position < bounds[1]:
            return sentence_id
    return None
            
def get_raw_aspect(doc, opinion_words=None):
    """
    Collect candidate aspect spans from a parsed document.

    Rules, applied per token:
    - noun directly preceded by an adjective: "<adj> <noun>" when the
      adjective is not an opinion word, otherwise the noun alone; no other
      rule is applied to that noun;
    - noun directly preceded by a noun: "<noun> <noun>";
    - noun that is a direct object ('dobj') or subject ('nsubj');
    - noun that is a 'conj' of another noun;
    - 'advmod' token whose head is a noun: "<head> <token>".

    Parameters:
    doc: Indexable sequence of tokens (a spaCy Doc in this notebook).
    opinion_words: Container of opinion words; ``bing_liu_opinion_words`` is
        used when omitted.

    Returns:
    list: Unique (text, start, end) tuples of token indices, sorted by start
        and then by text.
    """
    if opinion_words is None:
        opinion_words = bing_liu_opinion_words

    found = set()
    for idx, token in enumerate(doc):
        is_noun = token.pos_ == 'NOUN'

        if is_noun and idx != 0:
            previous = doc[idx - 1]
            if previous.pos_ == 'ADJ':
                # Look the adjective up in lower case so that a capitalised
                # one (e.g. sentence-initial "Great") is still recognised.
                # Assumes the lexicon stores lower-case entries.
                if previous.text.lower() not in opinion_words:
                    found.add((previous.text + ' ' + token.text, idx - 1, idx + 1))
                else:
                    found.add((token.text, idx, idx + 1))
                continue
            if previous.pos_ == 'NOUN':
                found.add((previous.text + ' ' + token.text, idx - 1, idx + 1))

        if is_noun and token.dep_ in ('dobj', 'nsubj'):
            found.add((token.text, idx, idx + 1))

        if is_noun and token.dep_ == 'conj' and token.head.pos_ == 'NOUN':
            found.add((token.text, idx, idx + 1))

        # NOTE(review): text and span are built as head-then-modifier, which
        # presumes the adverb follows its head noun.
        if token.dep_ == 'advmod' and token.head.pos_ == 'NOUN':
            found.add((token.head.text + ' ' + token.text, token.head.i, idx + 1))

    return sorted(found, key=lambda x: (x[1], x[0]))

def prunning_aspect(list_, document=None):
    """
    Merge overlapping/adjacent raw aspects and group them by sentence.

    Each item is compared with its successor in ``list_`` (expected sorted by
    start, as returned by get_raw_aspect):
    - the successor starts on the item's last token: the successor's text
      minus its first word is appended to the item;
    - the successor starts right after the item: both texts are joined.
    In both cases the merged span replaces the successor and the item is
    dropped, so chains of spans collapse into one.

    Parameters:
    list_ (list): (text, start, end) tuples of token indices, end exclusive.
        The list is not modified.
    document: Parsed document providing ``sents``; the notebook-level ``doc``
        is used when omitted.

    Returns:
    dict: sentence index -> list of aspect texts, in order of appearance.
        Every sentence of the document has an entry, possibly empty.

    Raises:
    KeyError: If an aspect starts outside every sentence.
    """
    if document is None:
        document = doc

    sentence_points = {}
    storage = {}
    for i, s in enumerate(document.sents):
        sentence_points[i] = (s.start, s.end)
        storage[i] = []

    # Work on a copy so the caller's list is left untouched.
    items = list(list_)
    dropped = set()
    for idx in range(len(items) - 1):
        text, start, end = items[idx]
        next_text, next_start, next_end = items[idx + 1]

        if end - 1 == next_start:
            # Overlap of one token: skip the successor's first word. Joining
            # a list avoids a trailing space when nothing is left to append.
            merged = ' '.join([text] + next_text.split()[1:])
        elif end == next_start:
            merged = text + ' ' + next_text
        else:
            continue

        items[idx + 1] = (merged, start, next_end)
        dropped.add(idx)

    # The sentence lookup is done inline: a later cell of this notebook
    # redefines get_sentence_location with a different signature.
    for idx, (text, start, _end) in enumerate(items):
        if idx in dropped:
            continue
        for sentence_id, (first_token, end_token) in sentence_points.items():
            if first_token <= start < end_token:
                storage[sentence_id].append(text)
                break
        else:
            raise KeyError('aspect {!r} starts at token {}, outside every sentence'.format(text, start))

    return storage
    
# Run both steps on the current `doc`: collect raw aspects, then merge them and group them by sentence.
prunning_aspect(get_raw_aspect(doc))

{0: [],
 1: ['food', 'atmosphere'],
 2: ['staff here', 'difference'],
 3: [],
 4: ['experience', 'food places']}

In [281]:
# EXTRACT ASPECT
# Inline walk over `doc` that collects candidate aspect spans as
# (text, start, end) tuples of token indices, end exclusive.
storage = []

# Left over from a disabled experiment; nothing in this cell reads them.
adv_adj_mod = []
is_contain_subject_verb = False

for idx, token in enumerate(doc):
    # Noun directly preceded by an adjective
    if idx != 0 and (token.pos_ == 'NOUN' and doc[idx - 1].pos_ == 'ADJ'):
        # Look the adjective up in lower case so that a capitalised one
        # (e.g. sentence-initial "Great") is still recognised as an opinion
        # word. Assumes the lexicon stores lower-case entries.
        if doc[idx - 1].text.lower() not in bing_liu_opinion_words:
            # Not an opinion word: keep "adjective noun" as the aspect
            text = doc[idx - 1].text + ' ' + token.text
            storage.append((text, idx - 1, idx + 1))
        else:
            # Opinion word: keep the noun only
            text = token.text
            storage.append((text, idx, idx + 1))

    # Noun directly preceded by another noun
    if idx != 0 and (token.pos_ == 'NOUN' and doc[idx - 1].pos_ == 'NOUN'):
        text = doc[idx - 1].text + ' ' + token.text
        storage.append((text, idx - 1, idx + 1))

    # Noun that is a direct object
    if token.pos_ == 'NOUN' and token.dep_ == 'dobj':
        text = token.text
        storage.append((text, idx, idx + 1))

    # Noun that is the subject of a sentence
    if token.pos_ == 'NOUN' and token.dep_ == 'nsubj':
        text = token.text
        storage.append((text, idx, idx + 1))

    # Noun that is a conjunct of another noun
    if (token.pos_ == 'NOUN' and token.dep_ == 'conj') and (token.head.pos_ == 'NOUN'):
        text = token.text
        storage.append((text, idx, idx + 1))

    # Remember whether any subject is attached to a verb
    if token.dep_ == 'nsubj' and token.head.pos_ == 'VERB':
        is_contain_subject_verb = True

    # Adverbial modifier whose head is a noun: "<head> <modifier>"
    if (token.dep_ == 'advmod' and token.head.pos_ == 'NOUN'):
        text = token.head.text + ' ' + token.text
        storage.append((text, token.head.i, idx + 1))

# Remove duplicates, then order by start index and text
storage = sorted(set(storage), key=lambda x: (x[1], x[0]))

storage

[('food', 13, 14),
 ('atmosphere', 15, 16),
 ('staff', 19, 20),
 ('staff here', 19, 21),
 ('difference', 24, 25),
 ('experience', 41, 42),
 ('food', 46, 47),
 ('food places', 46, 48)]

In [282]:
# PRUNING
# Merge each span into its successor when the two overlap by one token or are
# directly adjacent; the merged span replaces the successor and the current
# index is marked for removal.
drop_idx = []
for idx, item in enumerate(storage):
    print(idx + 1, item)
    if idx != len(storage) - 1:
        next_item = storage[idx + 1]
        # Successor starts on the last token of the current span
        if item[-1] - 1 == next_item[1]:
            # Append the successor's text without its (shared) first word
            append_text = ' '.join(next_item[0].split()[1:])
            new_start = item[1]
            new_end = next_item[-1]
            # strip(): a one-word successor leaves nothing to append, which
            # used to produce a trailing space.
            new_text = (item[0] + ' ' + append_text).strip()
            drop_idx.append(idx)
            storage[idx + 1] = (new_text, new_start, new_end)

        # Successor starts right after the current span
        if item[-1] == next_item[1]:
            new_start = item[1]
            new_end = next_item[-1]
            new_text = item[0] + ' ' + next_item[0]
            drop_idx.append(idx)
            storage[idx + 1] = (new_text, new_start, new_end)

storage = [storage[i] for i in range(len(storage)) if i not in drop_idx]
storage

1 ('food', 13, 14)
2 ('atmosphere', 15, 16)
3 ('staff', 19, 20)
4 ('staff here', 19, 21)
5 ('difference', 24, 25)
6 ('experience', 41, 42)
7 ('food', 46, 47)
8 ('food places', 46, 48)


[('food', 13, 14),
 ('atmosphere', 15, 16),
 ('staff here', 19, 21),
 ('difference', 24, 25),
 ('experience', 41, 42),
 ('food places', 46, 48)]

In [283]:
# GET THE ABILITY OF EACH CONTEXT

def cross_product(first, second):
    """
    Pair every item of ``first`` with every item of ``second``.

    parameters
    -----------
    first: list/string
    second: list/string
        A bare string counts as a one-element list.

    return: list of strings "<first item> <second item>", ordered by the
        items of ``first``, then by the items of ``second``
    """
    # isinstance (not ``type(x) == str``) so that str subclasses are wrapped
    # as well instead of being iterated character by character.
    if isinstance(first, str):
        first = [first]
    if isinstance(second, str):
        second = [second]
    return [i + ' ' + j for i in first for j in second]

def get_neglect(token):
    """
    Return the negation word attached to ``token``.

    A child counts as a negation when its dependency label is 'neg', or when
    it is a determiner ('det') whose text is "no" in any casing.

    Returns:
    str: Text of the first such child, or '' when there is none.
    """
    negations = (
        child.text
        for child in token.children
        if child.dep_ == 'neg' or (child.dep_ == 'det' and child.text.lower() == 'no')
    )
    return next(negations, '')
    
def get_token_conj(token):
    """
    Return the first child of ``token`` whose dependency label is 'conj'.

    Returns:
    The child token, or None when ``token`` has no 'conj' child.
    """
    conjuncts = (child for child in token.children if child.dep_ == 'conj')
    return next(conjuncts, None)

def get_token_cc(token):
    """
    Return the first child of ``token`` whose dependency label is 'cc'
    (coordinating conjunction).

    Returns:
    The child token, or None when ``token`` has no 'cc' child.
    """
    coordinators = (child for child in token.children if child.dep_ == 'cc')
    return next(coordinators, None)

def extract_conj(token, neglect=False, lemma=False, all_token=False):
    """
    Walk the chain of 'conj' children that starts below ``token``.

    At every step the first 'conj' child (see get_token_conj) of the current
    token is taken, until a token without one is reached. ``token`` itself is
    not part of the result.

    Parameters:
    token: The token whose conjuncts are collected.
    neglect (bool): Prefix every conjunct with its own negation word, if any
        (see get_neglect).
    lemma (bool): Use the conjunct's lemma instead of its surface text.
    all_token (bool): Return token objects instead of strings, with each
        'cc' token placed before the conjunct it introduces. ``neglect`` and
        ``lemma`` are ignored in this mode.

    Returns:
    list: Tokens when ``all_token`` is True, otherwise strings.
    """
    result = []
    current = get_token_conj(token)
    coor = get_token_cc(token)
    while current is not None:
        if all_token:
            # The 'cc' found on the previous token precedes this conjunct.
            if coor is not None:
                result.append(coor)
            result.append(current)
            coor = get_token_cc(current)
        else:
            # `lemma` is applied on its own; it used to be honoured only in
            # combination with `neglect`.
            text = current.lemma_ if lemma else current.text
            if neglect:
                text = (get_neglect(current) + ' ' + text).strip()
            result.append(text)
        current = get_token_conj(current)

    return result

def get_text_conj(token):
    """
    Render ``token`` and its chain of conjuncts as one string.

    The chain is ``token`` followed by extract_conj(token, all_token=True).
    Every element except the last is followed by a separator: ' ' after a
    'cc' token, ', ' after anything else.

    Returns:
    str: The joined text.
    """
    chain = [token] + extract_conj(token, all_token=True)
    last = len(chain) - 1

    pieces = []
    for position, member in enumerate(chain):
        pieces.append(member.text)
        if position < last:
            pieces.append(' ' if member.dep_ == 'cc' else ', ')

    return ''.join(pieces)

# For every token of `doc`, print comma-separated "<subject> <ability>" pairs.
# The ability is a verb lemma (VERB branch), the 'acomp' complement of an
# auxiliary plus its conjuncts (AUX branch), or the lemma of an adjective
# standing left of a noun (NOUN branch).
for idx, token in enumerate(doc):
    text = ''
    subjects = []
    abilities = []

    # VERB: pair every subject (and its conjuncts) with the verb lemma
    if token.pos_ == 'VERB':
        
        for t in token.children:
            # Only 'nsubj' children count as subjects
            if t.dep_ == 'nsubj':
                subjects.append(t.text)
                # Add the subjects coordinated with this one
                subjects += extract_conj(t)

        # Without any subject the cross product is empty and `text` stays ''
        text = ', '.join(cross_product(subjects, token.lemma_))


    # AUX: pair every subject with the complement(s) of the auxiliary
    if token.pos_ == 'AUX':
        
        # Negation attached directly to the auxiliary, '' if none
        neg = get_neglect(token)
        for t in token.children:
            if t.dep_ == 'nsubj':
                subjects.append(t.text)     
                # Add the subjects coordinated with this one
                subjects += extract_conj(t)

            if t.dep_ == 'acomp':
                # No negation on the auxiliary: fall back to one attached to the complement
                if not neg:
                    neg = get_neglect(t)
                # The first complement is stored with its surface text
                abilities.append(t.text)
                # Still no negation: let each conjunct carry its own negation.
                # Otherwise the negation found above is applied to every ability.
                if not neg:
                    abilities += extract_conj(t, neglect=True, lemma=True)
                else:
                    abilities += extract_conj(t, lemma=True)
                    abilities = cross_product(neg, abilities)

        if len(subjects) > 0 and len(abilities) > 0 :
            text = ', '.join(cross_product(subjects, abilities))

    # NOUN: adjectives to the left of the noun are its abilities
    if token.pos_ == 'NOUN':
        for t in token.lefts:
            if t.pos_ == 'ADJ':
                neg = get_neglect(t)
                if neg:
                    abilities.append(neg + ' ' + t.lemma_)
                else:
                    abilities.append(t.lemma_)

        # Only when an ability was found: the noun and its conjuncts become the subjects
        if len(abilities) > 0:
            subjects.append(token.text)
            subjects += extract_conj(token)
            text = ', '.join(cross_product(subjects, abilities))
        
            
    # Tokens that produced no pair print nothing
    if len(text) > 0:
        print(text)

food go, atmosphere go
staff make
They friendly, They accommodate, They smile
experience pleasant
food fast
places many, places other


In [284]:
# Example
# NOTE(review): `ex` is assigned three times, so only the last value survives,
# and none of them is used below - the parse is built from `sample`.
ex = 'The staff is not smiling'
# NOTE(review): the pieces are concatenated without a separating space, so the
# sentences run together ("...go.The staff...").
ex = 'It would McDonalds. It is what it is as far as the food and atmosphere go.' +\
     'The staff here does make a difference.' +\
     'The staff and manager are not all friendly, accommodating and always smiling.' +\
     'Makes for a more pleasant experience than many other fast food places.'
ex = 'The staff and manager are no friendly, accommodating and no smiling'

doc2 = nlp(sample)

displacy.render(doc2, 'dep')

# Print the negation word attached to each token, skipping tokens without one.
for token in doc2:
    temp = get_neglect(token)
    if temp:
        print(temp)

In [285]:
import re

def remove_extra_spaces(sentence):
    """
    Collapse every run of whitespace into one space and trim both ends.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with single spaces only and no leading/trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', sentence)
    return collapsed.strip()

# Show a string before and after whitespace normalisation, one per line
example = "I want \t sleep"
print(example, remove_extra_spaces(example), sep='\n')

I want 	 sleep
I want sleep


In [289]:
# Pronoun resolution plan:
# 1. Identify Pronoun
# 2. Locate antecedent
# 3. Check agreement: singular/plural, gender, subject/object
# 4. Replace pronouns

# Alternative example texts:
# ex = 'Lisa went to the store because she needed groceries.'
# ex = "The students took their books to class."
# ex = 'John, Jimmy and Sarah said they would help me.'
# ex = 'John loves hiking. He goes to the mountains every weekend. His favorite trail is difficult but beautiful.'
# NOTE(review): `ex` is not used below; the parse is built from `sample`.
ex = 'The students were excited about the field trip. They had been planning it for weeks. ' +\
     'Their teacher, Mrs. Brown, reminded them to bring lunches. She also told them to wear comfortable shoes.'

# NOTE(review): this rebinds the notebook-level `doc` that other cells read.
doc = nlp(sample)

displacy.render(doc, 'dep')

# pronoun token index -> antecedent token index
mapper = {}

# Both lists hold (token, token index, sentence index) tuples
antecedents = []
pron = []
# Not read anywhere in this cell
est_loc = 0

# Sentence index -> (start, end) token interval, end exclusive
sentence_point = {}
for i, s in enumerate(doc.sents):
    sentence_point[i] = (s.start, s.end)

def get_sentence_location(position):
    """
    Return the index of the sentence whose token interval contains ``position``.

    The intervals come from the notebook-level ``sentence_point`` mapping
    (sentence index -> (start, end), end exclusive). Returns None when no
    interval matches.

    NOTE(review): another cell of this notebook defines a two-argument
    function with the same name; whichever cell ran last wins.
    """
    for sentence_id, bounds in sentence_point.items():
        if bounds[0] <= position < bounds[1]:
            return sentence_id
    return None

def filter_sentence(_list, location):
    """
    Keep the entries of ``_list`` whose last element equals ``location``.

    Parameters:
    _list (list): Indexable entries; in this cell (token, index, sentence) tuples.
    location: The value the last element is compared with.

    Returns:
    list: The matching entries, in their original order.
    """
    return [entry for entry in _list if entry[-1] == location]

# Collect candidate antecedents and pronouns from `doc`. Every entry is a
# (token, token index, sentence index) tuple.
for token in doc:
    
    # Candidate antecedent: a subject ('nsubj') that is not a pronoun
    if (token.pos_ != 'PRON') and (token.dep_ == 'nsubj'):
        start = token.i
        # `end` is computed but not used
        end = start + 1
        location_sentence = get_sentence_location(start)
        antecedents.append((token, start, location_sentence))
        # Disabled: also collect the conjuncts of the subject
        # antecedents += extract_conj(token, only_token=True)

    # Disabled: objects as candidate antecedents
    # if (token.pos_ != 'PRON') and (token.dep_ == 'dobj' or token.dep_ == 'pobj'):
    #     start = token.i
    #     end = start + 1
    #     location_sentence = get_sentence_location(start)
    #     antecedents.append((token, start, location_sentence))
    #     # Check is there any conj
    #     # antecedents += extract_conj(token, only_token=True)    

    # Candidate pronouns
    # Rule 1: pronoun used as a subject
    if (token.pos_ == 'PRON') and (token.dep_ == 'nsubj'):
        # Disabled: character-offset based positions
        # start = est_loc - len(token.text)
        # end = est_loc
        # start = ex.index(token.text)
        # end = start + len(token.text)
        start = token.i
        end = start + 1
        location_sentence = get_sentence_location(start)
        pron.append((token, start, location_sentence))
        
    # Rule 2: possessive pronoun ('poss')
    if (token.pos_ == 'PRON') and (token.dep_ == 'poss'):
        # start = est_loc - len(token.text)
        # end = est_loc
        start = token.i
        end = start + 1
        location_sentence = get_sentence_location(start)
        pron.append((token, start, location_sentence))

    # Rule 3 (disabled): pronoun used as an object
    # if (token.pos_ == 'PRON') and (token.dep_ == 'dobj' or token.dep_ == 'pobj'):
    #     start = token.i
    #     end = start + 1
    #     location_sentence = get_sentence_location(start)
    #     pron.append((token, start, location_sentence))


print(pron)
print(antecedents)

# Map every pronoun to an antecedent (token index -> token index) in `mapper`.
# NOTE(review): despite the heading, no text is replaced here; `result` is set
# to None and not assigned again in this cell.
result = None
if len(pron) > 0:
    for p in pron:
        # Set once an antecedent has been found for this pronoun
        is_success = False
        token_pron, index_pron, sent_pron = p
        # Search the pronoun's own sentence first, then walk back to earlier sentences
        current_sentence = sent_pron
        while current_sentence > -1:
            # Candidate antecedents located in the sentence being searched
            filter_antecedents = filter_sentence(antecedents, current_sentence)
            if len(filter_antecedents) > 0:
                for ant in filter_antecedents:
                    token_ant, index_ant, sent_ant = ant
                    # Accept the first subject antecedent that precedes the pronoun
                    if ('subj' in token_ant.dep_) and ('subj' in token_pron.dep_ or 'poss' in token_pron.dep_) and (index_ant < index_pron):
                        mapper[index_pron] = index_ant
                        is_success = True
                        break
                    # Disabled: object pronoun <-> object antecedent
                    # if ('obj' in token_ant.dep_ and 'obj' in token_pron.dep_) and (index_ant < index_pron):
                    #     mapper[index_pron] = index_ant
                    #     is_success = True
                    #     break
            if is_success:
                break
            current_sentence -= 1
print(mapper)

[(It, 0, 0), (It, 4, 1), (it, 7, 1), (They, 26, 3)]
[(food, 13, 1), (staff, 19, 2)]
{26: 19}


In [None]:
# NOTE(review): the only assignment to `result` visible in this file is
# `result = None`, so as written this passes None to `nlp` - confirm what text
# is meant to be parsed here.
doc = nlp(result)

displacy.render(doc, 'dep')

In [None]:
def get_token_conj(token):
    """
    Return the first child of ``token`` whose dependency label is 'conj'.

    Returns:
    The child token, or None when ``token`` has no 'conj' child.
    """
    conjuncts = (child for child in token.children if child.dep_ == 'conj')
    return next(conjuncts, None)

def get_token_cc(token):
    """
    Return the first child of ``token`` whose dependency label is 'cc'
    (coordinating conjunction).

    Returns:
    The child token, or None when ``token`` has no 'cc' child.
    """
    coordinators = (child for child in token.children if child.dep_ == 'cc')
    return next(coordinators, None)

def extract_conj(token, neglect=False, lemma=False, all_token=False):
    """
    Walk the chain of 'conj' children that starts below ``token``.

    At every step the first 'conj' child (see get_token_conj) of the current
    token is taken, until a token without one is reached. ``token`` itself is
    not part of the result.

    Parameters:
    token: The token whose conjuncts are collected.
    neglect (bool): Prefix every conjunct with its own negation word, if any
        (see get_neglect).
    lemma (bool): Use the conjunct's lemma instead of its surface text.
    all_token (bool): Return token objects instead of strings, with each
        'cc' token placed before the conjunct it introduces. ``neglect`` and
        ``lemma`` are ignored in this mode.

    Returns:
    list: Tokens when ``all_token`` is True, otherwise strings.
    """
    result = []
    current = get_token_conj(token)
    coor = get_token_cc(token)
    while current is not None:
        if all_token:
            # The 'cc' found on the previous token precedes this conjunct.
            if coor is not None:
                result.append(coor)
            result.append(current)
            coor = get_token_cc(current)
        else:
            # `lemma` is applied on its own; it used to be honoured only in
            # combination with `neglect`.
            text = current.lemma_ if lemma else current.text
            if neglect:
                text = (get_neglect(current) + ' ' + text).strip()
            result.append(text)
        current = get_token_conj(current)

    return result

def get_text_conj(token):
    """
    Render ``token`` and its chain of conjuncts as one string.

    The chain is ``token`` followed by extract_conj(token, all_token=True).
    Every element except the last is followed by a separator: ' ' after a
    'cc' token, ', ' after anything else.

    Returns:
    str: The joined text.
    """
    chain = [token] + extract_conj(token, all_token=True)
    last = len(chain) - 1

    pieces = []
    for position, member in enumerate(chain):
        pieces.append(member.text)
        if position < last:
            pieces.append(' ' if member.dep_ == 'cc' else ', ')

    return ''.join(pieces)


# `ant` is the (token, token index, sentence index) tuple left over from the
# antecedent loop; get_text_conj() needs the token itself, so pass element 0.
get_text_conj(ant[0])

In [313]:
# Print the dependency label of every token of a short example sentence
example = 'It is apple'
docex = nlp(example)

for token in docex:
    print(f'{token.text} {token.dep_}')

It nsubj
is ROOT
apple attr
