In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
nlp = spacy.load("en_core_web_md")
import re

from itertools import chain

In [2]:
#from spacy import displacy
#import deplacy

In [400]:
doc_features = ['num_tokens', 'para_starts']
span_features = ['word_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'is_para_start',
                 'index_in_doc']

# getters that are not used as features
span_utilities = ['prev_unit', 'label', 'idx_start', 'idx_end']
# methods
span_methods = ['get_nth_unit', 'get_prev_unit_attr', 'get_label_and_error']
token_features =['word_emb']



extensions_dict = dict(doc_features=doc_features, span_features=span_features+span_utilities,
                       token_features=token_features, span_methods=span_methods)





def create_extensions(extensions_dict=None, force=True):
    
    # Features that take 'unit' as input refer to the segmentation, they do not work with just any span.
    
    # Property attributes
    
    # Store starting and ending indices of spans in the whole doc
    # 1 list per each document: [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
    Doc.set_extension("units_index_list", default=[],force=True)
    
    # Store essay_id within doc
    Doc.set_extension("essay_id", default=None, force=True)

    
    # Feature Getters
    def get_label_and_error(unit, error_function='percentage_correctness'):
        """
        Inputs: unit

        Outputs: label for the unit and segmentation error

        """

        def overlap_case(unit_start, unit_end, adu_start, adu_end):
            if adu_start >= unit_start and adu_end <= unit_end:
                # Case 1, ADU is fully contained in UNIT
                return 1

            elif adu_start <= unit_start and adu_end <=unit_end and adu_end>=unit_start:

                # Case 2, ADU starts before UNIT, start(Left) of ADU is cut
                return 2

            elif adu_start >= unit_start and adu_end >= unit_end and adu_start<unit_end:

                # Case 3, ADU starts after UNIT, end(Right) of ADU is cut
                return 3

            elif adu_start < unit_start and adu_end > unit_end:

                # Case 4, ADU starts before UNIT and ends after UNIT, both sides of ADU are cut
                return 4

            else: 
                # ADU does not overlap with UNIT
                return False


        def percentage_correctness(unit, adu_start, adu_end, overlap_case):

            if overlap_case==2:
                adu_start = unit._.idx_start
            if overlap_case==3:
                adu_end = unit._.idx_end

            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            pct_correct = adu_ntokens/unit_ntokens
            return pct_correct

        def extended_accuracy(unit, adu_start, adu_end, overlap_case):
            # Compares number of tokens to get the the correct ADU in proportional with UNIT length

            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            diff_ntokens = np.abs(unit_ntokens - adu_ntokens)

            return 1/((diff_ntokens+1)**(np.log2(diff_ntokens+1)/np.log2(unit_ntokens+1)))


        if error_function.lower() == 'percentage_correctness':
            err_func = percentage_correctness
        elif error_function.lower() == 'extended_accuracy':
            err_func = extended_accuracy

        unit_start = unit._.idx_start
        unit_end = unit._.idx_end

        essay_id = u.doc._.essay_id

        # DataFrame containing ADUs indices & labels, filtered for current essay_id
        adus_doc = adus[adus['essay_id'] == essay_id]



        label_and_error = [(row['ADU_type'], err_func(unit, row['start_ind'],row['end_ind'], 
                          overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind'])),
                          #(row['start_ind'], row['end_ind'])
                           ) 
                         for row_ind, row in adus_doc.iterrows() 
                         if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

    #     # Contains information of the ADUs that overlap with the UNIT
    #     # Structure: (adu_start, adu_end, overlap_case, ADU_type)
    #     overlap_adus = [(row['start_ind'],
    #                      row['end_ind'], 
    #                      overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), 
    #                      row['ADU_type']) 
    #                      for row_ind, row in adus_doc.iterrows()
    #           if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

        return label_and_error

    
    def get_label(span):
        
        # Gets ADU vs non-ADU LABEL for the span (intended only for sentences)

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        #  + len(span[-1]) to get to the end of the last word
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()
    
    def get_idx_start(unit):
        return unit[0].idx
    
    def get_idx_end(unit):
        return unit[-1].idx  + len(unit[-1])
    
    def get_label_pct(span):
        
        
        pass
    
    def get_para_starts(doc):
        # Units starting with \n or preceding \n are considered as paragraph starts
        # if start is 0, start -1 goes back to the last token of the doc

        # TODO
        # para_ends can be obtained by shifing this list to the right by one position
        return [int(doc[start].text =='\n' or doc[start-1].text=='\n') for start, end in doc._.units_index_list]
    
    def get_is_para_start(unit):
        
        para_starts = unit.doc._.para_starts
        unit_ind = unit._.index_in_doc
        
        return para_starts[unit_ind]
        
    
    def get_word_emb(obj):
        return obj.vector
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([len(re.findall(adv, span.text.lower())) for adv in conj_advs])
    
    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    

    def get_index_in_doc(span):
        """Gets index of the segmented unit in the doc"""
        span_start = span.start

        # span end not used yet
        span_end = span.end

        # finds where span_start is in units_index_list [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
        # returns the index of the corresponding span
        return np.where([span.start in range(start, end) for start, end in span.doc._.units_index_list])[0][-1]


    def get_prev_unit(span):

        return span._.get_nth_unit(span._.index_in_doc-1)
    
        
    def get_nth_unit(span, n):

        # Tuple containing the start and end index of the nth span
        span_index = span.doc._.units_index_list[n]

        # Return nth span
        return span.doc[span_index[0]: span_index[1]]



    def get_prev_unit_attr(span, attribute):

        return span._.prev_unit._.get(attribute)
    
    

    # Iterate list of features and Set Extensions (Just to not manually set extensions one by one)
    
    for feature in extensions_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for method in extensions_dict['span_methods']:
        Span.set_extension(method, force=force, method=locals()[method])


def segmentation(doc=None ,mode = 'sentence', n_grams=15):
    if mode=='paragraph':
        pass
    if mode=='sentence':
        # segment by sentences
        units = [sent for sent in doc.sents  if not (sent.text.isspace() or sent.text =='')] 
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        return units
    
    if mode =='n_grams':
        # Code to segment with 15 grams here (average)  
        units = [doc[i:i+n_grams] for i in range(len(doc))]

        doc._.units_index_list = [(unit.start, unit.end) for unit in units]

        return units
    
    if mode=='clause':
        # Code to segment by clause
        pass
    if mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]

def unit2fv(unit, feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)

# Run
create_extensions(extensions_dict)   


In [401]:
# Optional, not used yet. Trying to solve problem that title gets included with the first sentence
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)
# Not used
def text2doc(text):
    # need to use nlp.pipe here instead
    return nlp(text)

In [402]:
# Pipelinev1

def text2fv(df):
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
        
    segmented_docs = [segmentation(doc, mode='sentence') for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    X_features = span_features
    

    X = np.array([unit2fv(unit, X_features) for unit in units])
    y = np.array([int(unit._.label) for unit in units])
    
    return X,y 

In [126]:
def print_adus(units):
    for i, u in enumerate(units):
            unit_start = u._.idx_start
            unit_end = u._.idx_end
            
            essay_id = u.doc._.essay_id
            
            
            doc_adus = adus[adus['essay_id'] == essay_id]
             
            lis = [((unit_start, unit_end),(row['start_ind'], row['end_ind'], row['ADU_type'],is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind']))) for row_ind,row in doc_adus.iterrows() if is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind'])]
            
            
            
            if len(lis)>0:
                print(i, lis)
                
print_adus(units)

45 [((259, 316), (311, 373, 'Claim', 'End_Cut'))]
46 [((261, 325), (311, 373, 'Claim', 'End_Cut'))]
47 [((264, 336), (311, 373, 'Claim', 'End_Cut'))]
48 [((267, 340), (311, 373, 'Claim', 'End_Cut'))]
49 [((274, 344), (311, 373, 'Claim', 'End_Cut'))]
50 [((276, 354), (311, 373, 'Claim', 'End_Cut'))]
51 [((280, 357), (311, 373, 'Claim', 'End_Cut'))]
52 [((287, 366), (311, 373, 'Claim', 'End_Cut'))]
53 [((295, 368), (311, 373, 'Claim', 'End_Cut'))]
54 [((298, 373), (311, 373, 'Claim', 'Full'))]
55 [((302, 374), (311, 373, 'Claim', 'Full'))]
56 [((303, 379), (311, 373, 'Claim', 'Full')), ((303, 379), (375, 521, 'Premise', 'End_Cut'))]
57 [((304, 384), (311, 373, 'Claim', 'Full')), ((304, 384), (375, 521, 'Premise', 'End_Cut'))]
58 [((309, 396), (311, 373, 'Claim', 'Full')), ((309, 396), (375, 521, 'Premise', 'End_Cut'))]
59 [((311, 399), (311, 373, 'Claim', 'Full')), ((311, 399), (375, 521, 'Premise', 'End_Cut'))]
60 [((317, 408), (311, 373, 'Claim', 'Start_Cut')), ((317, 408), (375, 521, 

236 [((1232, 1299), (1095, 1239, 'Premise', 'Start_Cut')), ((1232, 1299), (1262, 1299, 'Premise', 'Full'))]
237 [((1239, 1307), (1262, 1299, 'Premise', 'Full'))]
238 [((1241, 1312), (1262, 1299, 'Premise', 'Full')), ((1241, 1312), (1308, 1390, 'Premise', 'End_Cut'))]
239 [((1245, 1317), (1262, 1299, 'Premise', 'Full')), ((1245, 1317), (1308, 1390, 'Premise', 'End_Cut'))]
240 [((1248, 1321), (1262, 1299, 'Premise', 'Full')), ((1248, 1321), (1308, 1390, 'Premise', 'End_Cut'))]
241 [((1253, 1325), (1262, 1299, 'Premise', 'Full')), ((1253, 1325), (1308, 1390, 'Premise', 'End_Cut'))]
242 [((1260, 1330), (1262, 1299, 'Premise', 'Full')), ((1260, 1330), (1308, 1390, 'Premise', 'End_Cut'))]
243 [((1262, 1338), (1262, 1299, 'Premise', 'Full')), ((1262, 1338), (1308, 1390, 'Premise', 'End_Cut'))]
244 [((1267, 1347), (1262, 1299, 'Premise', 'Start_Cut')), ((1267, 1347), (1308, 1390, 'Premise', 'End_Cut'))]
245 [((1272, 1348), (1262, 1299, 'Premise', 'Start_Cut')), ((1272, 1348), (1308, 1390, 'Pre

In [250]:
def is_adu(unit_start, unit_end, adu_start, adu_end):
    
    if adu_start<=unit_start and adu_end <=unit_start:
        # ADU comes before UNIT
        return False
    elif adu_start>=unit_end and adu_end >=unit_end:
        # ADU comes after UNIT
        return False
    else:
        if adu_start >= unit_start and adu_end <= unit_end:
            #print("Fully Contains ADU")
            return "Full"
        elif adu_start <= unit_start and adu_end <=unit_end:
            
            #print("ADU start is cut")
            return "Start_Cut"
        elif adu_start >= unit_start and adu_end >= unit_end:
            # End of ADU is after UNIT
            return "End_Cut"

        elif adu_start <= unit_start and adu_end >= unit_end:

            # UNIT is smaller than ADU, ADU start and end are cut
            return "Both_Sides_Cut"

In [356]:
# EXTENSION WORKING HERE

def get_label_and_error(unit, error_function='percentage_correctness'):
    """
    Inputs: unit
    
    Outputs: label for the unit and segmentation error
    
    """
    
    def overlap_case(unit_start, unit_end, adu_start, adu_end):
        if adu_start >= unit_start and adu_end <= unit_end:
            # Case 1, ADU is fully contained in UNIT
            return 1
        
        elif adu_start <= unit_start and adu_end <=unit_end and adu_end>=unit_start:

            # Case 2, ADU starts before UNIT, start(Left) of ADU is cut
            return 2
        
        elif adu_start >= unit_start and adu_end >= unit_end and adu_start<unit_end:
            
            # Case 3, ADU starts after UNIT, end(Right) of ADU is cut
            return 3

        elif adu_start < unit_start and adu_end > unit_end:

            # Case 4, ADU starts before UNIT and ends after UNIT, both sides of ADU are cut
            return 4
        
        else: 
            # ADU does not overlap with UNIT
            return False
        
    
    def percentage_correctness(unit, adu_start, adu_end, overlap_case):
        
        if overlap_case==2:
            adu_start = unit._.idx_start
        if overlap_case==3:
            adu_end = unit._.idx_end
        
        adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
        
        unit_ntokens = len(unit)
        adu_ntokens = len(adu)
        pct_correct = (unit_ntokens - adu_ntokens)/unit_ntokens
        
        return pct_correct
    
    def extended_accuracy(unit, adu_start, adu_end, overlap_case):
        # Compares number of tokens to get the the correct ADU in proportional with UNIT length
        
        adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
        
        unit_ntokens = len(unit)
        adu_ntokens = len(adu)
        diff_ntokens = np.abs(unit_ntokens - adu_ntokens)
        
        return 1/((diff_ntokens+1)**(np.log2(diff_ntokens+1)/np.log2(unit_ntokens+1)))
        
        
    if error_function.lower() == 'percentage_correctness':
        err_func = percentage_correctness
    elif error_function.lower() == 'extended_accuracy':
        err_func = extended_accuracy
    
    unit_start = unit._.idx_start
    unit_end = unit._.idx_end

    essay_id = u.doc._.essay_id

    # DataFrame containing ADUs indices & labels, filtered for current essay_id
    adus_doc = adus[adus['essay_id'] == essay_id]
    

    
    label_and_error = [(row['ADU_type'], err_func(unit, row['start_ind'],row['end_ind'], 
                     overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']))) 
                     for row_ind, row in adus_doc.iterrows() 
                     if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]
    
#     # Contains information of the ADUs that overlap with the UNIT
#     # Structure: (adu_start, adu_end, overlap_case, ADU_type)
#     overlap_adus = [(row['start_ind'],
#                      row['end_ind'], 
#                      overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), 
#                      row['ADU_type']) 
#                      for row_ind, row in adus_doc.iterrows()
#           if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

    return label_and_error
        
        

In [358]:
get_label_and_error(units[163], error_function='extended_accuracy')

[('Premise', 0.5), ('Premise', 0.3141427559625005)]

In [172]:
def extended_acc(d, s):

    return 1/((d+1)**(np.log2(d+1)/np.log2(s+1)))


# unit_start = units[54]._.idx_start
# unit_end = units[54]._.idx_end

adu = doc.char_span(311,373, alignment_mode='expand')
unit = units[54]







true.
First, using computer constantly has bad influence on children's eyes

(4,
 'End_Cut',
 
 First, using computer constantly has bad influence on children's eyes. When,
 When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted)

In [174]:
u123 = segmentation(doc)



[Computer has negative effects to children
 ,
 Nowadays, thanks to the development of technology, computer is now indispensable to life.,
 Some people think that computer is good for children and it should be used daily by children but some others think differently.,
 In my opinion, the latter opinion is true.,
 
 First, using computer constantly has bad influence on children's eyes.,
 When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted.,
 Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life.,
 It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences.,
 For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean gamer, who had a non-stop playing for 3 days.,
 
 F

In [173]:
def segmentation_error(unit_start, unit_end, adu_start, adu_end, mode='extended_acc'):

    def extended_acc(d, s):

        return 1/((d+1)**(np.log2(d+1)/np.log2(s+1)))

    adu = doc.char_span(adu_start, adu_end, alignment_mode = 'expand')
    unit = doc.char_span(unit_start, unit_end) 
    
    unit_ntokens = len(unit)
    adu_ntokens = len(adu)
    
    d = np.abs(unit_ntokens - adu_ntokens)
    
    if adu_start<=unit_start and adu_end <=unit_start:
        # ADU comes before UNIT
        return False
    elif adu_start>=unit_end and adu_end >=unit_end:
        # ADU comes after UNIT
        return False
    else:
        if adu_start >= unit_start and adu_end <= unit_end:
            #print("Fully Contains ADU")
            error = extended_acc(d, unit_ntokens)
            cut_type "Full"
        elif adu_start <= unit_start and adu_end <=unit_end:
            
            #print("ADU start is cut")
            return "Start_Cut"
        elif adu_start >= unit_start and adu_end >= unit_end:
            # End of ADU is Cut
            return "End_Cut"

        elif adu_start <= unit_start and adu_end >= unit_end:

            # UNIT is smaller than ADU, ADU start and end are cut
            return "Both_Sides_Cut"

    

SyntaxError: invalid syntax (Temp/ipykernel_13064/507436367.py, line 25)

In [56]:
def verbose_print(units):
    # Detailed Printer
    for i, u in enumerate(units):
            span_start = u[0].idx
            span_end = u[-1].idx  + len(u[-1])

            lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],is_adu(span_start, span_end, row['start_ind'] ,row['end_ind']))) for row_ind,row in adu24.iterrows() if is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])]
            if len(lis)>0:
                print(i)
                print(lis,"\n")
                print("UNIT:",u,"\n")
                for ind, adu in enumerate(lis):

                    #print(adu[1][2].upper()+':',adu[1][0:2])
                    #print(doc.char_span(*adu[1][0:2]), "\n")
                    label = adu[1][2].upper() 
                    adu_range = adu[1][0:2]
                    adu_status = adu[1][3]
                    print(f'ADU #{ind+1}',label+':',*adu_range, adu_status)
                    print(doc.char_span(*adu_range), "\n")
                print("-----------------\n")

In [58]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

###### TEST
in_text = essays.iloc[23].text
doc = nlp(in_text)
doc._.essay_id = essays.iloc[23]['essay_id']
adu24 = adus[adus['essay_id'] == doc._.essay_id]
units=segmentation(doc, mode='n_grams', n_grams=15)
#units=segmentation(doc, mode='sentence')

In [54]:
def get_label_error(unit):
    
        essay_id = unit.doc._.essay_id
        unit_start = unit[0].idx
        #  + len(unit[-1]) to get to the end of the last word
        unit_end = unit[-1].idx  + len(unit[-1])
        adus_iter = adus[adus['essay_id'] == essay_id ].iterrows()
        adus_list = [ (unit._.index_in_doc, is_adu(unit_start, unit_end, row['start_ind'], row['end_ind']) , doc.char_span(unit_start, unit_end), doc.char_span(row['start_ind'], row['end_ind'])) for i, row in adus_iter if is_adu(unit_start, unit_end, row['start_ind'], row['end_ind'])]
        
        return adus_list
Span.set_extension('label_error', force=True, getter=get_label_error)

In [10]:
# for i, u in enumerate(units):
#     if u._.label:
#         span_start = u[0].idx
#         span_end = u[-1].idx  + len(u[-1])
        
#         lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],row_ind)) for row_ind,row in adu24.iterrows() if span_start<= row['start_ind'] and span_end>= row['end_ind']]
#         print(i,'--------------------------')
#         print(lis,"\n")
        
#         print("UNIT:",span_start, span_end)
#         print(u,"\n")
        
#         for adu in lis:
            
#             #print(adu[1][2].upper()+':',adu[1][0:2])
#             #print(doc.char_span(*adu[1][0:2]), "\n")
#             label = adu[1][2].upper() 
#             adu_range = adu[1][0:2]
#             print(label+':',*adu_range)
#             print(doc.char_span(*adu_range), "\n")
            
            
#         #print(i ,len(lis) ,lis[0]['ADU_text'], lis[0]['ADU_type'],"\n", lis[-1]['ADU_text'],lis[-1]['ADU_type'])

In [11]:
for i, u in enumerate(units):
        span_start = u[0].idx
        span_end = u[-1].idx  + len(u[-1])
        
        lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],row_ind)) for row_ind,row in adu24.iterrows() if span_start<= row['start_ind'] and span_end>= row['end_ind']]
        
        print(i,'--------------------------')
        print(lis,"\n")
        
        print("UNIT:",span_start, span_end)
        print(u,"\n")
        
        for adu in lis:
            
            #print(adu[1][2].upper()+':',adu[1][0:2])
            #print(doc.char_span(*adu[1][0:2]), "\n")
            label = adu[1][2].upper() 
            adu_range = adu[1][0:2]
            print(label+':',*adu_range)
            print(doc.char_span(*adu_range), "\n")
            
            
        #print(i ,len(lis) ,lis[0]['ADU_text'], lis[0]['ADU_type'],"\n", lis[-1]['ADU_text'],lis[-1]['ADU_type'])

0 --------------------------
[] 

UNIT: 0 59
Computer has negative effects to children

Nowadays, thanks 

1 --------------------------
[] 

UNIT: 9 62
has negative effects to children

Nowadays, thanks to 

2 --------------------------
[] 

UNIT: 13 66
negative effects to children

Nowadays, thanks to the 

3 --------------------------
[] 

UNIT: 22 78
effects to children

Nowadays, thanks to the development 

4 --------------------------
[] 

UNIT: 30 81
to children

Nowadays, thanks to the development of 

5 --------------------------
[] 

UNIT: 33 92
children

Nowadays, thanks to the development of technology 

6 --------------------------
[] 

UNIT: 41 93


Nowadays, thanks to the development of technology, 

7 --------------------------
[] 

UNIT: 43 102
Nowadays, thanks to the development of technology, computer 

8 --------------------------
[] 

UNIT: 51 105
, thanks to the development of technology, computer is 

9 --------------------------
[] 

UNIT: 53 109
thanks to the de

144 --------------------------
[] 

UNIT: 757 828
eventually bear dangerous consequences. For instance, several teenagers 

145 --------------------------
[] 

UNIT: 768 833
bear dangerous consequences. For instance, several teenagers play 

146 --------------------------
[] 

UNIT: 773 839
dangerous consequences. For instance, several teenagers play games 

147 --------------------------
[] 

UNIT: 783 847
consequences. For instance, several teenagers play games without 

148 --------------------------
[] 

UNIT: 795 852
. For instance, several teenagers play games without rest 

149 --------------------------
[] 

UNIT: 797 853
For instance, several teenagers play games without rest, 

150 --------------------------
[] 

UNIT: 801 859
instance, several teenagers play games without rest, which 

151 --------------------------
[] 

UNIT: 809 865
, several teenagers play games without rest, which leads 

152 --------------------------
[] 

UNIT: 811 868
several teenagers play games with

287 --------------------------
[((1480, 1530), (1480, 1530, 'Claim', 338))] 

UNIT: 1480 1530
it still has its bad side, especially for children 

CLAIM: 1480 1530
it still has its bad side, especially for children 

288 --------------------------
[] 

UNIT: 1483 1531
still has its bad side, especially for children. 

289 --------------------------
[] 

UNIT: 1489 1538
has its bad side, especially for children. People 

290 --------------------------
[] 

UNIT: 1493 1545
its bad side, especially for children. People should 

291 --------------------------
[] 

UNIT: 1497 1551
bad side, especially for children. People should learn 

292 --------------------------
[] 

UNIT: 1501 1555
side, especially for children. People should learn how 

293 --------------------------
[] 

UNIT: 1505 1558
, especially for children. People should learn how to 

294 --------------------------
[] 

UNIT: 1507 1562
especially for children. People should learn how to use 

295 --------------------------
[]

In [12]:
for i, u in enumerate(units):
    if u._.label:
        span_start = u[0].idx
        span_end = u[-1].idx  + len(u[-1])
        
        lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],row_ind)) for row_ind,row in adu24.iterrows() if span_start<= row['start_ind'] and span_end>= row['end_ind']]
        
        print(i,'--------------------------')
        print(lis,"\n")

59 --------------------------
[((311, 373), (311, 373, 'Claim', 340))] 

113 --------------------------
[((611, 663), (619, 663, 'Premise', 350))] 

114 --------------------------
[((617, 664), (619, 663, 'Premise', 350))] 

115 --------------------------
[((619, 667), (619, 663, 'Premise', 350))] 

241 --------------------------
[((1253, 1299), (1262, 1299, 'Premise', 348))] 

242 --------------------------
[((1260, 1307), (1262, 1299, 'Premise', 348))] 

243 --------------------------
[((1262, 1312), (1262, 1299, 'Premise', 348))] 

287 --------------------------
[((1480, 1530), (1480, 1530, 'Claim', 338))] 

