In [230]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
import re
import benepar

nlp = spacy.load('en_core_web_md')
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
nlp_trf = spacy.load('en_core_web_trf', disable=['tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])

from itertools import chain

In [231]:
#from spacy import displacy
#import deplacy

In [249]:
doc_features = ['num_tokens', 'para_starts']
span_features = ['word_emb', 'sent_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'is_para_start',
                 'index_in_doc', 'num_claim_indicator', 'num_premise_indicator', 'has_question_mark', 'has_personal_pronoun',
                 'has_possessive_pronoun', 'has_modal_verb', 'is_first_token_gerund', 'tree_depth', 'contextual_features_prev' ,'contextual_features_next']

# getters that are not used as features
span_utilities = ['prev_unit', 'idx_start', 'idx_end', ]
# methods
span_methods = ['get_nth_unit', 'get_prev_unit_attr', 'get_label_and_error', 'get_label', 'get_possible_labels']
token_features =['word_emb']



extensions_dict = dict(doc_features=doc_features, span_features=span_features+span_utilities,
                       token_features=token_features, span_methods=span_methods)





def create_extensions(extensions_dict=None, force=True):
    
    # Features that take 'unit' as input refer to the segmentation, they do not work with just any span.
    
    # Property attributes
    
    # Store starting and ending indices of spans in the whole doc
    # 1 list per each document: [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
    Doc.set_extension("units_index_list", default=[],force=True)
    
    # Store essay_id within doc
    Doc.set_extension("essay_id", default=None, force=True)

    
    # Feature Getters
    def get_possible_labels(unit, error_function='percentage_correctness'):
        """
        Inputs: unit

        Outputs: label for the unit and segmentation error

        """

        def overlap_case(unit_start, unit_end, adu_start, adu_end):
            if adu_start >= unit_start and adu_end <= unit_end:
                # Case 1, ADU is fully contained in UNIT
                return 1

            elif adu_start <= unit_start and adu_end <=unit_end and adu_end>=unit_start:

                # Case 2, ADU starts before UNIT, start(Left) of ADU is cut
                return 2

            elif adu_start >= unit_start and adu_end >= unit_end and adu_start<unit_end:

                # Case 3, ADU starts after UNIT, end(Right) of ADU is cut
                return 3

            elif adu_start < unit_start and adu_end > unit_end:

                # Case 4, ADU starts before UNIT and ends after UNIT, both sides of ADU are cut
                return 4

            else: 
                # ADU does not overlap with UNIT
                return False
            

        def percentage_correctness(unit, adu_start, adu_end, overlap_case):

            if overlap_case==2:
                adu_start = unit._.idx_start
            elif overlap_case==3:
                adu_end = unit._.idx_end
            elif overlap_case==4:
                adu_start = unit._.idx_start
                adu_end = unit._.idx_end

            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            pct_correct = adu_ntokens/unit_ntokens
            return pct_correct

        def extended_accuracy(unit, adu_start, adu_end, overlap_case):
            # Compares number of tokens to get the the correct ADU in proportional with UNIT length

            if overlap_case==2:
                adu_start = unit._.idx_start
            if overlap_case==3:
                adu_end = unit._.idx_end
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            diff_ntokens = np.abs(unit_ntokens - adu_ntokens)

            return 1/((diff_ntokens+1)**(np.log2(diff_ntokens+1)/np.log2(unit_ntokens+1)))


        if error_function.lower() == 'percentage_correctness':
            err_func = percentage_correctness
        elif error_function.lower() == 'extended_accuracy':
            err_func = extended_accuracy
        
        unit_start = unit._.idx_start
        unit_end = unit._.idx_end

        essay_id = unit.doc._.essay_id

        # DataFrame containing ADUs indices & labels, filtered for current essay_id
        adus_doc = adus[adus['essay_id'] == essay_id]

        def segmentation_error(unit, adu_start, adu_end, overlap_case, error_function):
            
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            
            # positive value = too many tokens in segment, unit should be shorter (include less non-adu tokens)
            # negative value = too less tokens in segment, unit should be longer (include more adu tokens)
            
            left_tokens = adu.start - unit.start
            right_tokens = unit.end - adu.end
            
            if error_function.lower() == 'percentage_correctness':
                err_func = percentage_correctness
            elif error_function.lower() == 'extended_accuracy':
                err_func = extended_accuracy

            
            return (left_tokens, err_func(unit, adu_start, adu_end, overlap_case), right_tokens)
            
# v7 returns: (ADU_Type, (left_error_tokens, err_func, right_error_tokens))
        label_and_error = [(row['ADU_type'], segmentation_error(unit, row['start_ind'],row['end_ind'], 
                          overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), error_function),
                          #(row['start_ind'], row['end_ind'])
                           ) 
                         for row_ind, row in adus_doc.iterrows() 
                         if unit_start < row['end_ind'] and unit_end >= row['start_ind'] ]

            
# v6 returns: (ADU_Type, err_func)
#
#         label_and_error = [(row['ADU_type'], err_func(unit, row['start_ind'],row['end_ind'], 
#                           overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind'])),
#                           #(row['start_ind'], row['end_ind'])
#                            ) 
#                          for row_ind, row in adus_doc.iterrows() 
#                          if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

    #     # Contains information of the ADUs that overlap with the UNIT
    #     # Structure: (adu_start, adu_end, overlap_case, ADU_type)
    #     overlap_adus = [(row['start_ind'],
    #                      row['end_ind'], 
    #                      overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), 
    #                      row['ADU_type']) 
    #                      for row_ind, row in adus_doc.iterrows()
    #           if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

        return label_and_error

    
    def get_label(unit, label_mode='clpr', threshold=0):
        error_tuple = unit._.get_possible_labels()

        if len(error_tuple) == 0:
            return "Non-ADU"
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    label = error_tuple[label_position][0]
                elif label_mode=='adu':
                    label = 'ADU'
                    
            else:
                label = "Non-ADU"

            return label
        
    def get_label_and_error(unit, error_function='percentage_correctness', label_mode='clpr', threshold=0):
        error_tuple = unit._.get_possible_labels(error_function=error_function)

        if len(error_tuple) == 0:
            return ("Non-ADU", ())
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    assigned_label_and_error = (error_tuple[label_position][0], error_tuple[label_position][1])
                elif label_mode=='adu':
                    assigned_label_and_error = ('ADU', error_tuple[label_position][1])
                    
            else:
                assigned_label_and_error = ("Non-ADU", ())

            return assigned_label_and_error

    def _NOT_USED_get_label_adu(span):
        
        # Gets ADU vs non-ADU LABEL for the span (intended only for sentences)

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        #  + len(span[-1]) to get to the end of the last word
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()

    
    def get_idx_start(unit):
        return unit[0].idx
    
    def get_idx_end(unit):
        return unit[-1].idx  + len(unit[-1])
    
    
    def get_para_starts(doc):
        # Units starting with \n or preceding \n are considered as paragraph starts
        # if start is 0, start -1 goes back to the last token of the doc

        # TODO
        # para_ends can be obtained by shifing this list to the right by one position
        
        # PROBLEM! WORKS ONLY FOR SENTENCE SEGMENTATION
        
        return [int(doc[start].text =='\n' or doc[start-1].text=='\n') for start, end in doc._.units_index_list]
    
    def get_is_para_start(unit):
        
        para_starts = unit.doc._.para_starts
        unit_ind = unit._.index_in_doc
        
        return para_starts[unit_ind]
    
    def get_has_personal_pronoun(unit):
        
        return 'PRP' in [token.tag_ for token in unit]
    
    def get_has_possessive_pronoun(unit):
        
        return 'PRP$' in [token.tag_ for token in unit]     
    
    def get_has_modal_verb(unit):
        
        return 'MD' in [token.tag_ for token in unit]            
    
    def get_word_emb(obj):
        return obj.vector
    
    def get_sent_emb(unit):
        
        trf_doc = nlp_trf(unit.text)
        return trf_doc._.trf_data.tensors[1][0]
        
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([1 for adv in conj_advs if adv in span.text.lower()])
    
        
    def get_num_claim_indicator(span):
        claim_indicators = ["accordingly", "as a result", "consequently", "conclude that", "clearly", "demonstrates that", "entails", "follows that", "hence", "however", "implies", "in fact", "in my opinion", "in short", "in conclusion", "indicates that", "it follows that", "it is highly probable that", "it is my contention", "it should be clear that", "I believe", "I mean", "I think", "must be that", "on the contrary", "points to the conclusions", "proves that", "shows that", "so", "suggests that", "the most obvious explanation", "the point I’m trying to make", "therefore", "thus", "the truth of the matter", "to sum up", "we may deduce"]
        
        return sum([1 for c_indicator in claim_indicators if c_indicator in span.text.lower()])
    
    def get_num_premise_indicator(span):
        premise_indicators=["after all", "assuming that", "as", "as indicated by", "as shown", "besides", "because", "deduced", "derived from", "due to", "firstly", "follows from", "for", "for example", "for instance", "for one thing", "for the reason that", "furthermore", "given that", "in addition", "in light of", "in that", "in view of", "in view of the fact that", "indicated by", "is supported by", "may be inferred", "moreover", "owing to", "researchers found that", "secondly", "this can be seen from", "since", "since the evidence is", "what’s more", "whereas",]
        return sum([1 for p_indicator in premise_indicators if p_indicator in span.text.lower()])
    
    def get_is_first_token_gerund(span):
        
        return span[0].tag_ =='VBG'
    
    def get_has_question_mark(span):
        return '?' in span.text

    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    
    def get_tree_depth(unit):
        depths = {}

        def walk_tree(node, depth):
            depths[node.orth_] = depth
            if node.n_lefts + node.n_rights > 0:
                return [walk_tree(child, depth + 1) for child in node.children]

        walk_tree(unit.root, 0)
        return max(depths.values())
    

    def get_index_in_doc(span):
        """Gets index of the segmented unit in the doc"""
        span_start = span.start

        # span end not used yet
        span_end = span.end

        # finds where span_start is in units_index_list [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
        # returns the index of the corresponding span
        return np.where([span.start in range(start, end) for start, end in span.doc._.units_index_list])[0][-1]


    def get_prev_unit(span):

        return span._.get_nth_unit(span._.index_in_doc-1)
    
        
    def get_nth_unit(span, n):

        # Tuple containing the start and end index of the nth span
        span_index = span.doc._.units_index_list[n]

        # Return nth span
        return span.doc[span_index[0]: span_index[1]]

    def get_prev_unit_attr(span, attribute):

        return span._.prev_unit._.get(attribute)

    def get_contextual_features_prev(unit):
        contextual_features_names=['num_tokens','num_verbs','num_pos_pronouns','num_conj_adv','num_punct','is_para_start','num_claim_indicator','num_premise_indicator','has_question_mark','has_personal_pronoun','has_possessive_pronoun','has_modal_verb','is_first_token_gerund','tree_depth']
        
        contextual_features = np.array([])
        for feature in contextual_features_names:
            if unit._.index_in_doc==0:
                contextual_features = np.append(contextual_features,0)
            else:
                contextual_features = np.append(contextual_features, unit._.prev_unit._.get(feature))
        return contextual_features

    def get_contextual_features_next(unit):
        contextual_features_names=['num_tokens','num_verbs','num_pos_pronouns','num_conj_adv','num_punct','is_para_start','num_claim_indicator','num_premise_indicator','has_question_mark','has_personal_pronoun','has_possessive_pronoun','has_modal_verb','is_first_token_gerund','tree_depth']
        
        contextual_features = np.array([])

        try:
            next_unit = unit._.get_nth_unit(unit._.index_in_doc + 1)
        except:
            return [0 for feature in contextual_features_names]
        else:
            return [next_unit._.get(feature) for feature in contextual_features_names]

            
    
    
    # Iterate list of features and Set Extensions (Just to not manually set extensions one by one)
    
    for feature in extensions_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for method in extensions_dict['span_methods']:
        Span.set_extension(method, force=force, method=locals()[method])


def segmentation(doc=None ,mode = 'sentence', n_grams=15):
    if mode=='paragraph':
        pass
    elif mode=='sentence':
        # segment by sentences
        units = [sent for sent in doc.sents  if not (sent.text.isspace() or sent.text =='')] 
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        return units
    
    elif mode =='n_grams':
        # Code to segment with 15 grams here (average)  
        units = [doc[i:i+n_grams] for i in range(len(doc))]

        doc._.units_index_list = [(unit.start, unit.end) for unit in units]

        return units
    
    elif mode=='clause':
        # Code to segment by clause
        pass
    elif mode=='constituency1':
        # Take the first level subordinating conjunction (SBAR)
        # The first dependent clause
        units = []
        for sent in doc.sents:
            for node in sent._.constituents:

                if "SBAR" in node._.labels:

                    # Before SBAR
                    units.append(sent.doc[sent.start:node.start])
                    # SBAR
                    units.append(sent.doc[node.start:node.end])

                    # After SBAR
                    units.append(sent.doc[node.end:sent.end])

                    # Break out to take only the first SBAR we encounter
                    break
        
        units = [unit for unit in units if unit.text != '']
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units
        
    elif mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]
    elif mode=='gold_standard':
        
        # Segments ADUs according to annotations
        
        adu_inds = adus[adus['essay_id']==doc._.essay_id].sort_values('start_ind')[['start_ind','end_ind']]

        units = []

        start = 0
        for i, row in adu_inds.iterrows():

            # From previous adu end to current adu start (Non-ADU)
            end = row['start_ind']-1

            units.append(doc.char_span(start,end, alignment_mode='expand'))

            start = row['start_ind']
            end = row['end_ind']

            # From current adu start to current adu end
            units.append(doc.char_span(start,end,  alignment_mode='expand'))

            # set current adu end as start for next iteration
            start = row['end_ind']
        
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units

def unit2fv(unit, feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)


def calculate_segmentation_accuracy(units, error_function='percentage_correctness'):
    
    
    
    start_errors = np.array([])
    segmentation_accs = np.array([])
    end_errors = np.array([])

    for unit in units:
        error_tuple = unit._.get_possible_labels(error_function=error_function)

        if len(error_tuple) != 0:
            label_position = np.argmax([error[1] for label, error in error_tuple])

            start_errors = np.append(start_errors,error_tuple[label_position][1][0])

            segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])

            end_errors = np.append(end_errors, error_tuple[label_position][1][2])



    start_error = sum((start_errors**2))/len(start_errors)

    end_error = sum((end_errors**2))/len(end_errors)

    segmentation_acc = segmentation_accs.mean()
    
    return (start_error, segmentation_acc, end_error)




# Run
create_extensions(extensions_dict)   


In [250]:
# Optional, not used yet. Trying to solve problem that title gets included with the first sentence
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)
# Not used
def text2doc(text):
    # need to use nlp.pipe here instead
    return nlp(text)

In [251]:
# Pipelinev1

def text2fv(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=None ,print_segmentation_error = False):
    
    
    
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    
    if segmentation_mode != "constituency1":
    
        for doc, context in nlp.pipe(data, as_tuples=True, disable=['benepar']):
            doc._.essay_id = context['id']
            docs.append(doc)
    
    else:
        
        for doc, context in nlp.pipe(data, as_tuples=True):
            doc._.essay_id = context['id']
            docs.append(doc)

    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    if print_segmentation_error:
        print(f"Segmentation Mode: {segmentation_mode}\nAccuracy:{calculate_segmentation_accuracy(units)}")

    X_features = span_features
    

    X = np.array([unit2fv(unit, X_features) for unit in units])
    y = np.array([unit._.get_label(label_mode=label_mode, threshold=threshold) for unit in units])

    return X,y 

In [252]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

###### TEST
in_text = essays.iloc[23].text
doc = nlp(in_text)
doc._.essay_id = essays.iloc[23]['essay_id']
adu24 = adus[adus['essay_id'] == doc._.essay_id]
units=segmentation(doc, mode='n_grams', n_grams=15)
units=segmentation(doc, mode='sentence')



In [253]:
# Utility, Delete Later
def print_adus(units):
    for i, u in enumerate(units):
            unit_start = u._.idx_start
            unit_end = u._.idx_end
            
            essay_id = u.doc._.essay_id
            
            
            doc_adus = adus[adus['essay_id'] == essay_id]
             
            lis = [((unit_start, unit_end),(row['start_ind'], row['end_ind'], row['ADU_type'],is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind']))) for row_ind,row in doc_adus.iterrows() if is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind'])]
            
            
            
            if len(lis)>0:
                print(i, lis)

def verbose_print(units):
    # Detailed Printer
    
    essay_id = units[0].doc._.essay_id
    adu_doc = adus[adus['essay_id']==essay_id]
    for i, u in enumerate(units):
            span_start = u[0].idx
            span_end = u[-1].idx  + len(u[-1])

            lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],
                                            is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])))
                   for row_ind,row in adu_doc.iterrows() if is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])]
            if len(lis)>0:
                print(i)
                print(lis,"\n")
                print("UNIT:",u,"\n")
                for ind, adu in enumerate(lis):

                    #print(adu[1][2].upper()+':',adu[1][0:2])
                    #print(doc.char_span(*adu[1][0:2]), "\n")
                    label = adu[1][2].upper() 
                    adu_range = adu[1][0:2]
                    adu_status = adu[1][3]
                    print(f'ADU #{ind+1}',label+':',*adu_range, adu_status)
                    print(doc.char_span(*adu_range), "\n")
                print("-----------------\n")
                
def is_adu(unit_start, unit_end, adu_start, adu_end):
    
    if adu_start<=unit_start and adu_end <=unit_start:
        # ADU comes before UNIT
        return False
    elif adu_start>=unit_end and adu_end >=unit_end:
        # ADU comes after UNIT
        return False
    else:
        if adu_start >= unit_start and adu_end <= unit_end:
            #print("Fully Contains ADU")
            return "Full"
        elif adu_start <= unit_start and adu_end <=unit_end:
            
            #print("ADU start is cut")
            return "Start_Cut"
        elif adu_start >= unit_start and adu_end >= unit_end:
            # End of ADU is after UNIT
            return "End_Cut"

        elif adu_start <= unit_start and adu_end >= unit_end:

            # UNIT is smaller than ADU, ADU start and end are cut
            return "Both_Sides_Cut"# Utility, Delete Later
        
# FOR VIEWING ACCURACY ONLY
def all_docs(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=None):
    # TEMP
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
    return docs
    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    



In [254]:
units = segmentation(doc, mode='gold_standard')
# Coding Error Evaluation
start_errors = np.array([])
segmentation_accs = np.array([])
end_errors = np.array([])

for unit in units:
    error_tuple = unit._.get_possible_labels()

    if len(error_tuple) != 0:
        label_position = np.argmax([error[1] for label, error in error_tuple])
        
        print(error_tuple[label_position])
        start_errors = np.append(start_errors,error_tuple[label_position][1][0])
        
        segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])
        
        end_errors = np.append(end_errors, error_tuple[label_position][1][2])
        
        

start_error = sum((start_errors**2))/len(start_errors)

end_error = sum((end_errors**2))/len(end_errors)

segmentation_acc = segmentation_accs.mean()


('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Claim', (0, 1.0, 0))


In [284]:
# Smaller set
essays = pd.read_csv('../data/output_csv/essays.csv')
essays= essays[:10].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train)


X_test, y_test = text2fv(test)


# Classification

In [167]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [188]:
essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train, segmentation_mode='gold_standard' ,label_mode='clpr')

essays = pd.read_csv('../data/output_csv/essays.csv')
essays[essays['label'] =='test']
X_test, y_test = text2fv(essays[essays['label'] =='test'], segmentation_mode='gold_standard' ,label_mode='clpr')

In [285]:
logreg = LogisticRegression(solver='newton-cg')
logreg.fit(X_train, y_train)


LogisticRegression(solver='newton-cg')

In [286]:
preds_lr = logreg.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

         ADU       0.95      0.93      0.94        40
     Non-ADU       0.62      0.71      0.67         7

    accuracy                           0.89        47
   macro avg       0.79      0.82      0.80        47
weighted avg       0.90      0.89      0.90        47



# Multiclass Classifier

In [182]:
# Smaller set

# Sentence segmentation
essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train, segmentation_mode='sentence', label_mode='clpr')

X_test, y_test = text2fv(essays[essays['label'] =='test'], segmentation_mode='sentence', label_mode='clpr')



In [295]:
X_train, y_train = text2fv(train, segmentation_mode='sentence', n_grams=15, label_mode='clpr')

In [187]:

for segmentation_mode in ['sentence', 'n_grams', 'constituency1', 'gold_standard']:

    X_train, y_train = text2fv(train, segmentation_mode=segmentation_mode, n_grams=15, label_mode='clpr', print_segmentation_error=True)

    X_test, y_test = text2fv(essays[essays['label'] =='test'], segmentation_mode=segmentation_mode, n_grams=15, label_mode='clpr', print_segmentation_error=True)

    logreg = LogisticRegression(solver='newton-cg')
    logreg.fit(X_train, y_train)
    preds_lr = logreg.predict(X_test)
    print(f"LOGISTIC REG --- Segmentation: {segmentation_mode}")
    print(classification_report(y_test, preds_lr))
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    preds_rf = rf.predict(X_test)
    print(f"RANDOM FOREST --- Segmentation: {segmentation_mode}")
    print(classification_report(y_test, preds_rf))

Segmentation Mode: sentence
Accuracy:(36.126506024096386, 0.7912428572123295, 7.710843373493976)
Segmentation Mode: sentence
Accuracy:(40.905179982440735, 0.7782862047912608, 11.576821773485513)
LOGISTIC REG --- Segmentation: sentence
              precision    recall  f1-score   support

       Claim       0.58      0.52      0.55       407
     Non-ADU       0.57      0.69      0.62       258
     Premise       0.75      0.74      0.74       732

    accuracy                           0.67      1397
   macro avg       0.63      0.65      0.64      1397
weighted avg       0.67      0.67      0.67      1397

RANDOM FOREST --- Segmentation: sentence
              precision    recall  f1-score   support

       Claim       0.53      0.28      0.37       407
     Non-ADU       0.78      0.55      0.65       258
     Premise       0.67      0.92      0.77       732

    accuracy                           0.66      1397
   macro avg       0.66      0.58      0.60      1397
weighted avg     



LOGISTIC REG --- Segmentation: n_grams
              precision    recall  f1-score   support

       Claim       0.43      0.40      0.42      9297
     Non-ADU       0.50      0.47      0.48      4520
     Premise       0.68      0.72      0.70     15999

    accuracy                           0.58     29816
   macro avg       0.54      0.53      0.53     29816
weighted avg       0.57      0.58      0.58     29816

RANDOM FOREST --- Segmentation: n_grams
              precision    recall  f1-score   support

       Claim       0.48      0.32      0.38      9297
     Non-ADU       0.73      0.51      0.60      4520
     Premise       0.70      0.89      0.78     15999

    accuracy                           0.66     29816
   macro avg       0.64      0.57      0.59     29816
weighted avg       0.63      0.66      0.63     29816





Segmentation Mode: constituency1
Accuracy:(46.22102425876011, 0.8914464929863491, 107.11320754716981)


KeyboardInterrupt: 

In [254]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
preds_rf = rf.predict(X_test)
print(classification_report(y_test, preds_rf))

              precision    recall  f1-score   support

       Claim       0.77      0.43      0.56        23
     Non-ADU       0.98      1.00      0.99        60
     Premise       0.72      0.89      0.80        37

    accuracy                           0.86       120
   macro avg       0.82      0.78      0.78       120
weighted avg       0.86      0.86      0.85       120



In [None]:
# Smaller set

# Sentence segmentation
essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train, segmentation_mode='n_grams', n_grams=15, label_mode='clpr')

X_test, y_test = text2fv(test, segmentation_mode='sentence', label_mode='clpr')


# Two Binary Classifiers

In [71]:
# Smaller set + Cl1
essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train, segmentation_mode='sentence', label_mode='adu')

X_test, y_test = text2fv(train, segmentation_mode='sentence', label_mode='adu')


X_train_clpr, y_train_clpr = text2fv(train, segmentation_mode='sentence', label_mode='clpr')

X_test_clpr, y_test_clpr = text2fv(train, segmentation_mode='sentence', label_mode='clpr')


In [72]:
clpr_index_train = np.where(y_train_clpr!='Non-ADU')[0]

clpr_index_test = np.where(y_test_clpr!='Non-ADU')[0]

X_train_clpr_only = X_train[clpr_index_train].copy()
X_test_clpr_only = X_test[clpr_index_test].copy()


y_train_clpr_only = y_train_clpr[clpr_index_train].copy()

y_test_clpr_only = y_test_clpr[clpr_index_test].copy()

In [39]:
cl1 = LogisticRegression(solver='newton-cg')
cl1.fit(X_train, y_train)

LogisticRegression(solver='newton-cg')

In [40]:
cl2 = LogisticRegression(solver='newton-cg')
cl2.fit(X_train_clpr_only, y_train_clpr_only)

LogisticRegression(solver='newton-cg')

In [41]:
preds_cl1 = cl1.predict(X_test)

preds_cl1_adu_index = np.where(preds_cl1=='ADU')


X_test_cl1_pred_adu = X_test[preds_cl1_adu_index]
y_test_cl1_pred_adu = y_test[preds_cl1_adu_index]


preds_cl2 = cl2.predict(X_test_cl1_pred_adu)

preds_all = preds_cl1.copy()
preds_all[preds_cl1_adu_index] = preds_cl2
preds_all 

print(classification_report(preds_all, y_test_clpr))

              precision    recall  f1-score   support

       Claim       0.58      0.67      0.62       112
     Non-ADU       0.56      0.88      0.68        59
     Premise       0.84      0.67      0.74       254

    accuracy                           0.70       425
   macro avg       0.66      0.74      0.68       425
weighted avg       0.73      0.70      0.70       425



In [43]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [24]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [25]:
preds = rf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 168,   94],
       [  22, 1113]])

In [32]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74       262
           1       0.92      0.98      0.95      1135

    accuracy                           0.92      1397
   macro avg       0.90      0.81      0.85      1397
weighted avg       0.92      0.92      0.91      1397



## CrossValidation

In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone

In [27]:
""" Stochastic Gradient Descent (SGD) classifier, 
This classifier has the advantage of being capable of handling very large datasets efficiently"""
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(random_state=42)

In [47]:
skfolds = StratifiedKFold(n_splits=5, random_state=42)

best_model = None 
precision = 0
for train_index, test_index in skfolds.split(X_train, y_train):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    if precision < n_correct / len(y_pred):
        best_model = clone_clf
        precision = n_correct / len(y_pred)
    print(n_correct / len(y_pred))
    print(confusion_matrix(y_test_fold, y_pred))
    



0.7224231464737794
[[181  53]
 [254 618]]
0.8707052441229657
[[145  89]
 [ 54 818]]
0.8471971066907775
[[110 124]
 [ 45 827]]
0.8426763110307414
[[ 69 164]
 [ 10 863]]
0.8090497737556561
[[183  50]
 [161 711]]


In [48]:
preds = best_model.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 173,   89],
       [  81, 1054]])

In [50]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.68      0.66      0.67       262
           1       0.92      0.93      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.80      0.79      0.80      1397
weighted avg       0.88      0.88      0.88      1397



In [78]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [91]:
svm_clf = LinearSVC(random_state=0, tol=1e-5, verbose=1, max_iter=50000)

In [92]:
svm_clf.fit(X_train, y_train)

[LibLinear]



LinearSVC(max_iter=50000, random_state=0, tol=1e-05, verbose=1)

In [95]:
preds = svm_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 127,  135],
       [  35, 1100]])

In [96]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.78      0.48      0.60       262
           1       0.89      0.97      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.84      0.73      0.76      1397
weighted avg       0.87      0.88      0.87      1397



In [97]:
from sklearn import svm
svm_clf = svm.SVC(kernel='linear')


In [98]:
svm_clf.fit(X_train, y_train)

SVC(kernel='linear')

In [99]:
preds = svm_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 132,  130],
       [  32, 1103]])

In [100]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.80      0.50      0.62       262
           1       0.89      0.97      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.85      0.74      0.78      1397
weighted avg       0.88      0.88      0.87      1397



### Hard Voting 

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [105]:
log_clf = LogisticRegression(solver='newton-cg')
rnd_clf = RandomForestClassifier()
smv_clf = SVC()

In [106]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', smv_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(solver='newton-cg')),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [108]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8869005010737294
RandomForestClassifier 0.9112383679312813
SVC 0.8840372226198998
VotingClassifier 0.9226914817465999


In [114]:
preds = voting_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 176,   86],
       [  22, 1113]])

In [115]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.89      0.67      0.77       262
           1       0.93      0.98      0.95      1135

    accuracy                           0.92      1397
   macro avg       0.91      0.83      0.86      1397
weighted avg       0.92      0.92      0.92      1397



## Bagging and Pasting

In [109]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [110]:
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1)

In [111]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1)

In [112]:
preds = bag_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 179,   83],
       [  61, 1074]])

              precision    recall  f1-score   support

           0       0.75      0.68      0.71       262
           1       0.93      0.95      0.94      1135

    accuracy                           0.90      1397
   macro avg       0.84      0.81      0.83      1397
weighted avg       0.89      0.90      0.90      1397

