In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
import re
import benepar
from itertools import chain
from spacy.pipeline import Sentencizer

nlp = spacy.load('en_core_web_md')
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
nlp_trf = spacy.load('en_core_web_trf', disable=['tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
para_splitter = Sentencizer(punct_chars=['\n'])



In [2]:
#from spacy import displacy
#import deplacy

In [74]:
doc_features = ['num_tokens', 'para_starts']
span_features = ['word_emb', 'sent_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'is_para_start',
                 'index_in_doc', 'num_claim_indicator', 'num_premise_indicator', 'has_question_mark', 'has_personal_pronoun',
                 'has_possessive_pronoun', 'has_modal_verb', 'is_first_token_gerund', 'tree_depth', 'contextual_features_prev' ,'contextual_features_next']

# getters that are not used as features
span_utilities = ['prev_unit', 'idx_start', 'idx_end', ]
# methods
span_methods = ['get_nth_unit', 'get_prev_unit_attr', 'get_label_and_error', 'get_label', 'get_possible_labels']
token_features =['word_emb']



extensions_dict = dict(doc_features=doc_features, span_features=span_features+span_utilities,
                       token_features=token_features, span_methods=span_methods)





def create_extensions(extensions_dict=None, force=True):
    
    # Features that take 'unit' as input refer to the segmentation, they do not work with just any span.
    
    # Property attributes
    
    # Store starting and ending indices of spans in the whole doc
    # 1 list per each document: [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
    Doc.set_extension("units_index_list", default=[],force=True)
    
    # Store essay_id within doc
    Doc.set_extension("essay_id", default=None, force=True)

    
    # Feature Getters
    def get_possible_labels(unit, error_function='percentage_correctness'):
        """
        Inputs: unit

        Outputs: label for the unit and segmentation error

        """

        def overlap_case(unit_start, unit_end, adu_start, adu_end):
            if adu_start >= unit_start and adu_end <= unit_end:
                # Case 1, ADU is fully contained in UNIT
                return 1

            elif adu_start <= unit_start and adu_end <=unit_end and adu_end>=unit_start:

                # Case 2, ADU starts before UNIT, start(Left) of ADU is cut
                return 2

            elif adu_start >= unit_start and adu_end >= unit_end and adu_start<unit_end:

                # Case 3, ADU starts after UNIT, end(Right) of ADU is cut
                return 3

            elif adu_start < unit_start and adu_end > unit_end:

                # Case 4, ADU starts before UNIT and ends after UNIT, both sides of ADU are cut
                return 4

            else: 
                # ADU does not overlap with UNIT
                return False
            

        def percentage_correctness(unit, adu_start, adu_end, overlap_case):

            if overlap_case==2:
                adu_start = unit._.idx_start
            elif overlap_case==3:
                adu_end = unit._.idx_end
            elif overlap_case==4:
                adu_start = unit._.idx_start
                adu_end = unit._.idx_end

            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            pct_correct = adu_ntokens/unit_ntokens
            return pct_correct

        def extended_accuracy(unit, adu_start, adu_end, overlap_case):
            # Compares number of tokens to get the the correct ADU in proportional with UNIT length

            if overlap_case==2:
                adu_start = unit._.idx_start
            if overlap_case==3:
                adu_end = unit._.idx_end
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            diff_ntokens = np.abs(unit_ntokens - adu_ntokens)

            return 1/((diff_ntokens+1)**(np.log2(diff_ntokens+1)/np.log2(unit_ntokens+1)))


        if error_function.lower() == 'percentage_correctness':
            err_func = percentage_correctness
        elif error_function.lower() == 'extended_accuracy':
            err_func = extended_accuracy
        
        unit_start = unit._.idx_start
        unit_end = unit._.idx_end

        essay_id = unit.doc._.essay_id

        # DataFrame containing ADUs indices & labels, filtered for current essay_id
        adus_doc = adus[adus['essay_id'] == essay_id]

        def segmentation_error(unit, adu_start, adu_end, overlap_case, error_function):
            
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            
            # positive value = too many tokens in segment, unit should be shorter (include less non-adu tokens)
            # negative value = too less tokens in segment, unit should be longer (include more adu tokens)
            
            left_tokens = adu.start - unit.start
            right_tokens = unit.end - adu.end
            
            if error_function.lower() == 'percentage_correctness':
                err_func = percentage_correctness
            elif error_function.lower() == 'extended_accuracy':
                err_func = extended_accuracy

            
            return (left_tokens, err_func(unit, adu_start, adu_end, overlap_case), right_tokens)
            
# v7 returns: (ADU_Type, (left_error_tokens, err_func, right_error_tokens))
        label_and_error = [(row['ADU_type'], segmentation_error(unit, row['start_ind'],row['end_ind'], 
                          overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), error_function),
                          #(row['start_ind'], row['end_ind'])
                           ) 
                         for row_ind, row in adus_doc.iterrows() 
                         if unit_start < row['end_ind'] and unit_end >= row['start_ind'] ]

            
# v6 returns: (ADU_Type, err_func)
#
#         label_and_error = [(row['ADU_type'], err_func(unit, row['start_ind'],row['end_ind'], 
#                           overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind'])),
#                           #(row['start_ind'], row['end_ind'])
#                            ) 
#                          for row_ind, row in adus_doc.iterrows() 
#                          if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

    #     # Contains information of the ADUs that overlap with the UNIT
    #     # Structure: (adu_start, adu_end, overlap_case, ADU_type)
    #     overlap_adus = [(row['start_ind'],
    #                      row['end_ind'], 
    #                      overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), 
    #                      row['ADU_type']) 
    #                      for row_ind, row in adus_doc.iterrows()
    #           if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

        return label_and_error

    
    def get_label(unit, label_mode='clpr', threshold=0, error_function='percentage_correctness'):
        error_tuple = unit._.get_possible_labels(error_function=error_function)

        if len(error_tuple) == 0:
            return "Non-ADU"
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    label = error_tuple[label_position][0]
                elif label_mode=='adu':
                    label = 'ADU'
                    
            else:
                label = "Non-ADU"

            return label
        
    def get_label_and_error(unit, error_function='percentage_correctness', label_mode='clpr', threshold=0):
        error_tuple = unit._.get_possible_labels(error_function=error_function)

        if len(error_tuple) == 0:
            return ("Non-ADU", ())
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    assigned_label_and_error = (error_tuple[label_position][0], error_tuple[label_position][1])
                elif label_mode=='adu':
                    assigned_label_and_error = ('ADU', error_tuple[label_position][1])
                    
            else:
                assigned_label_and_error = ("Non-ADU", ())

            return assigned_label_and_error

    def _NOT_USED_get_label_adu(span):
        
        # Gets ADU vs non-ADU LABEL for the span (intended only for sentences)

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        #  + len(span[-1]) to get to the end of the last word
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()

    
    def get_idx_start(unit):
        return unit[0].idx
    
    def get_idx_end(unit):
        return unit[-1].idx  + len(unit[-1])
    
    
    def get_para_starts(doc):
        # Units starting with \n or preceding \n are considered as paragraph starts
        # if start is 0, start -1 goes back to the last token of the doc

        # TODO
        # para_ends can be obtained by shifing this list to the right by one position
        
        # PROBLEM! WORKS ONLY FOR SENTENCE SEGMENTATION
        
        return [int(doc[start].text =='\n' or doc[start-1].text=='\n') for start, end in doc._.units_index_list]
    
    def get_is_para_start(unit):
        
        para_starts = unit.doc._.para_starts
        unit_ind = unit._.index_in_doc
        
        return para_starts[unit_ind]
    
    def get_has_personal_pronoun(unit):
        
        return 'PRP' in [token.tag_ for token in unit]
    
    def get_has_possessive_pronoun(unit):
        
        return 'PRP$' in [token.tag_ for token in unit]     
    
    def get_has_modal_verb(unit):
        
        return 'MD' in [token.tag_ for token in unit]            
    
    def get_word_emb(obj):
        return obj.vector
    
    def get_sent_emb(unit):
        
        trf_doc = nlp_trf(unit.text)
        return trf_doc._.trf_data.tensors[1][0]
        
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([1 for adv in conj_advs if adv in span.text.lower()])
    
        
    def get_num_claim_indicator(span):
        claim_indicators = ["accordingly", "as a result", "consequently", "conclude that", "clearly", "demonstrates that", "entails", "follows that", "hence", "however", "implies", "in fact", "in my opinion", "in short", "in conclusion", "indicates that", "it follows that", "it is highly probable that", "it is my contention", "it should be clear that", "I believe", "I mean", "I think", "must be that", "on the contrary", "points to the conclusions", "proves that", "shows that", "so", "suggests that", "the most obvious explanation", "the point I’m trying to make", "therefore", "thus", "the truth of the matter", "to sum up", "we may deduce"]
        
        return sum([1 for c_indicator in claim_indicators if c_indicator in span.text.lower()])
    
    def get_num_premise_indicator(span):
        premise_indicators=["after all", "assuming that", "as", "as indicated by", "as shown", "besides", "because", "deduced", "derived from", "due to", "firstly", "follows from", "for", "for example", "for instance", "for one thing", "for the reason that", "furthermore", "given that", "in addition", "in light of", "in that", "in view of", "in view of the fact that", "indicated by", "is supported by", "may be inferred", "moreover", "owing to", "researchers found that", "secondly", "this can be seen from", "since", "since the evidence is", "what’s more", "whereas",]
        return sum([1 for p_indicator in premise_indicators if p_indicator in span.text.lower()])
    
    def get_is_first_token_gerund(span):
        
        return span[0].tag_ =='VBG'
    
    def get_has_question_mark(span):
        return '?' in span.text

    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    
    def get_tree_depth(unit):
        depths = {}

        def walk_tree(node, depth):
            depths[node.orth_] = depth
            if node.n_lefts + node.n_rights > 0:
                return [walk_tree(child, depth + 1) for child in node.children]

        walk_tree(unit.root, 0)
        return max(depths.values())
    

    def get_index_in_doc(span):
        """Gets index of the segmented unit in the doc"""
        span_start = span.start

        # span end not used yet
        span_end = span.end

        # finds where span_start is in units_index_list [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
        # returns the index of the corresponding span
        return np.where([span.start in range(start, end) for start, end in span.doc._.units_index_list])[0][-1]


    def get_prev_unit(span):

        return span._.get_nth_unit(span._.index_in_doc-1)
    
        
    def get_nth_unit(span, n):

        # Tuple containing the start and end index of the nth span
        span_index = span.doc._.units_index_list[n]

        # Return nth span
        return span.doc[span_index[0]: span_index[1]]

    def get_prev_unit_attr(span, attribute):

        return span._.prev_unit._.get(attribute)

    def get_contextual_features_prev(unit):
        contextual_features_names=['num_tokens','num_verbs','num_pos_pronouns','num_conj_adv','num_punct','is_para_start','num_claim_indicator','num_premise_indicator','has_question_mark','has_personal_pronoun','has_possessive_pronoun','has_modal_verb','is_first_token_gerund','tree_depth']
        
        contextual_features = np.array([])
        for feature in contextual_features_names:
            if unit._.index_in_doc==0:
                contextual_features = np.append(contextual_features,0)
            else:
                contextual_features = np.append(contextual_features, unit._.prev_unit._.get(feature))
        return contextual_features

    def get_contextual_features_next(unit):
        contextual_features_names=['num_tokens','num_verbs','num_pos_pronouns','num_conj_adv','num_punct','is_para_start','num_claim_indicator','num_premise_indicator','has_question_mark','has_personal_pronoun','has_possessive_pronoun','has_modal_verb','is_first_token_gerund','tree_depth']
        
        contextual_features = np.array([])

        try:
            next_unit = unit._.get_nth_unit(unit._.index_in_doc + 1)
        except:
            return [0 for feature in contextual_features_names]
        else:
            return [next_unit._.get(feature) for feature in contextual_features_names]

            
    
    
    # Iterate list of features and Set Extensions (Just to not manually set extensions one by one)
    
    for feature in extensions_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for method in extensions_dict['span_methods']:
        Span.set_extension(method, force=force, method=locals()[method])


def segmentation(doc=None ,mode = 'sentence', n_grams=15):
    if mode=='paragraph':
        with nlp.select_pipes(disable=nlp.pipe_names):
            para_doc = para_splitter(nlp(doc.text))
            p_units = list(para_doc.sents)
            doc._.units_index_list = [(unit.start, unit.end) for unit in p_units]
            
            units = [doc[start:end] for start, end in doc._.units_index_list]
            
            return units
            
    elif mode=='sentence':
        # segment by sentences
        units = [sent for sent in doc.sents  if not (sent.text.isspace() or sent.text =='')] 
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        return units
    
    elif mode =='n_grams':
        # Code to segment with 15 grams here (average)  
        units = [doc[i:i+n_grams] for i in range(len(doc))]

        doc._.units_index_list = [(unit.start, unit.end) for unit in units]

        return units
    
    elif mode=='clause':
        # Code to segment by clause
        pass
    elif mode=='constituency1':
        # Take the first level subordinating conjunction (SBAR)
        # The first dependent clause
        units = []
        for sent in doc.sents:
            for node in sent._.constituents:

                if "SBAR" in node._.labels:

                    # Before SBAR
                    units.append(sent.doc[sent.start:node.start])
                    # SBAR
                    units.append(sent.doc[node.start:node.end])

                    # After SBAR
                    units.append(sent.doc[node.end:sent.end])

                    # Break out to take only the first SBAR we encounter
                    break
        
        units = [unit for unit in units if unit.text != '']
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units
        
    elif mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]
    elif mode=='gold_standard':
        
        # Segments ADUs according to annotations
        
        adu_inds = adus[adus['essay_id']==doc._.essay_id].sort_values('start_ind')[['start_ind','end_ind']]

        units = []

        start = 0
        for i, row in adu_inds.iterrows():

            # From previous adu end to current adu start (Non-ADU)
            end = row['start_ind']-1

            units.append(doc.char_span(start,end, alignment_mode='expand'))

            start = row['start_ind']
            end = row['end_ind']

            # From current adu start to current adu end
            units.append(doc.char_span(start,end,  alignment_mode='expand'))

            # set current adu end as start for next iteration
            start = row['end_ind']
        
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units

def unit2fv(unit, feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)


def OLD_calculate_segmentation_accuracy(units, error_function='percentage_correctness'):
    
    
    
    start_errors = np.array([])
    segmentation_accs = np.array([])
    end_errors = np.array([])

    for unit in units:
        error_tuple = unit._.get_possible_labels(error_function=error_function)

        if len(error_tuple) != 0:
            label_position = np.argmax([error[1] for label, error in error_tuple])

            start_errors = np.append(start_errors,error_tuple[label_position][1][0])

            segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])

            end_errors = np.append(end_errors, error_tuple[label_position][1][2])



    start_error = sum((start_errors**2))/len(start_errors)

    end_error = sum((end_errors**2))/len(end_errors)

    segmentation_acc = segmentation_accs.mean()
    
    return (start_error, segmentation_acc, end_error)



def calculate_segmentation_accuracy(units, error_function='percentage_correctness'):
    
    
    
    start_errors = np.array([])
    segmentation_accs = np.array([])
    end_errors = np.array([])
    early_start_errors = np.array([])
    late_start_errors = np.array([])
    early_end_errors = np.array([])
    late_end_errors = np.array([])
    for unit in units:
        error_tuple = unit._.get_label_and_error(error_function=error_function)[1]
        
        if len(error_tuple) != 0:
            
            if error_tuple[0] < 0:
                late_start_errors = np.append(late_start_errors, error_tuple[0])
            elif error_tuple[0] > 0:
                early_start_errors = np.append(early_start_errors, error_tuple[0])
            
            segmentation_accs = np.append(segmentation_accs, error_tuple[1])
            
            if error_tuple[2] < 0:
                early_end_errors = np.append(early_end_errors, error_tuple[2])
            elif error_tuple[2] > 0:
                late_end_errors = np.append(late_end_errors, error_tuple[2])
            end_errors = np.append(end_errors, error_tuple[2])





#     start_error = sum((start_errors**2))/len(start_errors)

#     end_error = sum((end_errors**2))/len(end_errors)

#     segmentation_acc = segmentation_accs.mean()
    
    error_vector = dict(start_early = early_start_errors, start_late = late_start_errors, segmentation_accs = segmentation_accs,
                 end_early = early_end_errors, end_late = late_end_errors)
    
    error_means = dict(start_early = early_start_errors.mean(), start_late = late_start_errors.mean(),
                   segmentation_accs = segmentation_accs.mean(),end_early = early_end_errors.mean(),
                   end_late = late_end_errors.mean())
    
    return error_vector, error_means




# Run
create_extensions(extensions_dict)   


In [10]:
error_vector_dict, error_mean_dict  = calculate_segmentation_accuracy(units)


  error_means = dict(start_early = early_start_errors.mean(), start_late = late_start_errors.mean(),
  ret = ret.dtype.type(ret / rcount)
  segmentation_accs = segmentation_accs.mean(),end_early = early_end_errors.mean(),


In [11]:
for k,v in error_vector_dict.items():
    
    print(list(v))

[3.0, 2.0, 8.0, 16.0, 3.0, 14.0, 4.0]
[]
[0.7142857142857143, 0.9642857142857143, 0.52, 0.64, 0.5405405405405406, 0.8260869565217391, 0.9655172413793104, 0.5161290322580645, 0.42857142857142855]
[]
[1.0, 1.0, 10.0, 1.0, 1.0, 1.0, 1.0, 1.0, 12.0]


In [12]:
string = "[-19.0, -12.0, -5.0, -5.0, -17.0, -10.0, -23.0]"

string.strip('[]').split(',')

['-19.0', ' -12.0', ' -5.0', ' -5.0', ' -17.0', ' -10.0', ' -23.0']

In [13]:
essays[essays['label'] =='test']

Unnamed: 0,essay_id,text,label
3,essay004,International tourism is now more common than ...,test
4,essay005,Living and studying overseas\n\nIt is every st...,test
5,essay006,Studies abroad and the cultural aspect of the ...,test
20,essay021,Advertisements affects on consumer goods\n\nEv...,test
41,essay042,Paying more money is the only motivation to ma...,test
...,...,...,...
372,essay373,Capital punishment; 51% countries have polishe...,test
381,essay382,Technology helps student learn more informatio...,test
385,essay386,Classmates' impacts on children's performance ...,test
392,essay393,Detailed description of crimes on newspaper an...,test


In [14]:
for k,v in error_vector_dict.items():
    print(f"{k}: {v}")
    
    

start_early: [ 3.  2.  8. 16.  3. 14.  4.]
start_late: []
segmentation_accs: [0.71428571 0.96428571 0.52       0.64       0.54054054 0.82608696
 0.96551724 0.51612903 0.42857143]
end_early: []
end_late: [ 1.  1. 10.  1.  1.  1.  1.  1. 12.]


In [15]:
print(error_vector_dict)

{'start_early': array([ 3.,  2.,  8., 16.,  3., 14.,  4.]), 'start_late': array([], dtype=float64), 'segmentation_accs': array([0.71428571, 0.96428571, 0.52      , 0.64      , 0.54054054,
       0.82608696, 0.96551724, 0.51612903, 0.42857143]), 'end_early': array([], dtype=float64), 'end_late': array([ 1.,  1., 10.,  1.,  1.,  1.,  1.,  1., 12.])}


In [87]:
labeled_units_pct

[[Computer has negative effects to children
  
  Nowadays, thanks to the development of technology,
  has negative effects to children
  
  Nowadays, thanks to the development of technology,,
  negative effects to children
  
  Nowadays, thanks to the development of technology, computer,
  effects to children
  
  Nowadays, thanks to the development of technology, computer is,
  to children
  
  Nowadays, thanks to the development of technology, computer is now,
  children
  
  Nowadays, thanks to the development of technology, computer is now indispensable,
  
  
  Nowadays, thanks to the development of technology, computer is now indispensable to,
  Nowadays, thanks to the development of technology, computer is now indispensable to life,
  , thanks to the development of technology, computer is now indispensable to life.,
  thanks to the development of technology, computer is now indispensable to life. Some,
  to the development of technology, computer is now indispensable to life. So

In [91]:
labeled_units_pct

[[Computer has negative effects to children
  
  Nowadays, thanks to the development of technology,
  has negative effects to children
  
  Nowadays, thanks to the development of technology,,
  negative effects to children
  
  Nowadays, thanks to the development of technology, computer,
  effects to children
  
  Nowadays, thanks to the development of technology, computer is,
  to children
  
  Nowadays, thanks to the development of technology, computer is now,
  children
  
  Nowadays, thanks to the development of technology, computer is now indispensable,
  
  
  Nowadays, thanks to the development of technology, computer is now indispensable to,
  Nowadays, thanks to the development of technology, computer is now indispensable to life,
  , thanks to the development of technology, computer is now indispensable to life.,
  thanks to the development of technology, computer is now indispensable to life. Some,
  to the development of technology, computer is now indispensable to life. So

In [18]:
# Optional, not used yet. Trying to solve problem that title gets included with the first sentence
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)
# Not used
def text2doc(text):
    # need to use nlp.pipe here instead
    return nlp(text)

In [19]:
# Pipelinev1

def text2fv(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=15 ,print_segmentation_error = False):
    
    
    
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    
    if segmentation_mode != "constituency1":
    
        for doc, context in nlp.pipe(data, as_tuples=True, disable=['benepar']):
            doc._.essay_id = context['id']
            docs.append(doc)
    
    else:
        
        for doc, context in nlp.pipe(data, as_tuples=True):
            doc._.essay_id = context['id']
            docs.append(doc)

    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    if print_segmentation_error:
        print(f"Segmentation Mode: {segmentation_mode}\nAccuracy:{calculate_segmentation_accuracy(units)}")

    X_features = span_features
    

    X = np.array([unit2fv(unit, X_features) for unit in units])
    #y = np.array([unit._.get_label(label_mode=label_mode, threshold=threshold) for unit in units])
    y_adu = np.array([unit._.get_label(label_mode='adu', threshold=threshold) for unit in units])
    y_clpr = np.array([unit._.get_label(label_mode='clpr', threshold=threshold) for unit in units])
    
    return X, y_adu, y_clpr

In [20]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

###### TEST
in_text = essays.iloc[23].text
doc = nlp(in_text)
doc._.essay_id = essays.iloc[23]['essay_id']
adu24 = adus[adus['essay_id'] == doc._.essay_id]
units=segmentation(doc, mode='n_grams', n_grams=15)
units=segmentation(doc, mode='sentence')



In [21]:
# Utility, Delete Later
def print_adus(units):
    for i, u in enumerate(units):
            unit_start = u._.idx_start
            unit_end = u._.idx_end
            
            essay_id = u.doc._.essay_id
            
            
            doc_adus = adus[adus['essay_id'] == essay_id]
             
            lis = [((unit_start, unit_end),(row['start_ind'], row['end_ind'], row['ADU_type'],is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind']))) for row_ind,row in doc_adus.iterrows() if is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind'])]
            
            
            
            if len(lis)>0:
                print(i, lis)

def verbose_print(units):
    # Detailed Printer
    
    essay_id = units[0].doc._.essay_id
    adu_doc = adus[adus['essay_id']==essay_id]
    for i, u in enumerate(units):
            span_start = u[0].idx
            span_end = u[-1].idx  + len(u[-1])

            lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],
                                            is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])))
                   for row_ind,row in adu_doc.iterrows() if is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])]
            if len(lis)>0:
                print(i)
                print(lis,"\n")
                print("UNIT:",u,"\n")
                for ind, adu in enumerate(lis):

                    #print(adu[1][2].upper()+':',adu[1][0:2])
                    #print(doc.char_span(*adu[1][0:2]), "\n")
                    label = adu[1][2].upper() 
                    adu_range = adu[1][0:2]
                    adu_status = adu[1][3]
                    print(f'ADU #{ind+1}',label+':',*adu_range, adu_status)
                    print(doc.char_span(*adu_range), "\n")
                print("-----------------\n")
                
def is_adu(unit_start, unit_end, adu_start, adu_end):
    
    if adu_start<=unit_start and adu_end <=unit_start:
        # ADU comes before UNIT
        return False
    elif adu_start>=unit_end and adu_end >=unit_end:
        # ADU comes after UNIT
        return False
    else:
        if adu_start >= unit_start and adu_end <= unit_end:
            #print("Fully Contains ADU")
            return "Full"
        elif adu_start <= unit_start and adu_end <=unit_end:
            
            #print("ADU start is cut")
            return "Start_Cut"
        elif adu_start >= unit_start and adu_end >= unit_end:
            # End of ADU is after UNIT
            return "End_Cut"

        elif adu_start <= unit_start and adu_end >= unit_end:

            # UNIT is smaller than ADU, ADU start and end are cut
            return "Both_Sides_Cut"# Utility, Delete Later
        
# FOR VIEWING ACCURACY ONLY
def all_docs(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=None):
    # TEMP
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
    return docs
    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    



In [22]:
units = segmentation(doc, mode='gold_standard')
# Coding Error Evaluation
start_errors = np.array([])
segmentation_accs = np.array([])
end_errors = np.array([])

for unit in units:
    error_tuple = unit._.get_possible_labels()

    if len(error_tuple) != 0:
        label_position = np.argmax([error[1] for label, error in error_tuple])
        
        print(error_tuple[label_position])
        start_errors = np.append(start_errors,error_tuple[label_position][1][0])
        
        segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])
        
        end_errors = np.append(end_errors, error_tuple[label_position][1][2])
        
        

start_error = sum((start_errors**2))/len(start_errors)

end_error = sum((end_errors**2))/len(end_errors)

segmentation_acc = segmentation_accs.mean()


('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Claim', (0, 1.0, 0))


In [23]:
essays = pd.read_csv('../data/output_csv/essays.csv')


n = 100
split_pct = 0.8
essays = essays.sample(n)
train = essays.sample(frac=split_pct)
test = essays.drop(train.index)

In [None]:
# Smaller set
essays = pd.read_csv('../data/output_csv/essays.csv')
essays= essays[:50].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

essays = pd.read_csv('../data/output_csv/essays.csv')


n = 5
split_pct = 0.7
essays = essays.sample(n)
train = essays.sample(frac=split_pct)
test = essays.drop(train.index)

X_train, y_train_adu, y_train_clpr = text2fv(train)


X_test, y_test_adu, y_test_clpr = text2fv(test)


In [None]:
# Smaller set
essays = pd.read_csv('../data/output_csv/essays.csv')
essays= essays[:50].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

essays = pd.read_csv('../data/output_csv/essays.csv')


n = 200
split_pct = 0.7
essays = essays.sample(n)
train = essays.sample(frac=split_pct)
test = essays.drop(train.index)

X_train_c, y_train_adu_c, y_train_clpr_c = text2fv(train, segmentation_mode='constituency1')


X_test_c, y_test_adu_c, y_test_clpr_c = text2fv(test, segmentation_mode='constituency1')

# Classification

In [46]:
##Sklearn Models
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [103]:
#Creating the file 
import datetime



st = datetime.datetime.now().strftime('%Y-%m-%d-%Hh%Mm%Ss')

f = open(f'Research-Case-Results-{st}.txt', 'w')
f.write('Research Case Result Report \n')


# INPUTS 
essays = pd.read_csv('../data/output_csv/essays.csv')
adus = pd.read_csv("../data/output_csv/adus.csv")


n = 50
split_pct = 0.6
essays = essays.sample(n)
train = essays.sample(frac=split_pct)
test = essays.drop(train.index)




classifiers = ['LR', 'NB']
#classifiers = ['LR', 'NB', 'XGB', 'SVM', 'RF']

# TODO Have to add this...
error_funcs = ['percentage_correctness', 'extended_accuracy']

#segmentations = ['sentence', 'paragraph', 'n_grams', 'clause', 'constituency1', 'token', 'gold_standard']
#segmentations = ['sentence', 'constituency1', 'gold_standard']
segmentations = ['paragraph',]


                 
#classifications = ['binary', 'multiclass', 'two_binary']
classifications = ['binary', 'multiclass', 'two_binary']

f.write(f'\n We are using {n} essays \n')


f.write(f'\n Classifiers are using {classifiers} \n')
f.write(f'\n Segmentations are using {segmentations} \n')
f.write(f'\n Classifications are using {classifications} \n')


68

In [104]:



def pipeline():
    
    f.write(f'\n Start Pipeline \n')
    
    for segmentation in segmentations:
        print(segmentation)
        
        f.write(f'\n Segmentantion Type: {segmentation} \n')
        
        segmentation_mode(segmentation)
        
        
    
def segmentation_mode(segmentation):
    
    if segmentation == 'paragraph':
        
        X_train, y_train_adu, y_train_clpr = text2fv(train, segmentation_mode= segmentation)
        X_test, y_test_adu, y_test_clpr = text2fv(test, segmentation_mode= segmentation)
        
        for classification in classifications:
            print(classification)
            
            f.write(f'\n Classification Type: {classification} \n')
            
            classification_type(classification, 
                                X_train, y_train_adu,y_train_clpr, 
                                X_test, y_test_adu, y_test_clpr)
    
    if segmentation == 'sentence':
        
        X_train, y_train_adu, y_train_clpr = text2fv(train, segmentation_mode= segmentation)
        X_test, y_test_adu, y_test_clpr = text2fv(test, segmentation_mode= segmentation)
        
        for classification in classifications:
            print(classification)
            
            f.write(f'\n Classification Type: {classification} \n')
            
            classification_type(classification, 
                                X_train, y_train_adu,y_train_clpr, 
                                X_test, y_test_adu, y_test_clpr)
    
    if segmentation  == 'constituency1':
          
        X_train, y_train_adu, y_train_clpr = text2fv(train, segmentation_mode= segmentation)
        X_test, y_test_adu, y_test_clpr = text2fv(test, segmentation_mode= segmentation)
     
        for classification in classifications:
            print(classification)
            
            f.write(f'\n Classification Type: {classification} \n')
            
            classification_type(classification, 
                                X_train, y_train_adu,y_train_clpr, 
                                X_test, y_test_adu, y_test_clpr)


    if segmentation == 'n_grams':

        X_train, y_train_adu, y_train_clpr = text2fv(train, segmentation_mode= segmentation, n_grams=15)
        X_test, y_test_adu, y_test_clpr = text2fv(test, segmentation_mode= segmentation, n_grams=15)
     
        for classification in classifications:
            print(classification)
            
            f.write(f'\n Classification Type: {classification} \n')
            classification_type(classification, 
                                X_train, y_train_adu,y_train_clpr, 
                                X_test, y_test_adu, y_test_clpr)
    
    if segmentation == 'gold_standard':

        X_train, y_train_adu, y_train_clpr = text2fv(train, segmentation_mode= segmentation)
        X_test, y_test_adu, y_test_clpr = text2fv(test, segmentation_mode= segmentation)
     
        for classification in classifications:
            print(classification)
            
            f.write(f'\n Classification Type: {classification} \n')
            classification_type(classification, 
                                X_train, y_train_adu,y_train_clpr, 
                                X_test, y_test_adu, y_test_clpr)

            

def classification_type(classification, X_train, y_train_adu, y_train_clpr, X_test, y_test_adu, y_test_clpr):
    
    if classification == 'binary':
        
        for classifier in classifiers:
            train_test_classifer(classifier, X_train, y_train_adu, X_test, y_test_adu)
    
    if classification  == 'multiclass':

        for classifier in classifiers:
            train_test_classifer(classifier, X_train, y_train_clpr, X_test, y_test_clpr)


    if classification == 'two_binary':

        for classifier in classifiers:
            two_binary_classification(classifier,  X_train, y_train_adu, y_train_clpr, X_test, y_test_adu, y_test_clpr)
            

def two_binary_classification(first_classifier, X_train, y_train_adu, y_train_clpr, X_test, y_test_adu, y_test_clpr):
    
    second_classifiers = ['LR']
    
    print(f'First Classier: {first_classifier}')
    f.write(f'\n First Classier: {first_classifier} \n')
    
    cl1 = train_test_classifer(first_classifier, 
                               X_train, y_train_adu, 
                               X_test, y_test_adu, True)
    for cli_1 in cl1:
        preds_cl1 = cli_1.predict(X_test)
        preds_cl1_adu_index = np.where(preds_cl1=='ADU')

        X_test_cl1_pred_adu = X_test[preds_cl1_adu_index]
        y_test_cl1_pred_adu = y_test_adu[preds_cl1_adu_index]


        for second_classifier in second_classifiers:
            print(f'Second Classier: {second_classifier}')
            
            f.write(f'\n Second Classier: {second_classifier} \n')
        
            clpr_index_train = np.where(y_train_clpr!='Non-ADU')[0]
            clpr_index_test = np.where(y_test_clpr!='Non-ADU')[0]

            X_train_clpr_only = X_train[clpr_index_train].copy()
            X_test_clpr_only = X_test[clpr_index_test].copy()


            y_train_clpr_only = y_train_clpr[clpr_index_train].copy()

            y_test_clpr_only = y_test_clpr[clpr_index_test].copy()

            cl2 = train_test_classifer(second_classifier, X_train_clpr_only,  y_train_clpr_only, X_test, y_test_clpr, True)

            for cli_2 in cl2:

                preds_cl2 = cli_2.predict(X_test_cl1_pred_adu)

                preds_all = preds_cl1.copy()
                preds_all[preds_cl1_adu_index] = preds_cl2
                preds_all 

                print(classification_report(preds_all, y_test_clpr))

                f.write(f'\n Classification Report: {classification_report(preds_all, y_test_clpr)} \n')

def train_test_classifer(classifier, X_train, y_train, X_test, y_test, multiclass = False):

    if classifier == 'LR':
        print("logistic_regression")
        f.write(f'\n Model Logistic Regression: \n')
        return logistic_regression(X_train, y_train, X_test, y_test, multiclass)

    if classifier == 'RF':
        print("random_forest")
        f.write(f'\n Model Random Forest: \n')
        return random_forest(X_train, y_train, X_test, y_test, multiclass)

    if classifier == 'NB':
        print("naive_bayes")
        f.write(f'\n Model Naive Bayes: \n')
        return naive_bayes(X_train, y_train, X_test, y_test, multiclass)

    if classifier == 'XGB':
        print("xgboost")
        f.write(f'\n Model xgboost: \n')
        return xgboost(X_train, y_train, X_test, y_test, multiclass)

    if classifier == 'SVM':
        print("svm")
        f.write(f'\n Model SVM: \n')
        return svm(X_train, y_train, X_test, y_test, multiclass)
            

def logistic_regression(X_train, y_train, X_test, y_test, multiclass = False):
    
    logreg = LogisticRegression(solver='newton-cg')
    logreg.fit(X_train, y_train)
    
    logreg_models = []
    if multiclass:
         logreg_models.append(logreg)
    
    else: 
        preds_lr = logreg.predict(X_test)
        print(classification_report(y_test, preds_lr))
        f.write(f'\n Classification Report: \n {classification_report(y_test, preds_lr)} \n')


    
    ####################################################
    # parameter grid
    parameters = {
        'penalty' : ['l2'], 
        'C'       : np.logspace(-3,3,7),
        'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
    }
    
    clf = GridSearchCV(LogisticRegression(), 
                       param_grid=parameters,
                       scoring='accuracy',
                       cv = 10)
    
    clf.fit(X_train,y_train)
    
    
    if multiclass:
        logreg_models.append(clf) 
        return logreg_models

    else: 
        print("Tuned Hyperparameters: ", clf.best_params_)
        print("Accuracy: ", clf.best_score_)

        preds_lr = clf.predict(X_test)
        print(classification_report(y_test, preds_lr))
        f.write(f'\n Tuned Hyperparameters: {clf.best_params_} \n')
        f.write(f'\n Accuracy: {clf.best_score_} \n')
        f.write(f'\n Classification Report: \n {classification_report(y_test, preds_lr)} \n')

def random_forest(X_train, y_train, X_test, y_test, multiclass = False):
    
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    
    rf_models = []
    if multiclass:
        rf_models.append(rf)
        
    else:
        preds_rf = rf.predict(X_test)
        print(classification_report(y_test, preds_rf))
        f.write(f'\n Classification Report: \n {classification_report(y_test, preds_rf)} \n')
    
    ####################################################
    # parameter grid
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    #print(random_grid)
    
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestClassifier()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)
    
    if multiclass:
        rf_models.append(rf_random)
        return rf_models
        
    else:
        print("Tuned Hyperparameters: ", rf_random.best_params_)
        print("Accuracy: ", rf_random.best_score_)

        preds_rf_random = rf_random.predict(X_test)
        print(classification_report(y_test, preds_rf_random))
        
        f.write(f'\n Tuned Hyperparameters: {rf_random.best_params_} \n')
        f.write(f'\n Accuracy: {rf_random.best_score_} \n')
        f.write(f'\n Classification Report: \n {classification_report(y_test, preds_rf_random)} \n')

def naive_bayes(X_train, y_train, X_test, y_test, multiclass = False):
    
    #NB doesn't have any hyperparameters to tune.
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    
    gnb_models = []
    if multiclass:
        gnb_models.append(gnb)
        return gnb_models
    
    else:
        preds_gnb = gnb.predict(X_test)
        print(classification_report(y_test, preds_gnb))
        f.write(f'\n Classification Report: \n {classification_report(y_test, preds_gnb)} \n')

def xgboost(X_train, y_train, X_test, y_test, multiclass = False):
   
    xgb_model = XGBClassifier()
    xgb_model.fit(X_train, y_train)
    
    xgb_models = []
    if multiclass:
        xgb_models.append(xgb_model)
    
    else:
        y_pred = xgb_model.predict(X_test) 
        print(classification_report(y_test, y_pred))
        f.write(f'\n Classification Report: \n {classification_report(y_test, y_pred)} \n')
    
    ####################################################
    # parameter grid
    param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    }

    # Init classifier
    xgb_cl = XGBClassifier()
    # Init Grid Search
    grid_xgb = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3)
    xgb_grid = grid_xgb.fit(X_train, y_train)
    

    if multiclass:
        xgb_models.append(xgb_grid)
        return xgb_models
        
    else:
        print("Tuned Hyperparameters: ", xgb_grid.best_params_)
        print("Accuracy: ", xgb_grid.best_score_)

        y_pred = xgb_grid.predict(X_test)
        print(classification_report(y_test, y_pred))
        
        f.write(f'\n Tuned Hyperparameters: {xgb_grid.best_params_} \n')
        f.write(f'\n Accuracy: {xgb_grid.best_score_} \n')
        f.write(f'\n Classification Report: \n {classification_report(y_test, y_pred)} \n')



def svm(X_train, y_train, X_test, y_test, multiclass = False):
    
    clf = SVC()
    clf.fit(X_train, y_train)
   
    svc_models = []
    if multiclass:
        svc_models.append(clf)
   
    else:
        y_pred = clf.predict(X_test)
        print(classification_report(y_test, y_pred))
        f.write(f'\n Classification Report: \n {classification_report(y_test, y_pred)} \n')
    
    ####################################################
    # parameter grid
    # defining parameter range
    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

    svm_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

    # fitting the model for grid search
    svm_grid.fit(X_train, y_train)

    if multiclass:
        svc_models.append(svm_grid)
        return svc_models
        
    else:
        print("Tuned Hyperparameters: ", svm_grid.best_params_)
        print("Accuracy: ", svm_grid.best_score_)
        
        y_pred = svm_grid.predict(X_test)
        print(classification_report(y_test, y_pred))
        
        f.write(f'\n Tuned Hyperparameters: {svm_grid.best_params_} \n')
        f.write(f'\n Accuracy: {svm_grid.best_score_} \n')
        f.write(f'\n Classification Report: \n {classification_report(y_test, y_pred)} \n')
        
    
  

In [105]:
pipeline()
f.close()

paragraph
binary
logistic_regression
              precision    recall  f1-score   support

         ADU       0.98      0.99      0.98        88
     Non-ADU       0.00      0.00      0.00         2

    accuracy                           0.97        90
   macro avg       0.49      0.49      0.49        90
weighted avg       0.96      0.97      0.96        90

Tuned Hyperparameters:  {'C': 1000.0, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy:  0.9785714285714284
              precision    recall  f1-score   support

         ADU       0.98      0.99      0.98        88
     Non-ADU       0.00      0.00      0.00         2

    accuracy                           0.97        90
   macro avg       0.49      0.49      0.49        90
weighted avg       0.96      0.97      0.96        90

naive_bayes
              precision    recall  f1-score   support

         ADU       0.98      0.99      0.98        88
     Non-ADU       0.00      0.00      0.00         2

    accuracy             