In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
import re
import benepar

nlp = spacy.load('en_core_web_md')
nlp.add_pipe("benepar", config={"model": "benepar_en3"})



from itertools import chain

In [2]:
#from spacy import displacy
#import deplacy

In [3]:
doc_features = ['num_tokens', 'para_starts']
span_features = ['word_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'is_para_start',
                 'index_in_doc']

# getters that are not used as features
span_utilities = ['prev_unit', 'idx_start', 'idx_end', ]
# methods
span_methods = ['get_nth_unit', 'get_prev_unit_attr', 'get_label_and_error', 'get_label_clpr', 'get_label']
token_features =['word_emb']



extensions_dict = dict(doc_features=doc_features, span_features=span_features+span_utilities,
                       token_features=token_features, span_methods=span_methods)





def create_extensions(extensions_dict=None, force=True):
    
    # Features that take 'unit' as input refer to the segmentation, they do not work with just any span.
    
    # Property attributes
    
    # Store starting and ending indices of spans in the whole doc
    # 1 list per each document: [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
    Doc.set_extension("units_index_list", default=[],force=True)
    
    # Store essay_id within doc
    Doc.set_extension("essay_id", default=None, force=True)

    
    # Feature Getters
    def get_label_and_error(unit, error_function='percentage_correctness'):
        """
        Inputs: unit

        Outputs: label for the unit and segmentation error

        """

        def overlap_case(unit_start, unit_end, adu_start, adu_end):
            if adu_start >= unit_start and adu_end <= unit_end:
                # Case 1, ADU is fully contained in UNIT
                return 1

            elif adu_start <= unit_start and adu_end <=unit_end and adu_end>=unit_start:

                # Case 2, ADU starts before UNIT, start(Left) of ADU is cut
                return 2

            elif adu_start >= unit_start and adu_end >= unit_end and adu_start<unit_end:

                # Case 3, ADU starts after UNIT, end(Right) of ADU is cut
                return 3

            elif adu_start < unit_start and adu_end > unit_end:

                # Case 4, ADU starts before UNIT and ends after UNIT, both sides of ADU are cut
                return 4

            else: 
                # ADU does not overlap with UNIT
                return False
            

        def percentage_correctness(unit, adu_start, adu_end, overlap_case):

            if overlap_case==2:
                adu_start = unit._.idx_start
            elif overlap_case==3:
                adu_end = unit._.idx_end
            elif overlap_case==4:
                adu_start = unit._.idx_start
                adu_end = unit._.idx_end

            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            pct_correct = adu_ntokens/unit_ntokens
            return pct_correct

        def extended_accuracy(unit, adu_start, adu_end, overlap_case):
            # Compares number of tokens to get the the correct ADU in proportional with UNIT length

            if overlap_case==2:
                adu_start = unit._.idx_start
            if overlap_case==3:
                adu_end = unit._.idx_end
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            diff_ntokens = np.abs(unit_ntokens - adu_ntokens)

            return 1/((diff_ntokens+1)**(np.log2(diff_ntokens+1)/np.log2(unit_ntokens+1)))


        if error_function.lower() == 'percentage_correctness':
            err_func = percentage_correctness
        elif error_function.lower() == 'extended_accuracy':
            err_func = extended_accuracy
        
        unit_start = unit._.idx_start
        unit_end = unit._.idx_end

        essay_id = unit.doc._.essay_id

        # DataFrame containing ADUs indices & labels, filtered for current essay_id
        adus_doc = adus[adus['essay_id'] == essay_id]

        ### WORKING 09.02.2022#$$$$$$$$$$$$
        def segmentation_error(unit, adu_start, adu_end, overlap_case, error_function):
            
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            
            # positive value = too many tokens in segment, unit should be shorter (include less non-adu tokens)
            # negative value = too less tokens in segment, unit should be longer (include more adu tokens)
            
            left_tokens = adu.start - unit.start
            right_tokens = unit.end - adu.end
            
            if error_function.lower() == 'percentage_correctness':
                err_func = percentage_correctness
            elif error_function.lower() == 'extended_accuracy':
                err_func = extended_accuracy

            
            return (left_tokens, err_func(unit, adu_start, adu_end, overlap_case), right_tokens)
            
# v7 returns: (ADU_Type, (left_error_tokens, err_func, right_error_tokens))
        label_and_error = [(row['ADU_type'], segmentation_error(unit, row['start_ind'],row['end_ind'], 
                          overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), error_function),
                          #(row['start_ind'], row['end_ind'])
                           ) 
                         for row_ind, row in adus_doc.iterrows() 
                        # NOT SURE ABOUT <= or < SIGNS
                         if unit_start < row['end_ind'] and unit_end >= row['start_ind']]

            
# v6 returns: (ADU_Type, err_func)
#
#         label_and_error = [(row['ADU_type'], err_func(unit, row['start_ind'],row['end_ind'], 
#                           overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind'])),
#                           #(row['start_ind'], row['end_ind'])
#                            ) 
#                          for row_ind, row in adus_doc.iterrows() 
#                          if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

    #     # Contains information of the ADUs that overlap with the UNIT
    #     # Structure: (adu_start, adu_end, overlap_case, ADU_type)
    #     overlap_adus = [(row['start_ind'],
    #                      row['end_ind'], 
    #                      overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), 
    #                      row['ADU_type']) 
    #                      for row_ind, row in adus_doc.iterrows()
    #           if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

        return label_and_error

    def get_label_clpr(unit, label_mode='clpr', threshold=0):
        # DUPLICATE OF get_label
        error_tuple = unit._.get_label_and_error()

        if len(error_tuple) == 0:
            return "Non-ADU"
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    label = error_tuple[label_position][0]
                elif label_mode=='adu':
                    label = 'ADU'
                    
            else:
                label = "Non-ADU"

            return label
    
    def get_label(unit, label_mode='clpr', threshold=0):
        error_tuple = unit._.get_label_and_error()

        if len(error_tuple) == 0:
            return "Non-ADU"
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    label = error_tuple[label_position][0]
                elif label_mode=='adu':
                    label = 'ADU'
                    
            else:
                label = "Non-ADU"

            return label

    def _NOT_USED_get_label_adu(span):
        
        # Gets ADU vs non-ADU LABEL for the span (intended only for sentences)

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        #  + len(span[-1]) to get to the end of the last word
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()

    
    def get_idx_start(unit):
        return unit[0].idx
    
    def get_idx_end(unit):
        return unit[-1].idx  + len(unit[-1])
    
    def get_label_pct(span):
        
        
        pass
    
    def get_para_starts(doc):
        # Units starting with \n or preceding \n are considered as paragraph starts
        # if start is 0, start -1 goes back to the last token of the doc

        # TODO
        # para_ends can be obtained by shifing this list to the right by one position
        return [int(doc[start].text =='\n' or doc[start-1].text=='\n') for start, end in doc._.units_index_list]
    
    def get_is_para_start(unit):
        
        para_starts = unit.doc._.para_starts
        unit_ind = unit._.index_in_doc
        
        return para_starts[unit_ind]
        
    
    def get_word_emb(obj):
        return obj.vector
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([len(re.findall(adv, span.text.lower())) for adv in conj_advs])
    
    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    

    def get_index_in_doc(span):
        """Gets index of the segmented unit in the doc"""
        span_start = span.start

        # span end not used yet
        span_end = span.end

        # finds where span_start is in units_index_list [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
        # returns the index of the corresponding span
        return np.where([span.start in range(start, end) for start, end in span.doc._.units_index_list])[0][-1]


    def get_prev_unit(span):

        return span._.get_nth_unit(span._.index_in_doc-1)
    
        
    def get_nth_unit(span, n):

        # Tuple containing the start and end index of the nth span
        span_index = span.doc._.units_index_list[n]

        # Return nth span
        return span.doc[span_index[0]: span_index[1]]

    def get_prev_unit_attr(span, attribute):

        return span._.prev_unit._.get(attribute)
    
    

    # Iterate list of features and Set Extensions (Just to not manually set extensions one by one)
    
    for feature in extensions_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for method in extensions_dict['span_methods']:
        Span.set_extension(method, force=force, method=locals()[method])


def segmentation(doc=None ,mode = 'sentence', n_grams=15):
    if mode=='paragraph':
        pass
    elif mode=='sentence':
        # segment by sentences
        units = [sent for sent in doc.sents  if not (sent.text.isspace() or sent.text =='')] 
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        return units
    
    elif mode =='n_grams':
        # Code to segment with 15 grams here (average)  
        units = [doc[i:i+n_grams] for i in range(len(doc))]

        doc._.units_index_list = [(unit.start, unit.end) for unit in units]

        return units
    
    elif mode=='clause':
        # Code to segment by clause
        pass
    elif mode=='constituency1':
        # Take the first level subordinating conjunction (SBAR)
        # The first dependent clause
        units = []
        for sent in doc.sents:
            for node in sent._.constituents:

                if "SBAR" in node._.labels:

                    # Before SBAR
                    units.append(sent.doc[sent.start:node.start])
                    # SBAR
                    units.append(sent.doc[node.start:node.end])

                    # After SBAR
                    units.append(sent.doc[node.end:sent.end])

                    # Break out to take only the first SBAR we encounter
                    break
        
        units = [unit for unit in units if unit.text != '']
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units
        
    elif mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]
    elif mode=='gold_standard':
        
        # Segments ADUs according to annotations
        
        adu_inds = adus[adus['essay_id']==doc._.essay_id].sort_values('start_ind')[['start_ind','end_ind']]

        units = []

        start = 0
        for i, row in adu_inds.iterrows():

            # From previous adu end to current adu start (Non-ADU)
            end = row['start_ind']-1

            units.append(doc.char_span(start,end, alignment_mode='expand'))

            start = row['start_ind']
            end = row['end_ind']

            # From current adu start to current adu end
            units.append(doc.char_span(start,end,  alignment_mode='expand'))

            # set current adu end as start for next iteration
            start = row['end_ind']
        
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units

def unit2fv(unit, feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)


def calculate_segmentation_accuracy(units, error_function='percentage_correctness'):
    
    
    
    start_errors = np.array([])
    segmentation_accs = np.array([])
    end_errors = np.array([])

    for unit in units:
        error_tuple = unit._.get_label_and_error(error_function=error_function)

        if len(error_tuple) != 0:
            label_position = np.argmax([error[1] for label, error in error_tuple])

            start_errors = np.append(start_errors,error_tuple[label_position][1][0])

            segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])

            end_errors = np.append(end_errors, error_tuple[label_position][1][2])



    start_error = sum((start_errors**2))/len(start_errors)

    end_error = sum((end_errors**2))/len(end_errors)

    segmentation_acc = segmentation_accs.mean()
    
    return (start_error, segmentation_acc, end_error)




# Run
create_extensions(extensions_dict)   


In [4]:
start_errors = np.array([])
segmentation_accs = np.array([])
end_errors = np.array([])

for unit in units:
    error_tuple = unit._.get_label_and_error()

    if len(error_tuple) != 0:
        label_position = np.argmax([error[1] for label, error in error_tuple])
        
        print(error_tuple[label_position])
        print(unit._.idx_start, unit._.idx_end)
        start_errors = np.append(start_errors,error_tuple[label_position][1][0])
        
        segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])
        
        end_errors = np.append(end_errors, error_tuple[label_position][1][2])
        
        

start_error = sum((start_errors**2))/len(start_errors)

end_error = sum((end_errors**2))/len(end_errors)

segmentation_acc = segmentation_accs.mean()


adu24


NameError: name 'units' is not defined

In [5]:
units = segmentation(doc, mode='sentence')
calculate_segmentation_accuracy(units)




NameError: name 'doc' is not defined

In [6]:
units = segmentation(doc, mode='gold_standard')
calculate_segmentation_accuracy(units)

NameError: name 'doc' is not defined

In [7]:
units = segmentation(doc, mode='constituency1')
calculate_segmentation_accuracy(units)

NameError: name 'doc' is not defined

In [8]:
# Optional, not used yet. Trying to solve problem that title gets included with the first sentence
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)
# Not used
def text2doc(text):
    # need to use nlp.pipe here instead
    return nlp(text)

In [38]:
# FOR VIEWING ACCURACY ONLY
def all_docs(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=None):
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
    return docs
    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    



In [50]:
essays = essays.sample(n=50, random_state=42)

essays

Unnamed: 0,essay_id,text,label
173,essay174,Serious or entertaining movies\n\nI prefer the...,train
116,essay117,Can technology alone solve the world's environ...,test
57,essay058,Competition or co-operation-which is better\n\...,train
225,essay226,Success and knowledge\n\nMany people think tha...,train
132,essay133,Nowadays human activities are influenced by co...,train
334,essay335,There have been significant developments in th...,test
42,essay043,"Sporting events easing international tensions,...",train
113,essay114,There will be soon no role for teachers in cla...,train
349,essay350,Internet will end the era of newspapers and ma...,train
77,essay078,Economic development vs environment\n\nCurrent...,train


In [51]:
docs = all_docs(essays)



In [34]:
#units_sentence = all_units(essays, segmentation_mode='sentence')
units_const = all_units(essays, segmentation_mode='constituency1')



In [37]:
calculate_segmentation_accuracy(units_const)

(44.14678899082569, 0.8876541818330964, 104.0137614678899)

In [12]:
# Pipelinev1

def text2fv(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=None):
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
        
    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))

    X_features = span_features
    

    X = np.array([unit2fv(unit, X_features) for unit in units])
    y = np.array([unit._.get_label(label_mode=label_mode, threshold=threshold) for unit in units])

    return X,y 

In [39]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

###### TEST
in_text = essays.iloc[23].text
doc = nlp(in_text)
doc._.essay_id = essays.iloc[23]['essay_id']
adu24 = adus[adus['essay_id'] == doc._.essay_id]
units=segmentation(doc, mode='n_grams', n_grams=15)
units=segmentation(doc, mode='sentence')



In [21]:
# Utility, Delete Later
def print_adus(units):
    for i, u in enumerate(units):
            unit_start = u._.idx_start
            unit_end = u._.idx_end
            
            essay_id = u.doc._.essay_id
            
            
            doc_adus = adus[adus['essay_id'] == essay_id]
             
            lis = [((unit_start, unit_end),(row['start_ind'], row['end_ind'], row['ADU_type'],is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind']))) for row_ind,row in doc_adus.iterrows() if is_adu(unit_start, unit_end, row['start_ind'] ,row['end_ind'])]
            
            
            
            if len(lis)>0:
                print(i, lis)

def verbose_print(units):
    # Detailed Printer
    
    essay_id = units[0].doc._.essay_id
    adu_doc = adus[adus['essay_id']==essay_id]
    for i, u in enumerate(units):
            span_start = u[0].idx
            span_end = u[-1].idx  + len(u[-1])

            lis = [((span_start, span_end),(row['start_ind'], row['end_ind'], row['ADU_type'],
                                            is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])))
                   for row_ind,row in adu_doc.iterrows() if is_adu(span_start, span_end, row['start_ind'] ,row['end_ind'])]
            if len(lis)>0:
                print(i)
                print(lis,"\n")
                print("UNIT:",u,"\n")
                for ind, adu in enumerate(lis):

                    #print(adu[1][2].upper()+':',adu[1][0:2])
                    #print(doc.char_span(*adu[1][0:2]), "\n")
                    label = adu[1][2].upper() 
                    adu_range = adu[1][0:2]
                    adu_status = adu[1][3]
                    print(f'ADU #{ind+1}',label+':',*adu_range, adu_status)
                    print(doc.char_span(*adu_range), "\n")
                print("-----------------\n")
                
def is_adu(unit_start, unit_end, adu_start, adu_end):
    
    if adu_start<=unit_start and adu_end <=unit_start:
        # ADU comes before UNIT
        return False
    elif adu_start>=unit_end and adu_end >=unit_end:
        # ADU comes after UNIT
        return False
    else:
        if adu_start >= unit_start and adu_end <= unit_end:
            #print("Fully Contains ADU")
            return "Full"
        elif adu_start <= unit_start and adu_end <=unit_end:
            
            #print("ADU start is cut")
            return "Start_Cut"
        elif adu_start >= unit_start and adu_end >= unit_end:
            # End of ADU is after UNIT
            return "End_Cut"

        elif adu_start <= unit_start and adu_end >= unit_end:

            # UNIT is smaller than ADU, ADU start and end are cut
            return "Both_Sides_Cut"# Utility, Delete Later


In [22]:
units = segmentation(doc, mode='gold_standard')
# Coding Error Evaluation
start_errors = np.array([])
segmentation_accs = np.array([])
end_errors = np.array([])

for unit in units:
    error_tuple = unit._.get_label_and_error()

    if len(error_tuple) != 0:
        label_position = np.argmax([error[1] for label, error in error_tuple])
        
        print(error_tuple[label_position])
        start_errors = np.append(start_errors,error_tuple[label_position][1][0])
        
        segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])
        
        end_errors = np.append(end_errors, error_tuple[label_position][1][2])
        
        

start_error = sum((start_errors**2))/len(start_errors)

end_error = sum((end_errors**2))/len(end_errors)

segmentation_acc = segmentation_accs.mean()


('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Premise', (0, 1.0, 0))
('Claim', (0, 1.0, 0))
('Claim', (0, 1.0, 0))


In [23]:
# Coding constituency
doc
units = []

for sent in doc.sents:
    for node in sent._.constituents:

        if "SBAR" in node._.labels:
            
            # Before SBAR
            units.append(sent.doc[sent.start:node.start])
            # SBAR
            units.append(sent.doc[node.start:node.end])

            # After SBAR
            units.append(sent.doc[node.end:sent.end])
            
            # Break out to take only the first SBAR we encounter
            break
        
            



In [26]:
# Smaller set
essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train)

X_test, y_test = text2fv(test)


# Classification

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [10]:
logreg = LogisticRegression(solver='newton-cg')
logreg.fit(X_train, y_train)


LogisticRegression(solver='newton-cg')

In [11]:
preds_lr = logreg.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

         ADU       0.91      0.90      0.90        58
     Non-ADU       0.25      0.29      0.27         7

    accuracy                           0.83        65
   macro avg       0.58      0.59      0.59        65
weighted avg       0.84      0.83      0.84        65



# Two Binary Classifiers

In [12]:
# Smaller set + Cl1
essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train, segmentation_mode='sentence', label_mode='adu')

X_test, y_test = text2fv(train, segmentation_mode='sentence', label_mode='adu')


X_train_clpr, y_train_clpr = text2fv(train, segmentation_mode='sentence', label_mode='clpr')

X_test_clpr, y_test_clpr = text2fv(train, segmentation_mode='sentence', label_mode='clpr')


In [14]:
clpr_index_train = np.where(y_train_clpr!='Non-ADU')[0]

clpr_index_test = np.where(y_test_clpr!='Non-ADU')[0]

X_train_clpr_only = X_train[clpr_index_train].copy()
X_test_clpr_only = X_test[clpr_index_test].copy()


y_train_clpr_only = y_train_clpr[clpr_index_train].copy()

y_test_clpr_only = y_test_clpr[clpr_index_test].copy()

In [46]:
cl1 = LogisticRegression(solver='newton-cg')
cl1.fit(X_train, y_train)

LogisticRegression(solver='newton-cg')

In [47]:
cl2 = LogisticRegression(solver='newton-cg')
cl2.fit(X_train_clpr_only, y_train_clpr_only)

LogisticRegression(solver='newton-cg')

In [82]:
preds_cl1 = cl1.predict(X_test)

preds_cl1_adu_index = np.where(preds_cl1=='ADU')


X_test_cl1_pred_adu = X_test[preds_cl1_adu_index]
y_test_cl1_pred_adu = y_test[preds_cl1_adu_index]


preds_cl2 = cl2.predict(X_test_cl1_pred_adu)

preds_all = preds_cl1.copy()
preds_all[preds_cl1_adu_index] = preds_cl2
preds_all 

print(classification_report(preds_all, y_test_clpr))

              precision    recall  f1-score   support

       Claim       0.58      0.68      0.63       111
     Non-ADU       0.59      0.90      0.71        67
     Premise       0.85      0.67      0.75       256

    accuracy                           0.71       434
   macro avg       0.67      0.75      0.70       434
weighted avg       0.74      0.71      0.71       434



In [72]:
preds_all

array(['Premise', 'Claim', 'Premise', 'Premise', 'Premise', 'Premise',
       'Claim', 'Claim', 'Claim', 'Premise', 'Premise', 'Premise',
       'Claim', 'Claim', 'Claim', 'Claim', 'Premise', 'Premise',
       'Premise', 'Claim', 'Claim', 'Premise', 'Premise', 'Premise',
       'Premise', 'Claim', 'Claim', 'Premise', 'Claim', 'Claim',
       'Premise', 'Premise', 'Claim', 'Claim', 'Premise', 'Premise',
       'Premise', 'Claim', 'Premise', 'Claim', 'Premise', 'Premise',
       'Premise', 'Premise', 'Claim', 'Premise', 'Premise', 'Claim',
       'Premise', 'Claim', 'Premise', 'Premise', 'Claim', 'Premise',
       'Premise', 'Premise', 'Claim', 'Premise', 'Premise', 'Claim',
       'Premise', 'Premise', 'Premise', 'Claim', 'Premise', 'Premise',
       'Premise', 'Premise', 'Premise', 'Premise', 'Premise', 'Premise',
       'Premise', 'Premise', 'Premise', 'Premise', 'Claim', 'Premise',
       'Premise', 'Premise', 'Premise', 'Claim', 'Premise', 'Claim',
       'Premise', 'Premise', 'Prem

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [24]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [25]:
preds = rf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 168,   94],
       [  22, 1113]])

In [32]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74       262
           1       0.92      0.98      0.95      1135

    accuracy                           0.92      1397
   macro avg       0.90      0.81      0.85      1397
weighted avg       0.92      0.92      0.91      1397



## CrossValidation

In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone

In [27]:
""" Stochastic Gradient Descent (SGD) classifier, 
This classifier has the advantage of being capable of handling very large datasets efficiently"""
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(random_state=42)

In [47]:
skfolds = StratifiedKFold(n_splits=5, random_state=42)

best_model = None 
precision = 0
for train_index, test_index in skfolds.split(X_train, y_train):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    if precision < n_correct / len(y_pred):
        best_model = clone_clf
        precision = n_correct / len(y_pred)
    print(n_correct / len(y_pred))
    print(confusion_matrix(y_test_fold, y_pred))
    



0.7224231464737794
[[181  53]
 [254 618]]
0.8707052441229657
[[145  89]
 [ 54 818]]
0.8471971066907775
[[110 124]
 [ 45 827]]
0.8426763110307414
[[ 69 164]
 [ 10 863]]
0.8090497737556561
[[183  50]
 [161 711]]


In [48]:
preds = best_model.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 173,   89],
       [  81, 1054]])

In [50]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.68      0.66      0.67       262
           1       0.92      0.93      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.80      0.79      0.80      1397
weighted avg       0.88      0.88      0.88      1397



In [78]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [91]:
svm_clf = LinearSVC(random_state=0, tol=1e-5, verbose=1, max_iter=50000)

In [92]:
svm_clf.fit(X_train, y_train)

[LibLinear]



LinearSVC(max_iter=50000, random_state=0, tol=1e-05, verbose=1)

In [95]:
preds = svm_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 127,  135],
       [  35, 1100]])

In [96]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.78      0.48      0.60       262
           1       0.89      0.97      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.84      0.73      0.76      1397
weighted avg       0.87      0.88      0.87      1397



In [97]:
from sklearn import svm
svm_clf = svm.SVC(kernel='linear')


In [98]:
svm_clf.fit(X_train, y_train)

SVC(kernel='linear')

In [99]:
preds = svm_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 132,  130],
       [  32, 1103]])

In [100]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.80      0.50      0.62       262
           1       0.89      0.97      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.85      0.74      0.78      1397
weighted avg       0.88      0.88      0.87      1397



### Hard Voting 

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [105]:
log_clf = LogisticRegression(solver='newton-cg')
rnd_clf = RandomForestClassifier()
smv_clf = SVC()

In [106]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', smv_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(solver='newton-cg')),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [108]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8869005010737294
RandomForestClassifier 0.9112383679312813
SVC 0.8840372226198998
VotingClassifier 0.9226914817465999


In [114]:
preds = voting_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 176,   86],
       [  22, 1113]])

In [115]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.89      0.67      0.77       262
           1       0.93      0.98      0.95      1135

    accuracy                           0.92      1397
   macro avg       0.91      0.83      0.86      1397
weighted avg       0.92      0.92      0.92      1397



## Bagging and Pasting

In [109]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [110]:
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1)

In [111]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1)

In [112]:
preds = bag_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 179,   83],
       [  61, 1074]])

              precision    recall  f1-score   support

           0       0.75      0.68      0.71       262
           1       0.93      0.95      0.94      1135

    accuracy                           0.90      1397
   macro avg       0.84      0.81      0.83      1397
weighted avg       0.89      0.90      0.90      1397

