In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
import re
#import benepar

nlp = spacy.load('en_core_web_md')
#nlp.add_pipe("benepar", config={"model": "benepar_en3"})



from itertools import chain

In [2]:
doc_features = ['num_tokens', 'para_starts']
span_features = ['word_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'is_para_start',
                 'index_in_doc', 'num_claim_indicator', 'num_premise_indicator', 'has_question_mark', 'has_personal_pronoun',
                 'has_possessive_pronoun', 'has_modal_verb', 'is_first_token_gerund', 'tree_depth']

# getters that are not used as features
span_utilities = ['prev_unit', 'idx_start', 'idx_end', ]
# methods
span_methods = ['get_nth_unit', 'get_prev_unit_attr', 'get_label_and_error', 'get_label_clpr', 'get_label']
token_features =['word_emb']



extensions_dict = dict(doc_features=doc_features, span_features=span_features+span_utilities,
                       token_features=token_features, span_methods=span_methods)





def create_extensions(extensions_dict=None, force=True):
    
    # Features that take 'unit' as input refer to the segmentation, they do not work with just any span.
    
    # Property attributes
    
    # Store starting and ending indices of spans in the whole doc
    # 1 list per each document: [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
    Doc.set_extension("units_index_list", default=[],force=True)
    
    # Store essay_id within doc
    Doc.set_extension("essay_id", default=None, force=True)

    
    # Feature Getters
    def get_label_and_error(unit, error_function='percentage_correctness'):
        """
        Inputs: unit

        Outputs: label for the unit and segmentation error

        """

        def overlap_case(unit_start, unit_end, adu_start, adu_end):
            if adu_start >= unit_start and adu_end <= unit_end:
                # Case 1, ADU is fully contained in UNIT
                return 1

            elif adu_start <= unit_start and adu_end <=unit_end and adu_end>=unit_start:

                # Case 2, ADU starts before UNIT, start(Left) of ADU is cut
                return 2

            elif adu_start >= unit_start and adu_end >= unit_end and adu_start<unit_end:

                # Case 3, ADU starts after UNIT, end(Right) of ADU is cut
                return 3

            elif adu_start < unit_start and adu_end > unit_end:

                # Case 4, ADU starts before UNIT and ends after UNIT, both sides of ADU are cut
                return 4

            else: 
                # ADU does not overlap with UNIT
                return False
            

        def percentage_correctness(unit, adu_start, adu_end, overlap_case):

            if overlap_case==2:
                adu_start = unit._.idx_start
            elif overlap_case==3:
                adu_end = unit._.idx_end
            elif overlap_case==4:
                adu_start = unit._.idx_start
                adu_end = unit._.idx_end

            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            pct_correct = adu_ntokens/unit_ntokens
            return pct_correct

        def extended_accuracy(unit, adu_start, adu_end, overlap_case):
            # Compares number of tokens to get the the correct ADU in proportional with UNIT length

            if overlap_case==2:
                adu_start = unit._.idx_start
            if overlap_case==3:
                adu_end = unit._.idx_end
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')

            unit_ntokens = len(unit)
            adu_ntokens = len(adu)
            diff_ntokens = np.abs(unit_ntokens - adu_ntokens)

            return 1/((diff_ntokens+1)**(np.log2(diff_ntokens+1)/np.log2(unit_ntokens+1)))


        if error_function.lower() == 'percentage_correctness':
            err_func = percentage_correctness
        elif error_function.lower() == 'extended_accuracy':
            err_func = extended_accuracy
        
        unit_start = unit._.idx_start
        unit_end = unit._.idx_end

        essay_id = unit.doc._.essay_id

        # DataFrame containing ADUs indices & labels, filtered for current essay_id
        adus_doc = adus[adus['essay_id'] == essay_id]

        ### WORKING 09.02.2022#$$$$$$$$$$$$
        def segmentation_error(unit, adu_start, adu_end, overlap_case, error_function):
            
            adu = unit.doc.char_span(adu_start, adu_end, alignment_mode='expand')
            
            # positive value = too many tokens in segment, unit should be shorter (include less non-adu tokens)
            # negative value = too less tokens in segment, unit should be longer (include more adu tokens)
            
            left_tokens = adu.start - unit.start
            right_tokens = unit.end - adu.end
            
            if error_function.lower() == 'percentage_correctness':
                err_func = percentage_correctness
            elif error_function.lower() == 'extended_accuracy':
                err_func = extended_accuracy

            
            return (left_tokens, err_func(unit, adu_start, adu_end, overlap_case), right_tokens)
            
# v7 returns: (ADU_Type, (left_error_tokens, err_func, right_error_tokens))
        label_and_error = [(row['ADU_type'], segmentation_error(unit, row['start_ind'],row['end_ind'], 
                          overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), error_function),
                          #(row['start_ind'], row['end_ind'])
                           ) 
                         for row_ind, row in adus_doc.iterrows() 
                        # NOT SURE ABOUT <= or < SIGNS
                         if unit_start < row['end_ind'] and unit_end >= row['start_ind']]

            
# v6 returns: (ADU_Type, err_func)
#
#         label_and_error = [(row['ADU_type'], err_func(unit, row['start_ind'],row['end_ind'], 
#                           overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind'])),
#                           #(row['start_ind'], row['end_ind'])
#                            ) 
#                          for row_ind, row in adus_doc.iterrows() 
#                          if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

    #     # Contains information of the ADUs that overlap with the UNIT
    #     # Structure: (adu_start, adu_end, overlap_case, ADU_type)
    #     overlap_adus = [(row['start_ind'],
    #                      row['end_ind'], 
    #                      overlap_case(unit_start, unit_end,row['start_ind'], row['end_ind']), 
    #                      row['ADU_type']) 
    #                      for row_ind, row in adus_doc.iterrows()
    #           if unit_start <= row['end_ind'] and unit_end >= row['start_ind']]

        return label_and_error

    def get_label_clpr(unit, label_mode='clpr', threshold=0):
        # DUPLICATE OF get_label
        error_tuple = unit._.get_label_and_error()

        if len(error_tuple) == 0:
            return "Non-ADU"
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    label = error_tuple[label_position][0]
                elif label_mode=='adu':
                    label = 'ADU'
                    
            else:
                label = "Non-ADU"

            return label
    
    def get_label(unit, label_mode='clpr', threshold=0):
        error_tuple = unit._.get_label_and_error()

        if len(error_tuple) == 0:
            return "Non-ADU"
        else:
            # Get position of label with maximum accuracy
            label_position = np.argmax([error[1] for label, error in error_tuple])
            if error_tuple[label_position][1][1] > threshold:
                if label_mode=='clpr':
                    label = error_tuple[label_position][0]
                elif label_mode=='adu':
                    label = 'ADU'
                    
            else:
                label = "Non-ADU"

            return label

    def _NOT_USED_get_label_adu(span):
        
        # Gets ADU vs non-ADU LABEL for the span (intended only for sentences)

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        #  + len(span[-1]) to get to the end of the last word
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()

    
    def get_idx_start(unit):
        return unit[0].idx
    
    def get_idx_end(unit):
        return unit[-1].idx  + len(unit[-1])
    
    
    def get_para_starts(doc):
        # Units starting with \n or preceding \n are considered as paragraph starts
        # if start is 0, start -1 goes back to the last token of the doc

        # TODO
        # para_ends can be obtained by shifing this list to the right by one position
        
        # PROBLEM! WORKS ONLY FOR SENTENCE SEGMENTATION
        
        return [int(doc[start].text =='\n' or doc[start-1].text=='\n') for start, end in doc._.units_index_list]
    
    def get_is_para_start(unit):
        
        para_starts = unit.doc._.para_starts
        unit_ind = unit._.index_in_doc
        
        return para_starts[unit_ind]
    
    def get_has_personal_pronoun(unit):
        
        return 'PRP' in [token.tag_ for token in unit]
    
    def get_has_possessive_pronoun(unit):
        
        return 'PRP$' in [token.tag_ for token in unit]     
    
    def get_has_modal_verb(unit):
        
        return 'MD' in [token.tag_ for token in unit]            
    
    def get_word_emb(obj):
        return obj.vector
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([1 for adv in conj_advs if adv in span.text.lower()])
    
        
    def get_num_claim_indicator(span):
        claim_indicators = ["accordingly", "as a result", "consequently", "conclude that", "clearly", "demonstrates that", "entails", "follows that", "hence", "however", "implies", "in fact", "in my opinion", "in short", "in conclusion", "indicates that", "it follows that", "it is highly probable that", "it is my contention", "it should be clear that", "I believe", "I mean", "I think", "must be that", "on the contrary", "points to the conclusions", "proves that", "shows that", "so", "suggests that", "the most obvious explanation", "the point I’m trying to make", "therefore", "thus", "the truth of the matter", "to sum up", "we may deduce"]
        
        return sum([1 for c_indicator in claim_indicators if c_indicator in span.text.lower()])
    
    def get_num_premise_indicator(span):
        premise_indicators=["after all", "assuming that", "as", "as indicated by", "as shown", "besides", "because", "deduced", "derived from", "due to", "firstly", "follows from", "for", "for example", "for instance", "for one thing", "for the reason that", "furthermore", "given that", "in addition", "in light of", "in that", "in view of", "in view of the fact that", "indicated by", "is supported by", "may be inferred", "moreover", "owing to", "researchers found that", "secondly", "this can be seen from", "since", "since the evidence is", "what’s more", "whereas",]
        return sum([1 for p_indicator in premise_indicators if p_indicator in span.text.lower()])
    
    def get_is_first_token_gerund(span):
        
        return span[0].tag_ =='VBG'
    
    def get_has_question_mark(span):
        return '?' in span.text

    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    
    def get_tree_depth(unit):
        depths = {}

        def walk_tree(node, depth):
            depths[node.orth_] = depth
            if node.n_lefts + node.n_rights > 0:
                return [walk_tree(child, depth + 1) for child in node.children]

        walk_tree(unit.root, 0)
        return max(depths.values())
    

    def get_index_in_doc(span):
        """Gets index of the segmented unit in the doc"""
        span_start = span.start

        # span end not used yet
        span_end = span.end

        # finds where span_start is in units_index_list [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
        # returns the index of the corresponding span
        return np.where([span.start in range(start, end) for start, end in span.doc._.units_index_list])[0][-1]


    def get_prev_unit(span):

        return span._.get_nth_unit(span._.index_in_doc-1)
    
        
    def get_nth_unit(span, n):

        # Tuple containing the start and end index of the nth span
        span_index = span.doc._.units_index_list[n]

        # Return nth span
        return span.doc[span_index[0]: span_index[1]]

    def get_prev_unit_attr(span, attribute):

        return span._.prev_unit._.get(attribute)

        

    # Iterate list of features and Set Extensions (Just to not manually set extensions one by one)
    
    for feature in extensions_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for method in extensions_dict['span_methods']:
        Span.set_extension(method, force=force, method=locals()[method])


def segmentation(doc=None ,mode = 'sentence', n_grams=15):
    if mode=='paragraph':
        pass
    elif mode=='sentence':
        # segment by sentences
        units = [sent for sent in doc.sents  if not (sent.text.isspace() or sent.text =='')] 
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        return units
    
    elif mode =='n_grams':
        # Code to segment with 15 grams here (average)  
        units = [doc[i:i+n_grams] for i in range(len(doc))]

        doc._.units_index_list = [(unit.start, unit.end) for unit in units]

        return units
    
    elif mode=='clause':
        # Code to segment by clause
        pass
    elif mode=='constituency1':
        # Take the first level subordinating conjunction (SBAR)
        # The first dependent clause
        units = []
        for sent in doc.sents:
            for node in sent._.constituents:

                if "SBAR" in node._.labels:

                    # Before SBAR
                    units.append(sent.doc[sent.start:node.start])
                    # SBAR
                    units.append(sent.doc[node.start:node.end])

                    # After SBAR
                    units.append(sent.doc[node.end:sent.end])

                    # Break out to take only the first SBAR we encounter
                    break
        
        units = [unit for unit in units if unit.text != '']
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units
        
    elif mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]
    elif mode=='gold_standard':
        
        # Segments ADUs according to annotations
        
        adu_inds = adus[adus['essay_id']==doc._.essay_id].sort_values('start_ind')[['start_ind','end_ind']]

        units = []

        start = 0
        for i, row in adu_inds.iterrows():

            # From previous adu end to current adu start (Non-ADU)
            end = row['start_ind']-1

            units.append(doc.char_span(start,end, alignment_mode='expand'))

            start = row['start_ind']
            end = row['end_ind']

            # From current adu start to current adu end
            units.append(doc.char_span(start,end,  alignment_mode='expand'))

            # set current adu end as start for next iteration
            start = row['end_ind']
        
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        
        return units

def unit2fv(unit, feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)


def calculate_segmentation_accuracy(units, error_function='percentage_correctness'):
    
    
    
    start_errors = np.array([])
    segmentation_accs = np.array([])
    end_errors = np.array([])

    for unit in units:
        error_tuple = unit._.get_label_and_error(error_function=error_function)

        if len(error_tuple) != 0:
            label_position = np.argmax([error[1] for label, error in error_tuple])

            start_errors = np.append(start_errors,error_tuple[label_position][1][0])

            segmentation_accs = np.append(segmentation_accs, error_tuple[label_position][1][1])

            end_errors = np.append(end_errors, error_tuple[label_position][1][2])



    start_error = sum((start_errors**2))/len(start_errors)

    end_error = sum((end_errors**2))/len(end_errors)

    segmentation_acc = segmentation_accs.mean()
    
    return (start_error, segmentation_acc, end_error)




# Run
create_extensions(extensions_dict)   



In [3]:
# Optional, not used yet. Trying to solve problem that title gets included with the first sentence
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)
# Not used
def text2doc(text):
    # need to use nlp.pipe here instead
    return nlp(text)

In [4]:
# Pipelinev1

def text2fv(df, segmentation_mode='sentence', label_mode='adu', threshold=0, n_grams=None ,print_segmentation_error = False):
    
    
    
    # Rename to create_training_data?
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    
    if segmentation_mode != "constituency1":
    
        for doc, context in nlp.pipe(data, as_tuples=True, disable=['benepar']):
            doc._.essay_id = context['id']
            docs.append(doc)
    
    else:
        
        for doc, context in nlp.pipe(data, as_tuples=True):
            doc._.essay_id = context['id']
            docs.append(doc)

    segmented_docs = [segmentation(doc, mode=segmentation_mode ,n_grams=n_grams) for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    if print_segmentation_error:
        print(f"Segmentation Mode: {segmentation_mode}\nAccuracy:{calculate_segmentation_accuracy(units)}")

    X_features = span_features
    

    X = np.array([unit2fv(unit, X_features) for unit in units])
    y = np.array([unit._.get_label(label_mode=label_mode, threshold=threshold) for unit in units])

    return X,y 

In [5]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

In [6]:
##Sklearn Models
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report


## Binary Classfiers

#### Simple logistic regression

In [7]:
essays= essays[:5].copy()


train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train)
X_test, y_test = text2fv(test)

In [8]:
logreg = LogisticRegression(solver='newton-cg')
logreg.fit(X_train, y_train)

LogisticRegression(solver='newton-cg')

In [9]:
preds_lr = logreg.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

         ADU       0.91      0.95      0.93        21
     Non-ADU       0.75      0.60      0.67         5

    accuracy                           0.88        26
   macro avg       0.83      0.78      0.80        26
weighted avg       0.88      0.88      0.88        26



#### Gridsearch for logistic regression 

In [10]:
import warnings
warnings.filterwarnings('ignore')
# parameter grid
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

In [11]:
logreg = LogisticRegression()
clf = GridSearchCV(logreg, 
                   param_grid=parameters,
                   scoring='accuracy',
                   cv = 10)

In [12]:
clf.fit(X_train,y_train)


GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='accuracy')

In [13]:
print("Tuned Hyperparameters: ", clf.best_params_)
print("Accuracy: ", clf.best_score_)
preds_lr = clf.predict(X_test)
print(classification_report(y_test, preds_lr))

Tuned Hyperparameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy:  0.82


In [14]:
preds_lr = clf.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

         ADU       0.91      1.00      0.95        21
     Non-ADU       1.00      0.60      0.75         5

    accuracy                           0.92        26
   macro avg       0.96      0.80      0.85        26
weighted avg       0.93      0.92      0.92        26



In [61]:
#logreg = LogisticRegression( C = 0.1, penalty = 'l2' , solver ='liblinear')
logreg = LogisticRegression( C = 0.01, penalty = 'l2' , solver ='newton-cg')


logreg.fit(X_train, y_train)

LogisticRegression(C=0.01, solver='newton-cg')

In [62]:
preds_lr = logreg.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

         ADU       0.91      0.91      0.91        58
     Non-ADU       0.38      0.38      0.38         8

    accuracy                           0.85        66
   macro avg       0.64      0.64      0.64        66
weighted avg       0.85      0.85      0.85        66



### Simple Random Forests

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
preds_rf = rf.predict(X_test)
print(classification_report(y_test, preds_rf))

              precision    recall  f1-score   support

         ADU       0.91      1.00      0.95        58
     Non-ADU       1.00      0.25      0.40         8

    accuracy                           0.91        66
   macro avg       0.95      0.62      0.68        66
weighted avg       0.92      0.91      0.88        66



### RandomizedSearchCV for random forest

In [65]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [66]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 16.7min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [67]:
print("Tuned Hyperparameters: ", rf_random.best_params_)
print("Accuracy: ", rf_random.best_score_)

Tuned Hyperparameters:  {'n_estimators': 600, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
Accuracy:  0.8318096765570703


In [68]:
rf = RandomForestClassifier(n_estimators= 600, 
                            min_samples_split= 10, 
                            min_samples_leaf = 1, 
                            max_features= 'sqrt', 
                            max_depth= 40, 
                            bootstrap= False)
rf.fit(X_train, y_train)
preds_rf = rf.predict(X_test)
print(classification_report(y_test, preds_rf))

              precision    recall  f1-score   support

         ADU       0.90      0.98      0.94        58
     Non-ADU       0.67      0.25      0.36         8

    accuracy                           0.89        66
   macro avg       0.79      0.62      0.65        66
weighted avg       0.88      0.89      0.87        66



## Naive Bayes Classifier

In [69]:
from sklearn.naive_bayes import GaussianNB

In [71]:
#NB doesn't have any hyperparameters to tune.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
preds_gnb = gnb.predict(X_test)
print(classification_report(y_test, preds_gnb))

              precision    recall  f1-score   support

         ADU       0.90      0.93      0.92        58
     Non-ADU       0.33      0.25      0.29         8

    accuracy                           0.85        66
   macro avg       0.62      0.59      0.60        66
weighted avg       0.83      0.85      0.84        66



### XGBOOST Classifier

In [75]:
from xgboost import XGBClassifier


In [79]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ADU       0.95      0.90      0.92        58
     Non-ADU       0.45      0.62      0.53         8

    accuracy                           0.86        66
   macro avg       0.70      0.76      0.72        66
weighted avg       0.89      0.86      0.87        66



### GridSearch XGBOOST

In [82]:
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}


In [83]:
# Init classifier
xgb_cl = XGBClassifier(objective="binary:logistic")

# Init Grid Search
grid_xgb = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

xgb_grid = grid_xgb.fit(X_train, y_train)



In [84]:
print("Tuned Hyperparameters: ", xgb_grid.best_params_)
print("Accuracy: ", xgb_grid.best_score_)

Tuned Hyperparameters:  {'colsample_bytree': 0.5, 'gamma': 0.25, 'learning_rate': 0.01, 'max_depth': 3, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.8}
Accuracy:  0.8844156991215814


In [85]:
xgb_model = XGBClassifier(colsample_bytree = 0.5,
                          gamma= 0.25, 
                          learning_rate = 0.01, 
                          max_depth = 3, 
                          reg_lambda = 0, 
                          scale_pos_weight = 1, 
                          subsample = 0.8)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ADU       0.93      0.95      0.94        58
     Non-ADU       0.57      0.50      0.53         8

    accuracy                           0.89        66
   macro avg       0.75      0.72      0.74        66
weighted avg       0.89      0.89      0.89        66



### SVM Classifier

In [96]:
from sklearn import svm

In [97]:
clf = svm.SVC(kernel='linear')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ADU       0.87      0.95      0.91        21
     Non-ADU       0.67      0.40      0.50         5

    accuracy                           0.85        26
   macro avg       0.77      0.68      0.70        26
weighted avg       0.83      0.85      0.83        26



In [93]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
 
svm_grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
svm_grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.816, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.862, total=   0.1s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.839, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.839, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.860, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.770, total=   0.1s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.805, total=   0.1s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.793, total=   0.1s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.770, total=   0.1s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.837, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.759, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.759, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .

[CV] ..... C=0.1, gamma=0.001, kernel=poly, score=0.759, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=poly .................................
[CV] ..... C=0.1, gamma=0.001, kernel=poly, score=0.759, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=poly .................................
[CV] ..... C=0.1, gamma=0.001, kernel=poly, score=0.770, total=   0.1s
[CV] C=0.1, gamma=0.001, kernel=poly .................................
[CV] ..... C=0.1, gamma=0.001, kernel=poly, score=0.770, total=   0.1s
[CV] C=0.1, gamma=0.001, kernel=poly .................................
[CV] ..... C=0.1, gamma=0.001, kernel=poly, score=0.767, total=   0.1s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.759, total=   0.1s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.759, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] .

[CV] ......... C=1, gamma=0.1, kernel=poly, score=0.805, total=   0.1s
[CV] C=1, gamma=0.1, kernel=poly .....................................
[CV] ......... C=1, gamma=0.1, kernel=poly, score=0.793, total=   0.1s
[CV] C=1, gamma=0.1, kernel=poly .....................................
[CV] ......... C=1, gamma=0.1, kernel=poly, score=0.770, total=   0.1s
[CV] C=1, gamma=0.1, kernel=poly .....................................
[CV] ......... C=1, gamma=0.1, kernel=poly, score=0.837, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.816, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.851, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.885, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .

[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.759, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.759, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.770, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.770, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.767, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.759, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.759, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] .

[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.724, total=   0.2s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.814, total=   0.4s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.793, total=   0.0s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.805, total=   0.1s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.793, total=   0.0s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.770, total=   0.1s
[CV] C=10, gamma=0.01, kernel=poly ...................................
[CV] ....... C=10, gamma=0.01, kernel=poly, score=0.837, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] .

[CV] ....... C=100, gamma=1, kernel=linear, score=0.770, total=   0.3s
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] ....... C=100, gamma=1, kernel=linear, score=0.667, total=   0.3s
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] ....... C=100, gamma=1, kernel=linear, score=0.759, total=   0.3s
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] ....... C=100, gamma=1, kernel=linear, score=0.756, total=   0.4s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.770, total=   0.1s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.805, total=   0.1s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.793, total=   0.1s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] .

[CV] ... C=100, gamma=0.001, kernel=linear, score=0.655, total=   0.5s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.770, total=   0.7s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.667, total=   0.3s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.759, total=   0.3s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.756, total=   0.4s
[CV] C=100, gamma=0.001, kernel=poly .................................
[CV] ..... C=100, gamma=0.001, kernel=poly, score=0.839, total=   0.0s
[CV] C=100, gamma=0.001, kernel=poly .................................
[CV] ..... C=100, gamma=0.001, kernel=poly, score=0.816, total=   0.0s
[CV] C=100, gamma=0.001, kernel=poly .................................
[CV] .

[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.655, total=   0.4s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.770, total=   0.3s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.667, total=   0.3s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.759, total=   0.3s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.756, total=   0.4s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] ...... C=1000, gamma=0.1, kernel=poly, score=0.770, total=   0.1s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] ...... C=1000, gamma=0.1, kernel=poly, score=0.805, total=   0.1s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] .

[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.655, total=   0.4s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.770, total=   0.3s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.667, total=   0.3s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.759, total=   0.3s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.756, total=   0.5s
[CV] C=1000, gamma=0.0001, kernel=poly ...............................
[CV] ... C=1000, gamma=0.0001, kernel=poly, score=0.759, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=poly ...............................
[CV] ... C=1000, gamma=0.0001, kernel=poly, score=0.759, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=poly ...............................
[CV] .

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   44.1s finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             verbose=3)

In [94]:
print("Tuned Hyperparameters: ", svm_grid.best_params_)
print("Accuracy: ", svm_grid.best_score_)

Tuned Hyperparameters:  {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy:  0.8570703020582732


In [99]:
print("Tuned Hyperparameters: ", svm_grid.best_params_)
print("Accuracy: ", svm_grid.best_score_)
clf = svm.SVC(C = 1, 
              gamma = 0.01, 
              kernel = 'rbf')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ADU       0.94      0.88      0.91        58
     Non-ADU       0.42      0.62      0.50         8

    accuracy                           0.85        66
   macro avg       0.68      0.75      0.71        66
weighted avg       0.88      0.85      0.86        66



## Multiclass Classifiers

## Two Binary Classifiers

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [19]:
#Create Feature vector from essays
essays= essays[:30].copy()

def binary_Classifiers(essays):
    
    train = essays[essays['label'] =='train']
    test =essays[essays['label'] =='test']
    
    X_train, y_train = text2fv(train, segmentation_mode='sentence', label_mode='adu')
    X_test, y_test = text2fv(train, segmentation_mode='sentence', label_mode='adu')
    
    
    X_train_clpr, y_train_clpr = text2fv(train, segmentation_mode='sentence', label_mode='clpr')
    X_test_clpr, y_test_clpr = text2fv(train, segmentation_mode='sentence', label_mode='clpr')
    
    
    clpr_index_train = np.where(y_train_clpr!='Non-ADU')[0]
    clpr_index_test = np.where(y_test_clpr!='Non-ADU')[0]

    X_train_clpr_only = X_train[clpr_index_train].copy()
    X_test_clpr_only = X_test[clpr_index_test].copy()


    y_train_clpr_only = y_train_clpr[clpr_index_train].copy()

    y_test_clpr_only = y_test_clpr[clpr_index_test].copy()
    
    return X_test, y_test, X_train, y_train, X_train_clpr_only, y_train_clpr_only,X_test_clpr, y_test_clpr

X_test, y_test, X_train, y_train, X_train_clpr_only, y_train_clpr_only,X_test_clpr, y_test_clpr = binary_Classifiers(essays)

In [16]:
cl1 = LogisticRegression(solver='newton-cg')
cl1.fit(X_train, y_train)

LogisticRegression(solver='newton-cg')

In [17]:
cl2 = LogisticRegression(solver='newton-cg')
cl2.fit(X_train_clpr_only, y_train_clpr_only)

LogisticRegression(solver='newton-cg')

In [20]:
preds_cl1 = cl1.predict(X_test)

preds_cl1_adu_index = np.where(preds_cl1=='ADU')


X_test_cl1_pred_adu = X_test[preds_cl1_adu_index]
y_test_cl1_pred_adu = y_test[preds_cl1_adu_index]


preds_cl2 = cl2.predict(X_test_cl1_pred_adu)

preds_all = preds_cl1.copy()
preds_all[preds_cl1_adu_index] = preds_cl2
preds_all 

print(classification_report(preds_all, y_test_clpr))

              precision    recall  f1-score   support

       Claim       0.65      0.71      0.68       120
     Non-ADU       0.57      0.88      0.69        66
     Premise       0.88      0.71      0.79       248

    accuracy                           0.74       434
   macro avg       0.70      0.77      0.72       434
weighted avg       0.77      0.74      0.74       434



## Pipeline

In [179]:
##Sklearn Models
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')


# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

# essays
essays= essays[:5].copy()
train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']


classifiers = ['LR', 'RF', 'NB', 'XGB', 'SVM']

#segmentations = ['sentence', 'paragraph', 'n_grams', 'clause', 'constituency1', 'token', 'gold_standard']3
#segmentations = ['sentence', 'n_grams', 'constituency1', 'gold_standard']
segmentations = ['sentence']
                 
#classifications = ['binary', 'multiclass', 'two_binary']
classifications = ['multiclass']



def pipeline():
    
    for classification in classifications:
        print(classification)
        classification_type(classification)
        
        
    


def classification_type(classification):
    
    if classification == 'binary':
        
        for segmentation in segmentations:
            print(segmentation)
            X_train, y_train = text2fv(train)            
            X_test, y_test  = text2fv(test)
            train_test_classifer(X_train, y_train, X_test, y_test)
    
    if classification  == 'multiclass':

        for segmentation in segmentations:

            X_train, y_train = text2fv(train, segmentation_mode=segmentation, label_mode='clpr')
            X_test, y_test  = text2fv(test, segmentation_mode=segmentation, label_mode='clpr')
            train_test_classifer(X_train, y_train, X_test, y_test)


    if classification == 'two_binary':

        for segmentation in segmentations:

            X_train, y_train = text2fv(train, segmentation_mode=segmentation, label_mode='adu')
            X_test, y_test  = text2fv(test, segmentation_mode=segmentation, label_mode='adu' )

            
            

def train_test_classifer(X_train, y_train, X_test, y_test):
    
    for classifier in classifiers:
        
        if classifier == 'LR':
            print("logistic_regression")
            logistic_regression(X_train, y_train, X_test, y_test)
        
        if classifier == 'RF':
            print("random_forest")
            random_forest(X_train, y_train, X_test, y_test)
            
        if classifier == 'NB':
            print("naive_bayes")
            naive_bayes(X_train, y_train, X_test, y_test)
        
        if classifier == 'XGB':
            print("xgboost")
            xgboost(X_train, y_train, X_test, y_test)
        
        if classifier == 'SVM':
            print("svm")

            svm(X_train, y_train, X_test, y_test)
            

def logistic_regression(X_train, y_train, X_test, y_test):
    
    logreg = LogisticRegression(solver='newton-cg')
    logreg.fit(X_train, y_train)
    preds_lr = logreg.predict(X_test)
    print(classification_report(y_test, preds_lr))
    
    ####################################################
    # parameter grid
    parameters = {
        'penalty' : ['l1','l2'], 
        'C'       : np.logspace(-3,3,7),
        'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
    }
    
    logreg = LogisticRegression()
    clf = GridSearchCV(logreg, 
                       param_grid=parameters,
                       scoring='accuracy',
                       cv = 10)
    
    clf.fit(X_train,y_train)
    print("Tuned Hyperparameters: ", clf.best_params_)
    print("Accuracy: ", clf.best_score_)
    
    preds_lr = clf.predict(X_test)
    print(classification_report(y_test, preds_lr))
    

def random_forest(X_train, y_train, X_test, y_test):
    
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    preds_rf = rf.predict(X_test)
    print(classification_report(y_test, preds_rf))
    
    ####################################################
    # parameter grid
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    print(random_grid)
    
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestClassifier()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)
    
    print("Tuned Hyperparameters: ", rf_random.best_params_)
    print("Accuracy: ", rf_random.best_score_)
    
    preds_rf_random = rf_random.predict(X_test)
    print(classification_report(y_test, preds_rf_random))

def naive_bayes(X_train, y_train, X_test, y_test):
    
    #NB doesn't have any hyperparameters to tune.
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    preds_gnb = gnb.predict(X_test)
    print(classification_report(y_test, preds_gnb))
    

def xgboost(X_train, y_train, X_test, y_test):
   
    xgb_model = XGBClassifier()
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)

    print(classification_report(y_test, y_pred))
    
    ####################################################
    # parameter grid
    param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
    }

    # Init classifier
    xgb_cl = XGBClassifier()
    # Init Grid Search
    grid_xgb = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3)
    xgb_grid = grid_xgb.fit(X_train, y_train)
    
    
    print("Tuned Hyperparameters: ", xgb_grid.best_params_)
    print("Accuracy: ", xgb_grid.best_score_)

    y_pred = xgb_grid.predict(X_test)

    print(classification_report(y_test, y_pred))
    

def svm(X_train, y_train, X_test, y_test):
    
    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    
    ####################################################
    # parameter grid
    # defining parameter range
    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

    svm_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

    # fitting the model for grid search
    svm_grid.fit(X_train, y_train)
    
    print("Tuned Hyperparameters: ", svm_grid.best_params_)
    print("Accuracy: ", svm_grid.best_score_)

    y_pred = svm_grid.predict(X_test)

    print(classification_report(y_test, y_pred))

In [142]:
pipeline()

binary
sentence
logistic_regression
              precision    recall  f1-score   support

         ADU       0.91      0.95      0.93        21
     Non-ADU       0.75      0.60      0.67         5

    accuracy                           0.88        26
   macro avg       0.83      0.78      0.80        26
weighted avg       0.88      0.88      0.88        26

Tuned Hyperparameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy:  0.82
              precision    recall  f1-score   support

         ADU       0.91      1.00      0.95        21
     Non-ADU       1.00      0.60      0.75         5

    accuracy                           0.92        26
   macro avg       0.96      0.80      0.85        26
weighted avg       0.93      0.92      0.92        26

random_forest
              precision    recall  f1-score   support

         ADU       0.84      1.00      0.91        21
     Non-ADU       1.00      0.20      0.33         5

    accuracy                           0.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.9min finished


Tuned Hyperparameters:  {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
Accuracy:  0.7608695652173914
              precision    recall  f1-score   support

         ADU       0.84      1.00      0.91        21
     Non-ADU       1.00      0.20      0.33         5

    accuracy                           0.85        26
   macro avg       0.92      0.60      0.62        26
weighted avg       0.87      0.85      0.80        26

naive_bayes
              precision    recall  f1-score   support

         ADU       0.83      0.95      0.89        21
     Non-ADU       0.50      0.20      0.29         5

    accuracy                           0.81        26
   macro avg       0.67      0.58      0.59        26
weighted avg       0.77      0.81      0.77        26

xgboost
              precision    recall  f1-score   support

         ADU       0.88      1.00      0.93        21
     Non-ADU       1.00      0.40 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ... C=0.1, gamma=0.01, kernel=sigmoid, score=0.700, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=0.1, gamma=0.01, kernel=sigmoid, score=0.667, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=0.1, gamma=0.01, kernel=sigmoid, score=0.778, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=0.1, gamma=0.01, kernel=sigmoid, score=0.778, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=0.1, gamma=0.01, kernel=sigmoid, score=0.778, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.700, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.556, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] .

[CV] ......... C=1, gamma=0.1, kernel=poly, score=1.000, total=   0.0s
[CV] C=1, gamma=0.1, kernel=poly .....................................
[CV] ......... C=1, gamma=0.1, kernel=poly, score=0.667, total=   0.0s
[CV] C=1, gamma=0.1, kernel=poly .....................................
[CV] ......... C=1, gamma=0.1, kernel=poly, score=0.778, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.700, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.667, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.778, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.778, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .

[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.667, total=   0.0s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.778, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.800, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.667, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=1.000, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.778, total=   0.0s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ......... C=10, gamma=0.1, kernel=rbf, score=0.778, total=   0.0s
[CV] C=10, gamma=0.1, kernel=sigmoid .................................
[CV] .

[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.667, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=1.000, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.778, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.778, total=   0.0s
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=100, gamma=0.1, kernel=sigmoid, score=0.700, total=   0.0s
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=100, gamma=0.1, kernel=sigmoid, score=0.667, total=   0.0s
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] .... C=100, gamma=0.1, kernel=sigmoid, score=0.778, total=   0.0s
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] .

[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.556, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.889, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.889, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=linear ................................
[CV] .... C=1000, gamma=0.1, kernel=linear, score=0.556, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] ...... C=1000, gamma=0.1, kernel=poly, score=0.700, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] ...... C=1000, gamma=0.1, kernel=poly, score=0.667, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] ...... C=1000, gamma=0.1, kernel=poly, score=1.000, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=poly ..................................
[CV] .

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.1s finished


In [193]:
pipeline()

multiclass
logistic_regression
              precision    recall  f1-score   support

       Claim       0.86      0.60      0.71        10
     Non-ADU       0.50      0.60      0.55         5
     Premise       0.62      0.73      0.67        11

    accuracy                           0.65        26
   macro avg       0.66      0.64      0.64        26
weighted avg       0.69      0.65      0.66        26

Tuned Hyperparameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy:  0.585
              precision    recall  f1-score   support

       Claim       0.83      0.50      0.62        10
     Non-ADU       0.60      0.60      0.60         5
     Premise       0.60      0.82      0.69        11

    accuracy                           0.65        26
   macro avg       0.68      0.64      0.64        26
weighted avg       0.69      0.65      0.65        26

random_forest
              precision    recall  f1-score   support

       Claim       0.67      0.40      0.50   

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.8min finished


Tuned Hyperparameters:  {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
Accuracy:  0.6304347826086957
              precision    recall  f1-score   support

       Claim       0.67      0.40      0.50        10
     Non-ADU       1.00      0.20      0.33         5
     Premise       0.47      0.82      0.60        11

    accuracy                           0.54        26
   macro avg       0.71      0.47      0.48        26
weighted avg       0.65      0.54      0.51        26

naive_bayes
              precision    recall  f1-score   support

       Claim       0.71      0.50      0.59        10
     Non-ADU       0.60      0.60      0.60         5
     Premise       0.50      0.64      0.56        11

    accuracy                           0.58        26
   macro avg       0.60      0.58      0.58        26
weighted avg       0.60      0.58      0.58        26

xgboost
              precision    recall  

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.700, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.444, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.778, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.333, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.444, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.700, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.556, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] .

[CV] ...... C=1, gamma=0.1, kernel=sigmoid, score=0.444, total=   0.0s
[CV] C=1, gamma=0.1, kernel=sigmoid ..................................
[CV] ...... C=1, gamma=0.1, kernel=sigmoid, score=0.444, total=   0.0s
[CV] C=1, gamma=0.1, kernel=sigmoid ..................................
[CV] ...... C=1, gamma=0.1, kernel=sigmoid, score=0.444, total=   0.0s
[CV] C=1, gamma=0.1, kernel=sigmoid ..................................
[CV] ...... C=1, gamma=0.1, kernel=sigmoid, score=0.444, total=   0.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.700, total=   0.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.444, total=   0.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.889, total=   0.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] .

[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.778, total=   0.0s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.444, total=   0.0s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.444, total=   0.0s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.700, total=   0.0s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.556, total=   0.0s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.778, total=   0.0s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] ........ C=10, gamma=0.1, kernel=poly, score=0.444, total=   0.0s
[CV] C=10, gamma=0.1, kernel=poly ....................................
[CV] .

[CV] ....... C=100, gamma=1, kernel=linear, score=0.444, total=   0.0s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.700, total=   0.0s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.556, total=   0.0s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.778, total=   0.0s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.444, total=   0.0s
[CV] C=100, gamma=1, kernel=poly .....................................
[CV] ......... C=100, gamma=1, kernel=poly, score=0.444, total=   0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.400, total=   0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .

[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.778, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.556, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.556, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=sigmoid .............................
[CV] . C=100, gamma=0.0001, kernel=sigmoid, score=0.700, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=sigmoid .............................
[CV] . C=100, gamma=0.0001, kernel=sigmoid, score=0.556, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=sigmoid .............................
[CV] . C=100, gamma=0.0001, kernel=sigmoid, score=0.778, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=sigmoid .............................
[CV] . C=100, gamma=0.0001, kernel=sigmoid, score=0.556, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=sigmoid .............................
[CV] .

[CV] . C=1000, gamma=0.001, kernel=sigmoid, score=0.667, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=sigmoid .............................
[CV] . C=1000, gamma=0.001, kernel=sigmoid, score=0.556, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=sigmoid .............................
[CV] . C=1000, gamma=0.001, kernel=sigmoid, score=0.444, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.700, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.444, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.778, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.444, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] .

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.3s finished
