In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
nlp = spacy.load("en_core_web_md")
import re

from itertools import chain

In [2]:
#from spacy import displacy
#import deplacy

In [3]:
doc_features = ['num_tokens', 'para_starts']
span_features = ['word_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'is_para_start',
                 'index_in_doc']

# getters that are not used as features
span_utilities = ['prev_unit', 'label', 'label_pct']
# methods
span_methods = ['get_nth_unit', 'get_prev_unit_attr']
token_features =['word_emb']



extensions_dict = dict(doc_features=doc_features, span_features=span_features+span_utilities,
                       token_features=token_features, span_methods=span_methods)





def create_extensions(extensions_dict=None, force=True):
    
    # Features that take 'unit' as input refer to the segmentation, they do not work with just any span.
    
    # Property attributes
    
    # Store starting and ending indices of spans in the whole doc
    # 1 list per each document: [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
    Doc.set_extension("units_index_list", default=[],force=True)
    
    # Store essay_id within doc
    Doc.set_extension("essay_id", default=None, force=True)

    
    # Feature Getters
    
    def get_label(span):
        
        # Gets ADU vs non-ADU LABEL for the span (intended only for sentences)

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        #  + len(span[-1]) to get to the end of the last word
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()
    
    
    def get_label_pct(span):
        
        
        pass
    
    def get_para_starts(doc):
        # Units starting with \n or preceding \n are considered as paragraph starts
        # if start is 0, start -1 goes back to the last token of the doc

        # TODO
        # para_ends can be obtained by shifing this list to the right by one position
        return [int(doc[start].text =='\n' or doc[start-1].text=='\n') for start, end in doc._.units_index_list]
    
    def get_is_para_start(unit):
        
        para_starts = unit.doc._.para_starts
        unit_ind = unit._.index_in_doc
        
        return para_starts[unit_ind]
        
    
    def get_word_emb(obj):
        return obj.vector
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([len(re.findall(adv, span.text.lower())) for adv in conj_advs])
    
    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    

    def get_index_in_doc(span):
        """Gets index of the segmented unit in the doc"""
        span_start = span.start

        # span end not used yet
        span_end = span.end

        # finds where span_start is in units_index_list [(s1_start, s1_end), (s2_start, s2_end),.., (sn_start, sn_end)]
        # returns the index of the corresponding span
        return np.where([span.start in range(start, end) for start, end in span.doc._.units_index_list])[0][-1]


    def get_prev_unit(span):

        return span._.get_nth_unit(span._.index_in_doc-1)
    
        
    def get_nth_unit(span, n):

        # Tuple containing the start and end index of the nth span
        span_index = span.doc._.units_index_list[n]

        # Return nth span
        return span.doc[span_index[0]: span_index[1]]



    def get_prev_unit_attr(span, attribute):

        return span._.prev_unit._.get(attribute)
    
    

    # Iterate list of features and Set Extensions (Just to not manually set extensions one by one)
    
    for feature in extensions_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in extensions_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for method in extensions_dict['span_methods']:
        Span.set_extension(method, force=force, method=locals()[method])


def segmentation(doc=None ,mode = 'sentence', n_grams=15):
    if mode=='paragraph':
        pass
    if mode=='sentence':
        # segment by sentences
        units = [sent for sent in doc.sents  if not (sent.text.isspace() or sent.text =='')] 
        
        # keep track of (start, end) of units in doc object
        doc._.units_index_list = [(unit.start, unit.end) for unit in units]
        return units
    
    if mode =='n_grams':
        # Code to segment with 15 grams here (average)  
        units = [doc[i:i+n_grams] for i in range(len(doc))]

        doc._.units_index_list = [(unit.start, unit.end) for unit in units]

        return units
    
    if mode=='clause':
        # Code to segment by clause
        pass
    if mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]

def unit2fv(unit, feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)

# Run
create_extensions(extensions_dict)   


In [4]:
# Optional, not used yet. Trying to solve problem that title gets included with the first sentence
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)
# Not used
def text2doc(text):
    # need to use nlp.pipe here instead
    return nlp(text)

In [5]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

In [14]:

###### TEST
in_text = essays.iloc[23].text
doc = nlp(in_text)
units=segmentation(doc)
doc._.para_starts
adu24 = adus[adus['essay_id'] == 'essay024']
adu24

doc._.essay_id = 'essay024'

#units = segmentation(doc, mode="n_grams")


units[5]._.index_in_doc


5

In [11]:
# Pipelinev1

def text2fv(df):
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
        
    segmented_docs = [segmentation(doc, mode='sentence') for doc in docs]
    
    # Flatten lists (Dissolve docs boundaries and store all units together in one huge list)
    units = list(chain.from_iterable(segmented_docs))
    
    X_features = span_features
    

    X = np.array([unit2fv(unit, X_features) for unit in units])
    y = np.array([int(unit._.label) for unit in units])
    
    return X,y 

In [None]:
# train = essays[essays['label'] =='train']
# test =essays[essays['label'] =='test']

# X_train, y_train = text2fv(train)

# X_test, y_test = text2fv(test)



In [14]:
# Smaller set

essays= essays[:30].copy()

train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train)

X_test, y_test = text2fv(test)


# Classification

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
logreg = LogisticRegression(solver='newton-cg')
logreg.fit(X_train, y_train)


LogisticRegression(solver='newton-cg')

In [17]:
preds_lr = logreg.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

           0       0.40      0.50      0.44         8
           1       0.93      0.90      0.91        58

    accuracy                           0.85        66
   macro avg       0.66      0.70      0.68        66
weighted avg       0.86      0.85      0.86        66



In [23]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [24]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [25]:
preds = rf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 168,   94],
       [  22, 1113]])

In [32]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74       262
           1       0.92      0.98      0.95      1135

    accuracy                           0.92      1397
   macro avg       0.90      0.81      0.85      1397
weighted avg       0.92      0.92      0.91      1397



## CrossValidation

In [26]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone

In [27]:
""" Stochastic Gradient Descent (SGD) classifier, 
This classifier has the advantage of being capable of handling very large datasets efficiently"""
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(random_state=42)

In [47]:
skfolds = StratifiedKFold(n_splits=5, random_state=42)

best_model = None 
precision = 0
for train_index, test_index in skfolds.split(X_train, y_train):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    if precision < n_correct / len(y_pred):
        best_model = clone_clf
        precision = n_correct / len(y_pred)
    print(n_correct / len(y_pred))
    print(confusion_matrix(y_test_fold, y_pred))
    



0.7224231464737794
[[181  53]
 [254 618]]
0.8707052441229657
[[145  89]
 [ 54 818]]
0.8471971066907775
[[110 124]
 [ 45 827]]
0.8426763110307414
[[ 69 164]
 [ 10 863]]
0.8090497737556561
[[183  50]
 [161 711]]


In [48]:
preds = best_model.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 173,   89],
       [  81, 1054]])

In [50]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.68      0.66      0.67       262
           1       0.92      0.93      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.80      0.79      0.80      1397
weighted avg       0.88      0.88      0.88      1397



In [78]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [91]:
svm_clf = LinearSVC(random_state=0, tol=1e-5, verbose=1, max_iter=50000)

In [92]:
svm_clf.fit(X_train, y_train)

[LibLinear]



LinearSVC(max_iter=50000, random_state=0, tol=1e-05, verbose=1)

In [95]:
preds = svm_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 127,  135],
       [  35, 1100]])

In [96]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.78      0.48      0.60       262
           1       0.89      0.97      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.84      0.73      0.76      1397
weighted avg       0.87      0.88      0.87      1397



In [97]:
from sklearn import svm
svm_clf = svm.SVC(kernel='linear')


In [98]:
svm_clf.fit(X_train, y_train)

SVC(kernel='linear')

In [99]:
preds = svm_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 132,  130],
       [  32, 1103]])

In [100]:
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.80      0.50      0.62       262
           1       0.89      0.97      0.93      1135

    accuracy                           0.88      1397
   macro avg       0.85      0.74      0.78      1397
weighted avg       0.88      0.88      0.87      1397



### Hard Voting 

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [105]:
log_clf = LogisticRegression(solver='newton-cg')
rnd_clf = RandomForestClassifier()
smv_clf = SVC()

In [106]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', smv_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(solver='newton-cg')),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [108]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8869005010737294
RandomForestClassifier 0.9112383679312813
SVC 0.8840372226198998
VotingClassifier 0.9226914817465999


In [114]:
preds = voting_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 176,   86],
       [  22, 1113]])

In [115]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.89      0.67      0.77       262
           1       0.93      0.98      0.95      1135

    accuracy                           0.92      1397
   macro avg       0.91      0.83      0.86      1397
weighted avg       0.92      0.92      0.92      1397



## Bagging and Pasting

In [109]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [110]:
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1)

In [111]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1)

In [112]:
preds = bag_clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 179,   83],
       [  61, 1074]])

              precision    recall  f1-score   support

           0       0.75      0.68      0.71       262
           1       0.93      0.95      0.94      1135

    accuracy                           0.90      1397
   macro avg       0.84      0.81      0.83      1397
weighted avg       0.89      0.90      0.90      1397

