In [1]:
import re

import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
# Load the spaCy pipeline named "en_core_web_md" once at module level;
# text2doc() calls this shared `nlp` object for every text it parses.
nlp = spacy.load("en_core_web_md")

In [2]:
from spacy import displacy
import deplacy

In [3]:
# Load the two input tables from CSV. The paths are relative, so they resolve
# against the directory the notebook kernel is started in.
# `essays` is later filtered on its 'essay_id' column and read through its 'text' column.
essays = pd.read_csv("../data/output_csv/essays.csv")
# NOTE(review): `adus` is loaded here but not used in the visible cells — confirm it is needed.
adus = pd.read_csv("../data/output_csv/adus.csv")

In [4]:
def text2doc(text):
    """Parse ``text`` with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    The object produced by calling ``nlp(text)``.
    """
    parsed = nlp(text)
    return parsed

In [5]:
def segmentation(doc=None, mode='sentence'):
    """Split ``doc`` into a list of units chosen by ``mode`` (un-indexed variant).

    NOTE: a later cell defines ``segmentation`` again (returning indexed
    sentences); once that cell has run, this definition is shadowed.

    Parameters
    ----------
    doc : optional
        Parsed document. Must expose ``.sents`` for ``'sentence'`` mode and be
        iterable for ``'token'`` mode.
    mode : str, default 'sentence'
        ``'sentence'`` -> list of the items in ``doc.sents``.
        ``'token'``    -> list of the items yielded by iterating ``doc``.
        ``'paragraph'``, ``'avg_n_grams'`` and ``'clause'`` are not implemented.

    Returns
    -------
    list or None
        ``None`` for the unimplemented modes and for any other ``mode`` value.
    """
    if mode == 'sentence':
        return list(doc.sents)
    elif mode == 'token':
        return list(doc)
    # Placeholders ('paragraph', 'avg_n_grams' for ~15-token chunks, 'clause')
    # and unrecognised modes all fall through here.
    return None

In [6]:
# With indexing: sentence units are returned as (index, sentence) pairs.
def segmentation(doc=None, mode='sentence'):
    """Split ``doc`` into units chosen by ``mode``.

    Parameters
    ----------
    doc : optional
        Parsed document. Must expose ``.sents`` for ``'sentence'`` mode and be
        iterable for ``'token'`` mode. It is not touched by the other modes.
    mode : str, default 'sentence'
        ``'sentence'``    -> ``[(0, sent0), (1, sent1), ...]`` over ``doc.sents``.
        ``'token'``       -> list of the items yielded by iterating ``doc``
                             (returned without an index).
        ``'paragraph'``, ``'avg_n_grams'`` (average ~15-token chunks) and
        ``'clause'`` are recognised but not implemented yet.

    Returns
    -------
    list or None
        ``None`` for the recognised-but-unimplemented modes.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the recognised names. Previously a typo such
        as ``'sentences'`` silently returned ``None``.
    """
    known_modes = ('paragraph', 'sentence', 'avg_n_grams', 'clause', 'token')
    if mode not in known_modes:
        raise ValueError(
            f"Unknown segmentation mode {mode!r}; expected one of {known_modes}"
        )

    if mode == 'sentence':
        return list(enumerate(doc.sents))
    if mode == 'token':
        return list(doc)

    # 'paragraph', 'avg_n_grams' and 'clause' are placeholders for now.
    return None

In [7]:
# Lower-case conjunctive adverbs and adverbial phrases. The num_conj_adv
# getter searches for each entry in the lower-cased text of a span.
conj_advs = [
    'moreover', 'incidentally', 'next', 'yet', 'finally',
    'then', 'for example', 'thus', 'accordingly', 'namely',
    'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all',
    'lately', 'hence', 'still', 'therefore', 'in addition',
    'indeed', 'again', 'so', 'nevertheless', 'besides',
    'instead', 'for instance', 'certainly', 'however', 'anyway',
    'further', 'furthermore', 'similarly', 'now', 'in conclusion',
    'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently',
]

In [8]:
# Feature names to register per spaCy container type. Every name <f> listed
# here needs a matching getter get_<f> inside create_extensions().
doc_features = ['num_tokens']
span_features = [
    'word_emb',
    'num_tokens',
    'num_verbs',
    'num_pos_pronouns',
    'num_conj_adv',
    'num_punct',
]
token_features = ['word_emb']

# Bundle the three lists under the keys create_extensions() looks up.
features_dict = {
    'doc_features': doc_features,
    'span_features': span_features,
    'token_features': token_features,
}





def create_extensions(features_dict=None, force=True):
    """Register getter-based custom attributes on spaCy ``Doc``, ``Span`` and ``Token``.

    Parameters
    ----------
    features_dict : dict, optional
        Maps ``'doc_features'``, ``'span_features'`` and ``'token_features'``
        to iterables of feature names. Each name must be one of the getters
        defined below (``word_emb``, ``num_tokens``, ``num_verbs``,
        ``num_pos_pronouns``, ``num_pron``, ``num_conj_adv``, ``num_punct``).
        ``None`` or a missing key registers nothing for that container type.
    force : bool, default True
        Forwarded unchanged to ``set_extension`` (spaCy uses it to allow
        overwriting an extension that is already registered, which keeps this
        cell re-runnable).

    Raises
    ------
    KeyError
        If a requested feature name has no getter.
    """

    # ---- Feature getters -------------------------------------------------

    def get_word_emb(obj):
        # Vector of the Doc/Span/Token as provided by the loaded pipeline.
        return obj.vector

    def get_num_tokens(obj):
        return len(obj)

    def get_num_verbs(span):
        return sum(1 for token in span if token.pos_ == "VERB")

    def get_num_pos_pronouns(span):
        # "PRP$" is the fine-grained tag compared against here (possessive pronouns).
        return sum(1 for token in span if token.tag_ == "PRP$")

    def get_num_pron(span):
        # Defined for completeness; not part of the default feature lists.
        return sum(1 for token in span if token.pos_ == "PRON")

    def get_num_conj_adv(span):
        # Count whole-word/phrase occurrences only. A bare substring search
        # would count 'so' inside "also"/"person", 'now' inside "know" and
        # 'further' inside "furthermore". `conj_advs` is the module-level list
        # and is read each time the attribute is accessed.
        lowered = span.text.lower()
        return sum(
            len(re.findall(rf"\b{re.escape(adv)}\b", lowered))
            for adv in conj_advs
        )

    def get_num_punct(span):
        # "PUNCT" is a coarse-grained part-of-speech value, so compare against
        # pos_ (as get_num_verbs does). The previous tag_ comparison never
        # matched the fine-grained English punctuation tags and always gave 0.
        return sum(1 for token in span if token.pos_ == "PUNCT")

    # Explicit name -> getter table (replaces a fragile locals() lookup).
    getters = {
        'word_emb': get_word_emb,
        'num_tokens': get_num_tokens,
        'num_verbs': get_num_verbs,
        'num_pos_pronouns': get_num_pos_pronouns,
        'num_pron': get_num_pron,
        'num_conj_adv': get_num_conj_adv,
        'num_punct': get_num_punct,
    }

    # ---- Register extensions ---------------------------------------------

    requested = features_dict or {}
    targets = (
        ('doc_features', Doc),
        ('span_features', Span),
        ('token_features', Token),
    )
    for key, container in targets:
        for feature in requested.get(key, ()):
            if feature not in getters:
                raise KeyError(
                    f"No getter defined for feature {feature!r} (requested in {key!r}); "
                    f"available: {sorted(getters)}"
                )
            container.set_extension(feature, force=force, getter=getters[feature])
        
# Register every feature listed in `features_dict` as a spaCy extension
# attribute on Doc / Span / Token (force defaults to True in create_extensions).
create_extensions(features_dict)   

In [9]:
# Demo run: pick the text of essay 'essay024', parse it, and split the
# parsed document into sentence-level units.
input_text = essays.loc[essays['essay_id'] == 'essay024', 'text'].iloc[0]
doc = text2doc(input_text)
units = segmentation(doc=doc, mode='sentence')
