In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
nlp = spacy.load("en_core_web_md")
import re

In [2]:
from spacy import displacy
import deplacy

In [3]:
doc_features = ['num_tokens']
span_features = ['word_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct']
token_features =['word_emb']
features_dict = dict(doc_features=doc_features, span_features=span_features, token_features=token_features)





def create_extensions(features_dict=None, force=True):
    
    # Feature Getters

    def get_word_emb(obj):
        return obj.vector
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([len(re.findall(adv, span.text.lower())) for adv in conj_advs])
    
    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "PUNCT"])
    
    # Set Extensions
    
    for feature in features_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in features_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in features_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
create_extensions(features_dict)   

In [4]:
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")

In [5]:
# Optional
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)

In [6]:
def text2doc(text):
    return nlp(text)

In [7]:
def segmentation(doc=None ,mode = 'sentence'):
    if mode=='paragraph':
        pass
    if mode=='sentence':
        return [sent for sent in doc.sents if not (sent.text.isspace() or sent.text =='')] #if not sent.text.isspace()]
    if mode =='avg_n_grams':
        # Code to segment with 15 grams here (aveage)    
        pass
    if mode=='clause':
        # Code to segment by clause
        pass
    if mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]

In [8]:
# # With Indexing
# def segmentation(doc=None ,mode = 'sentence'):
#     if mode=='paragraph':
#         return 
#     if mode=='sentence':
#         return [(i,sent) for i,sent in enumerate(doc.sents)] #if not sent.text.isspace()]
#     if mode =='avg_n_grams':
#         # Code to segment with 15 grams here (aveage)    
#         pass
#     if mode=='clause':
#         # Code to segment by clause
#         pass
#     if mode=='token':
#         return [token for token in doc]# if not token.isspace()]

In [9]:
# RUNNING THE FUNCTIONS
input_text = essays[essays['essay_id'] == 'essay024']['text'].iloc[0]

doc = text2doc(input_text)

units = segmentation(doc=doc, mode='sentence')


In [10]:
list2 = [units[0]._.get(feature) for feature in span_features]


list1 = [units[0]._.word_emb, units[0]._.num_tokens]


In [11]:
def unit2fv(unit,feature_list):
    
    return np.array([unit._.get(feature) for feature in feature_list])
    

In [12]:
fv = list(map(lambda x: unit2fv(x, feature_list=span_features), units))

  return np.array([unit._.get(feature) for feature in feature_list])


In [13]:
fv

[array([array([-7.07123950e-02,  2.28336051e-01, -6.30690828e-02, -1.20215610e-01,
               -3.06833927e-02,  3.11264042e-02, -4.28855093e-03, -1.85282588e-01,
               -8.99868459e-03,  2.13907385e+00, -1.79377705e-01,  2.29657814e-02,
                5.91394193e-02,  6.79127127e-02, -8.22178572e-02, -4.96857017e-02,
               -5.52998930e-02,  1.45510614e+00, -2.02382833e-01, -2.04980448e-02,
               -8.63708649e-03, -6.18563928e-02, -1.46492347e-01, -6.03598822e-03,
                8.59517753e-02,  3.82372625e-02,  4.53616008e-02,  1.74828023e-02,
                2.08504777e-02, -4.77647409e-03, -3.64069082e-02, -5.12647405e-02,
                1.66943055e-02,  6.33748695e-02,  1.49229512e-01, -5.17957509e-02,
                3.97809260e-02,  9.74137560e-02,  3.65610165e-03, -1.58567838e-02,
                1.00446619e-01, -3.61920446e-02, -5.36392629e-02, -1.03529580e-01,
               -3.68291396e-03,  7.55973384e-02, -2.02683613e-01, -5.06095737e-02,
    

In [14]:
adus['essay_id']

0       essay001
1       essay001
2       essay001
3       essay001
4       essay001
          ...   
6084    essay402
6085    essay402
6086    essay402
6087    essay402
6088    essay402
Name: essay_id, Length: 6089, dtype: object

In [15]:
units

[Computer has negative effects to children
 
 Nowadays, thanks to the development of technology, computer is now indispensable to life.,
 Some people think that computer is good for children and it should be used daily by children but some others think differently.,
 In my opinion, the latter opinion is true.,
 
 First, using computer constantly has bad influence on children's eyes.,
 When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted.,
 
 Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life.,
 It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences.,
 For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean gamer, who had a non-stop playing for 3 days.,
 
 

In [16]:
adus

Unnamed: 0,ADU_index,ADU,essay_id,label,start_ind,end_ind,claim_type,ADU_text
0,T1,MajorClaim 503 575\twe should attach more impo...,essay001,train,503,575,MajorClaim,we should attach more importance to cooperatio...
1,T2,MajorClaim 2154 2231\ta more cooperative attit...,essay001,train,2154,2231,MajorClaim,a more cooperative attitudes towards life is m...
2,T3,"Claim 591 714\tthrough cooperation, children c...",essay001,train,591,714,Claim,"through cooperation, children can learn about ..."
3,T4,Premise 716 851\tWhat we acquired from team wo...,essay001,train,716,851,Premise,What we acquired from team work is not only ho...
4,T5,Premise 853 1086\tDuring the process of cooper...,essay001,train,853,1086,Premise,"During the process of cooperation, children ca..."
...,...,...,...,...,...,...,...,...
6084,T11,Premise 1275 1339\tindirectly they will learn ...,essay402,train,1275,1339,Premise,indirectly they will learn how to socialize ea...
6085,T12,Premise 1341 1388\tThat will make children get...,essay402,train,1341,1388,Premise,That will make children getting lots of friends
6086,T13,Premise 1393 1436\tthey can contribute positiv...,essay402,train,1393,1436,Premise,they can contribute positively to community
6087,T14,Premise 1448 1525\tplaying sport makes childre...,essay402,train,1448,1525,Premise,playing sport makes children getting healthy a...


# Classification

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
logreg = LogisticRegression()

In [21]:
units[0]

Computer has negative effects to children

Nowadays, thanks to the development of technology, computer is now indispensable to life.

In [22]:
input_text

"Computer has negative effects to children\n\nNowadays, thanks to the development of technology, computer is now indispensable to life. Some people think that computer is good for children and it should be used daily by children but some others think differently. In my opinion, the latter opinion is true.\nFirst, using computer constantly has bad influence on children's eyes. When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted.\nMoreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life. It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences. For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean gamer, who had a non-stop playing for 3 days.\nFinally, even peopl

In [26]:
essays['text'].iloc[201]


'Some important characteristics of a co-worker\n\nAt work, you meet people with diverse backgrounds and you are required to maintain sound relationships with many of them in order to have a pleasant work environment. However, the relationship you need to maintain with your co-worker is the most important one since he or she is the person who works very closely with you, and in most instances you have to rely on him or her. Therefore in my view, a good co-worker needs to have three most important characteristics such as cooperation, trustworthiness, and fellowship.\nThe co-workers should be cooperative with one another. If they do not cooperate, there will be lots of disputes. I once had a co-worker who was too family conscious. Therefore he always left office sharp on time without giving any consideration whether the work has been finished or not. Most of the time, I had to do his work in order to protect him from getting caught to the supervisor who was a very authoritative person. Ho

In [28]:
adus[adus['essay_id'] == 'essay001']

Unnamed: 0,ADU_index,ADU,essay_id,label,start_ind,end_ind,claim_type,ADU_text
0,T1,MajorClaim 503 575\twe should attach more impo...,essay001,train,503,575,MajorClaim,we should attach more importance to cooperatio...
1,T2,MajorClaim 2154 2231\ta more cooperative attit...,essay001,train,2154,2231,MajorClaim,a more cooperative attitudes towards life is m...
2,T3,"Claim 591 714\tthrough cooperation, children c...",essay001,train,591,714,Claim,"through cooperation, children can learn about ..."
3,T4,Premise 716 851\tWhat we acquired from team wo...,essay001,train,716,851,Premise,What we acquired from team work is not only ho...
4,T5,Premise 853 1086\tDuring the process of cooper...,essay001,train,853,1086,Premise,"During the process of cooperation, children ca..."
5,T6,Premise 1088 1191\tAll of these skills help th...,essay001,train,1088,1191,Premise,All of these skills help them to get on well w...
6,T7,Claim 1332 1376\tcompetition makes the society...,essay001,train,1332,1376,Claim,competition makes the society more effective
7,T8,Premise 1212 1301\tthe significance of competi...,essay001,train,1212,1301,Premise,the significance of competition is that how to...
8,T9,Premise 1387 1492\twhen we consider about the ...,essay001,train,1387,1492,Premise,when we consider about the question that how t...
9,T10,Premise 1549 1846\tTake Olympic games which is...,essay001,train,1549,1846,Premise,Take Olympic games which is a form of competit...
