In [1]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
nlp = spacy.load("en_core_web_md")
import re

from itertools import chain

In [2]:
from spacy import displacy
import deplacy

In [148]:
doc_features = ['num_tokens']
span_features = ['word_emb', 'num_tokens', 'num_verbs', 'num_pos_pronouns', 'num_conj_adv', 'num_punct', 'label']
token_features =['word_emb']
features_dict = dict(doc_features=doc_features, span_features=span_features, token_features=token_features)





def create_extensions(features_dict=None, force=True):
    
    
    

    
    # Feature Getters
    
    def get_label(span):
        
        # ADU vs non-ADU LABEL for the span

        # Works if the span is larger or equal to the adu

        # TODO:
        # DOES NOT WORK IF SPAN IS SMALLER THAN ADU, OR IF ADU IS SPLIT BETWEEN TWO SPANS (NEEDS MORE WORK!!!)
        # CLAIM VS PREMISE
        essay_id = span.doc._.essay_id

        span_start = span[0].idx
        span_end = span[-1].idx  + len(span[-1])
        start_inds = adus[adus['essay_id'] == essay_id ]['start_ind'].values
        end_inds = adus[adus['essay_id'] == essay_id ]['end_ind'].values

        # Checks if starting index of span is smaller than ADU and the ending index of the span is larger than the ADU
        return ((start_inds >= span_start) & (end_inds <= span_end)).any()
    

    def get_word_emb(obj):
        return obj.vector
    
    def get_num_tokens(obj):
        return len(obj)
    
    def get_num_verbs(span):
        return sum([1 for token in span if token.pos_ == "VERB"])

    def get_num_pos_pronouns(span):
        return sum([1 for token in span if token.tag_ == "PRP$"])

    def get_num_pron(span):
        return sum([1 for token in span if token.pos_ == "PRON"])
    
    def get_num_conj_adv(span):
        conj_advs = ['moreover', 'incidentally', 'next', 'yet', 'finally', 'then', 'for example', 'thus', 'accordingly', 'namely', 'meanwhile', 'that is', 'also', 'undoubtedly', 'all in all', 'lately', 'hence', 'still', 'therefore', 'in addition', 'indeed', 'again', 'so', 'nevertheless', 'besides', 'instead', 'for instance', 'certainly', 'however', 'anyway', 'further', 'furthermore', 'similarly', 'now', 'in conclusion', 'nonetheless', 'thereafter', 'likewise', 'otherwise', 'consequently']
        return sum([len(re.findall(adv, span.text.lower())) for adv in conj_advs])
    
    def get_num_punct(span):
        return sum([1 for token in span if token.tag_ == "."])
    
    # Set Extensions
    
    for feature in features_dict['doc_features']:
        Doc.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in features_dict['span_features']:
        Span.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
    for feature in features_dict['token_features']:
        Token.set_extension(feature, force=force, getter=locals()[f"get_{feature}"])
        
create_extensions(features_dict)   
Doc.set_extension("essay_id", default=None, force=True)


In [4]:
# Optional
def add_full_stops(text):
    """adds full stops to texts that end with \n missing full stops"""
    return re.sub("\n+(?!\.)",'.\n', text)

In [5]:
def text2doc(text):
    return nlp(text)

In [6]:
def segmentation(doc=None ,mode = 'sentence'):
    if mode=='paragraph':
        pass
    if mode=='sentence':
        return [sent for sent in doc.sents if not (sent.text.isspace() or sent.text =='')] #if not sent.text.isspace()]
    if mode =='avg_n_grams':
        # Code to segment with 15 grams here (aveage)    
        pass
    if mode=='clause':
        # Code to segment by clause
        pass
    if mode=='token':
        return [token for token in doc if not (token.text.isspace() or token.text =='')]

In [7]:
def unit2fv(unit,feature_list):
    
    fv = np.array([unit._.get(feature) for feature in feature_list], dtype='object')
    
    _fv = np.array([np.reshape(feature, -1) for feature in fv], dtype='object')
    
    return np.concatenate(_fv)

In [8]:
# INPUTS 
essays = pd.read_csv("../data/output_csv/essays.csv")
adus = pd.read_csv("../data/output_csv/adus.csv")


In [126]:
# # TEST INPUT 

# data = [(row['text'], dict(id=row['essay_id'])) for ind, row in essays.iterrows()]
# docs = []
# data
# for doc, context in nlp.pipe(data, as_tuples=True):
#     doc._.essay_id = context['id']
#     docs.append(doc)
    

In [127]:
# # Flattening list of docs
# segmented_docs = [segmentation(doc, mode='sentence') for doc in docs]

# segmented_docs


# units = list(chain.from_iterable(segmented_docs))


In [128]:
# X_features = span_features[:-1]
# X_features

# X = [unit2fv(unit, X_features) for unit in units]
# y = [int(unit._.label) for unit in units]

In [60]:
# Pipelinev1

def text2fv(df):
    data = [(row['text'], dict(id=row['essay_id'])) for ind, row in df.iterrows()]
    docs = []
    data
    for doc, context in nlp.pipe(data, as_tuples=True):
        doc._.essay_id = context['id']
        docs.append(doc)
        
    segmented_docs = [segmentation(doc, mode='sentence') for doc in docs]

    units = list(chain.from_iterable(segmented_docs))
    
    X_features = span_features[:-1]

    X = np.array([unit2fv(unit, X_features) for unit in units])
    y = np.array([int(unit._.label) for unit in units])
    
    return X,y 

In [61]:
train = essays[essays['label'] =='train']
test =essays[essays['label'] =='test']

X_train, y_train = text2fv(train)

X_test, y_test = text2fv(test)

#fv_all = text2fv(essays)

In [41]:
from sklearn.model_selection import train_test_split


In [51]:
X_train, X_test, y_train, y_test = train_test_split( fv_all[0], fv_all[1], test_size=0.33, random_state=42)

In [62]:
X_train.shape

(5442, 305)

In [63]:
X_test.shape

(1384, 305)

In [120]:
# CONTEXTUAL TEST
input_text = essays['text'].iloc[0]

doc = nlp(input_text)

units = segmentation(doc)

def seg_test(doc):
    Span.set_extension('index_in_doc', default=None, force=True)
    sents = []
    
    for ind, s in enumerate(doc.sents):
        s._.index_in_doc = ind
        sents.append(s)
    return sents

In [194]:
# SAVE (GET NEIGHBOUR SENT. INDEX) ONLY FOR SENTENCE
sents[0].doc[sents[0][0].i -1].sent


Consequently, no matter from the view of individual development or the relationship between competition and cooperation we can receive the same conclusion that a more cooperative attitudes towards life is more profitable in one's success.

In [222]:
doc

Computer has negative effects to children

Nowadays, thanks to the development of technology, computer is now indispensable to life. Some people think that computer is good for children and it should be used daily by children but some others think differently. In my opinion, the latter opinion is true.
First, using computer constantly has bad influence on children's eyes. When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted.
Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life. It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences. For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean gamer, who had a non-stop playing for 3 days.
Finally, even people who 

In [249]:
def get_is_head(span):
    
    if span[0].text == '\n':
        return 1
    
    # Previous span's last token index
    prev_span_ind = span[0].i - 1
    
    if span.doc[prev_span_ind].text == '\n':
        return 1
        
    return 0
    
    
    

In [250]:
Span.set_extension("is_head", getter = get_is_head, force=True)

In [275]:
doc = nlp(in_text)

units = seg_test(doc)

In [276]:
units_fixed = [s for s in units if not s.text.isspace()]

for s in units_fixed:
    print(s._.is_head,"----",s, "\n")

1 ---- Computer has negative effects to children

Nowadays, thanks to the development of technology, computer is now indispensable to life. 

0 ---- Some people think that computer is good for children and it should be used daily by children but some others think differently. 

0 ---- In my opinion, the latter opinion is true. 

1 ---- 
First, using computer constantly has bad influence on children's eyes. 

0 ---- When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted. 

1 ---- 
Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life. 

0 ---- It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences. 

0 ---- For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean

In [262]:
for s in units:
    print(s._.is_head,"----",s, "\n")

1 ---- Computer has negative effects to children

Nowadays, thanks to the development of technology, computer is now indispensable to life. 

0 ---- Some people think that computer is good for children and it should be used daily by children but some others think differently. 

0 ---- In my opinion, the latter opinion is true. 

1 ---- 
First, using computer constantly has bad influence on children's eyes. 

0 ---- When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted. 

1 ---- 
Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life. 

0 ---- It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences. 

0 ---- For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean

In [269]:
in_2 = add_full_stops(in_text)

doc2 = nlp(in_2)

units2 = seg_test(doc2)


for s in units:
    print(s._.is_head, "----", s)

1 ---- Computer has negative effects to children.
1 ---- 
Nowadays, thanks to the development of technology, computer is now indispensable to life.
0 ---- Some people think that computer is good for children and it should be used daily by children but some others think differently.
0 ---- In my opinion, the latter opinion is true..
First, using computer constantly has bad influence on children's eyes.
0 ---- When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted..
Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life.
0 ---- It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences.
0 ---- For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean gamer, who had a no

In [240]:
ind = units[12][0].i

units[12].doc[ind-1] 



In [221]:
# IS HEAD TEST

in_text = essays['text'].iloc[23]

doc = nlp(in_text)

units = seg_test(doc)

units

[Computer has negative effects to children
 
 Nowadays, thanks to the development of technology, computer is now indispensable to life.,
 Some people think that computer is good for children and it should be used daily by children but some others think differently.,
 In my opinion, the latter opinion is true.,
 
 First, using computer constantly has bad influence on children's eyes.,
 When they concentrate on computer for too long, their eyes will get tired, which is the main reason for some eyes problems, typically shortsighted.,
 
 Moreover, children who play games too much on computer can seriously lack communicating skills, they will know little about the outside life.,
 It is a well-known fact that people who are addicted to games, especially online games, can eventually bear dangerous consequences.,
 For instance, several teenagers play games without rest, which leads to health depression, a typical example is the death of Korean gamer, who had a non-stop playing for 3 days.,
 
 

In [104]:
units[2]

In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers.

In [91]:
u1



It is always said that competition can effectively promote the development of economy.

# Classification

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [74]:
logreg = LogisticRegression(solver='newton-cg')
logreg.fit(X_train, y_train)


LogisticRegression(solver='newton-cg')

In [78]:
preds_lr = logreg.predict(X_test)
print(classification_report(y_test, preds_lr))

              precision    recall  f1-score   support

           0       0.70      0.24      0.35       250
           1       0.85      0.98      0.91      1134

    accuracy                           0.84      1384
   macro avg       0.78      0.61      0.63      1384
weighted avg       0.83      0.84      0.81      1384



In [66]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [67]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [68]:
rf

RandomForestClassifier()

In [69]:
preds = rf.predict(X_test)

In [70]:
confusion_matrix(y_test, preds)

array([[  34,  216],
       [   9, 1125]], dtype=int64)

In [71]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.79      0.14      0.23       250
           1       0.84      0.99      0.91      1134

    accuracy                           0.84      1384
   macro avg       0.81      0.56      0.57      1384
weighted avg       0.83      0.84      0.79      1384



In [79]:
1125/(1125+216)

0.8389261744966443

In [80]:
1125/(1125+9)

0.9920634920634921

In [30]:
sum(preds)

5499

In [31]:
pd.Series(preds).value_counts()

1    5499
0    1327
dtype: int64

In [32]:
rf.fit(X_test, y_test)

RandomForestClassifier()

In [33]:
preds = rf.predict(X_train)

In [34]:
confusion_matrix(y_train, preds)

array([[1328,    2],
       [   1, 5495]], dtype=int64)

In [35]:
X_train.shape

(6826, 305)

In [36]:
X_test.shape

(6826, 305)

In [37]:
X_test

array([[-0.004337  ,  0.0057485 , -0.0807924 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.10655527,  0.28985757, -0.05341565, ...,  0.        ,
         0.        ,  0.        ],
       [-0.11575627,  0.22593297, -0.05616171, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [-0.045301  ,  0.228496  , -0.23810712, ...,  0.        ,
         1.        ,  0.        ],
       [-0.07727963,  0.26625875, -0.17565002, ...,  0.        ,
         0.        ,  0.        ],
       [-0.02388383,  0.24468371, -0.1548357 , ...,  0.        ,
         1.        ,  0.        ]])