In [271]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

In [6]:
nlp = spacy.load("en_core_web_md")

# Test with only essay001

In [282]:
essay_input_path = "../data/input/brat-project-final/essay126.txt"
with open(essay_input_path) as f:
    essay_text = f.read()
    
ann_input_path  = "../data/input/brat-project-final/essay126.ann"
with open(ann_input_path) as f:
    ann_text = f.read()
    


In [285]:
essay_text

"One or two close friends vs. a large number of casual acquaintances\n\nThere is no need for me to put any emphasis on the advantage of having a large number of casual acquaintance. However, I still agree that it's better to have one or two close friends than to have a large number of casual acquaintances.\nWe can not deny that being a friend with someone is different from only being acquaintant. First, it's not easy to make friend with anyone. I believe friends are people who have great compatibility with each other. After two years in college I have found sometimes there is more differences between people than similarities, for we are growing in the different families and was educated in various ways. Sometimes acquaintances are just acquaintances. We spend time together but we could never be friend. So, it's much easier to have some casual acquaintances than have some close friends.\nSecond, close friends can easily understand each others feelings. Because of the similarity I talked

In [284]:
print(ann_text)

T1	Claim 814 893	it's much easier to have some casual acquaintances than have some close friends
A1	Stance T1 For
T2	MajorClaim 207 303	it's better to have one or two close friends than to have a large number of casual acquaintances
T3	Premise 326 394	being a friend with someone is different from only being acquaintant
T4	Premise 403 443	it's not easy to make friend with anyone
T5	Premise 455 518	friends are people who have great compatibility with each other
T6	Premise 520 707	After two years in college I have found sometimes there is more differences between people than similarities, for we are growing in the different families and was educated in various ways
T7	Premise 709 755	Sometimes acquaintances are just acquaintances
T8	Premise 757 808	We spend time together but we could never be friend
T9	Claim 903 959	close friends can easily understand each others feelings
A2	Stance T9 For
T10	Premise 1004 1078	acquaintances probably not able to understand you better than your friends
T11	

In [280]:
doc = nlp(essay_text)

In [281]:
doc

One or two close friends vs. a large number of casual acquaintances

There is no need for me to put any emphasis on the advantage of having a large number of casual acquaintance. However, I still agree that it's better to have one or two close friends than to have a large number of casual acquaintances.
We can not deny that being a friend with someone is different from only being acquaintant. First, it's not easy to make friend with anyone. I believe friends are people who have great compatibility with each other. After two years in college I have found sometimes there is more differences between people than similarities, for we are growing in the different families and was educated in various ways. Sometimes acquaintances are just acquaintances. We spend time together but we could never be friend. So, it's much easier to have some casual acquaintances than have some close friends.
Second, close friends can easily understand each others feelings. Because of the similarity I talked befo

# Test with a few features

In [122]:
def get_num_puncts(txt):
    return sum([tok.is_punct for tok in txt])

def get_len(txt):
    return len(txt)

Span.set_extension("num_puncts", getter = get_num_puncts, force=True)
Span.set_extension("length", getter = get_len, force=True)

In [None]:
def get_num_puncts2(doc):
    return [s._.num_puncts for s in doc.sents]
def get_len2(doc):
    return [s._.length for s in doc.sents]


Doc.set_extension("len2", getter=get_len2,force=True)
Doc.set_extension("punct2", getter=get_num_puncts2,force=True)

In [161]:
sents[0]._.length

10

In [162]:
doc._.len2

[10, 15, 27, 33, 21, 25, 28, 42, 22, 22, 13, 24, 12, 62, 14, 13, 39]

In [127]:
sents = [sent for sent in doc.sents]

In [None]:
first_list = list(
    zip(
        doc._.cl_indicators,
        doc._.pr_indicators,
        doc._.punctcount,
        doc._.questions,
        doc._.personals,
        doc._.modals,
        doc._.tree_depths,
        doc._.prodcount,
        doc._.sentence_positions,
        doc._.sentlengths,
    )
)

In [164]:
first_list = list(zip(doc._.len2, doc._.punct2))
first_list

[(10, 1),
 (15, 1),
 (27, 4),
 (33, 4),
 (21, 2),
 (25, 3),
 (28, 2),
 (42, 5),
 (22, 1),
 (22, 2),
 (13, 1),
 (24, 3),
 (12, 2),
 (62, 4),
 (14, 1),
 (13, 2),
 (39, 2)]

In [190]:
Doc.set_extension("Features", default =[])

In [193]:
doc._.Features = first_list

In [194]:
doc._.Features

[(10, 1),
 (15, 1),
 (27, 4),
 (33, 4),
 (21, 2),
 (25, 3),
 (28, 2),
 (42, 5),
 (22, 1),
 (22, 2),
 (13, 1),
 (24, 3),
 (12, 2),
 (62, 4),
 (14, 1),
 (13, 2),
 (39, 2)]

In [195]:
essay_text

'International tourism is now more common than ever before\n\nThe last 50 years have seen a significant increase in the number of tourist traveling worldwide. While some might think the tourism bring large profit for the destination countries, I would contend that this industry has affected the cultural attributes and damaged the natural environment of the tourist destinations.\nFirstly, it is an undeniable fact that tourists from different cultures will probably cause changes to the cultural identity of the tourist destinations. Take Thailand for example, in the Vietnam War, many American soldiers came to Thailand for a break and involved in sexual and drug activities, these huge demands caused many local businesses opened and expanded, even illegally involved in under-age prostitutes to maximize their profits. This was due to the lack of adequate controls by authorities and lead to a bad image of Thailand tourism. Therefore this proves that international tourism can create negative i

In [None]:
adu_df = pd.read_csv("../data/output_csv/adus.csv")

In [263]:
df = pd.DataFrame(first_list)
df.columns = ['length', 'num_puncts']
df['sentence'] = [s.text for s in sents]

df['label'] = 0
essay1_adus = adu_df['sentence'][adu_df['essay_id'] == 'essay001']
for adu in essay1_adus:
    df.loc[[adu in s.text for s in sents].index(True), 'label'] = 1
df

Unnamed: 0,length,num_puncts,sentence,label
0,10,1,Should students be taught to compete or to coo...,0
1,15,1,\n\nIt is always said that competition can eff...,0
2,27,4,"In order to survive in the competition, compan...",0
3,33,4,"However, when we discuss the issue of competit...",0
4,21,2,"From this point of view, I firmly believe that...",1
5,25,3,"\nFirst of all, through cooperation, children ...",1
6,28,2,What we acquired from team work is not only ho...,1
7,42,5,"During the process of cooperation, children ca...",1
8,22,1,All of these skills help them to get on well w...,1
9,22,2,"\nOn the other hand, the significance of compe...",1


In [264]:
X = df.drop(['sentence', 'label'], axis=1)
y = df['label']

# Training

In [257]:
logreg = LogisticRegression()
rf = RandomForestClassifier()

In [268]:
logreg.fit(X,y)
rf.fit(X,y)

RandomForestClassifier()

In [273]:
logreg_pred = logreg.predict(X)
logreg_pred

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1], dtype=int64)

In [274]:
rf_pred = rf.predict(X)
rf_pred

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1], dtype=int64)

In [275]:
confusion_matrix(rf_pred, y)

array([[ 6,  0],
       [ 0, 11]], dtype=int64)

In [278]:
confusion_matrix(logreg_pred, y)

array([[2, 2],
       [4, 9]], dtype=int64)

In [155]:
doc._.len2

[10, 15, 27, 33, 21, 25, 28, 42, 22, 22, 13, 24, 12, 62, 14, 13, 39]

In [156]:
doc._.punct2

[1, 1, 4, 4, 2, 3, 2, 5, 1, 2, 1, 3, 2, 4, 1, 2, 2]

In [None]:
first_list = zip()

In [145]:
l1 = [1,2,3,4,5]
l2 = ['a','b','c','d','e']

list(zip(l1,l2))

[(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')]