In [234]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import os
import re
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit

In [235]:
nlp = spacy.load("en_core_web_md")
def get_num_puncts(txt):
    return sum([tok.is_punct for tok in txt])

def get_len(txt):
    return len(txt)

Span.set_extension("num_puncts", getter = get_num_puncts, force=True)
Span.set_extension("length", getter = get_len, force=True)

def get_num_puncts2(doc):
    return [s._.num_puncts for s in doc.sents]
def get_len2(doc):
    return [s._.length for s in doc.sents]


Doc.set_extension("len2", getter=get_len2,force=True)
Doc.set_extension("punct2", getter=get_num_puncts2,force=True)
Doc.set_extension("Features", default =[], force=True)


In [236]:
adu_df = pd.read_csv("../data/output_csv/adus.csv")

# Loop through all essays

In [237]:
df = pd.DataFrame()
for file in os.listdir("../data/input/brat-project-final/"):
    
    if file.endswith(".txt"):
        with open("../data/input/brat-project-final/" +file , encoding="utf-8") as f:

            essay_text = f.read()
            
            doc = nlp(essay_text)
            sents = [sent for sent in doc.sents]
            doc._.Features = list(zip(doc._.len2, doc._.punct2))
            
            temp_df = pd.DataFrame(doc._.Features)
            temp_df.columns = ['length', 'num_puncts']
            temp_df['sentence'] = [s.text for s in sents]

            temp_df['label'] = 0
            essay_adus = adu_df['sentence'][adu_df['essay_id'] == file.strip('.txt')]
            for adu in essay_adus:
                try:
                    temp_df.loc[[adu in s.text for s in sents].index(True), 'label'] = 1
                except:
                    print(file, "has unsolved issues")
                

                
            temp_df['essay_id'] = file.strip(".txt")
            df = pd.concat([df, temp_df], axis=0)

essay126.txt has unsolved issues
essay245.txt has unsolved issues
essay251.txt has unsolved issues
essay260.txt has unsolved issues
essay273.txt has unsolved issues
essay292.txt has unsolved issues
essay324.txt has unsolved issues
essay371.txt has unsolved issues


In [246]:
df

Unnamed: 0,length,num_puncts,sentence,label,essay_id
0,10,1,Should students be taught to compete or to coo...,0,essay001
1,15,1,\n\nIt is always said that competition can eff...,0,essay001
2,27,4,"In order to survive in the competition, compan...",0,essay001
3,33,4,"However, when we discuss the issue of competit...",0,essay001
4,21,2,"From this point of view, I firmly believe that...",1,essay001
...,...,...,...,...,...
13,21,2,"It will be good for children, because indirect...",1,essay402
14,17,2,That will make children getting lots of friend...,1,essay402
15,16,3,"Secondly, playing sport makes children getting...",1,essay402
16,1,0,\n,0,essay402


In [245]:
adu_df

Unnamed: 0,adu_label,ADU,essay_id,label,start_ind,end_ind,claim_type,sentence
0,T1,MajorClaim 503 575\twe should attach more impo...,essay001,train,503,575,MajorClaim,we should attach more importance to cooperatio...
1,T2,MajorClaim 2154 2231\ta more cooperative attit...,essay001,train,2154,2231,MajorClaim,a more cooperative attitudes towards life is m...
2,T3,"Claim 591 714\tthrough cooperation, children c...",essay001,train,591,714,Claim,"through cooperation, children can learn about ..."
3,T4,Premise 716 851\tWhat we acquired from team wo...,essay001,train,716,851,Premise,What we acquired from team work is not only ho...
4,T5,Premise 853 1086\tDuring the process of cooper...,essay001,train,853,1086,Premise,"During the process of cooperation, children ca..."
...,...,...,...,...,...,...,...,...
6084,T11,Premise 1275 1339\tindirectly they will learn ...,essay402,train,1275,1339,Premise,indirectly they will learn how to socialize ea...
6085,T12,Premise 1341 1388\tThat will make children get...,essay402,train,1341,1388,Premise,That will make children getting lots of friends
6086,T13,Premise 1393 1436\tthey can contribute positiv...,essay402,train,1393,1436,Premise,they can contribute positively to community
6087,T14,Premise 1448 1525\tplaying sport makes childre...,essay402,train,1448,1525,Premise,playing sport makes children getting healthy a...


In [238]:
df

Unnamed: 0,length,num_puncts,sentence,label,essay_id
0,10,1,Should students be taught to compete or to coo...,0,essay001
1,15,1,\n\nIt is always said that competition can eff...,0,essay001
2,27,4,"In order to survive in the competition, compan...",0,essay001
3,33,4,"However, when we discuss the issue of competit...",0,essay001
4,21,2,"From this point of view, I firmly believe that...",1,essay001
...,...,...,...,...,...
13,21,2,"It will be good for children, because indirect...",1,essay402
14,17,2,That will make children getting lots of friend...,1,essay402
15,16,3,"Secondly, playing sport makes children getting...",1,essay402
16,1,0,\n,0,essay402


In [239]:
X = df.drop(['sentence','label','essay_id'], axis=1)
y = df['label']


In [240]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify =y)

In [241]:
logreg = LogisticRegression()
rf = RandomForestClassifier()
logreg.fit(X_train, y_train)
rf.fit(X_train, y_train)



RandomForestClassifier()

In [242]:
logreg_pred = logreg.predict(X_test)
rf_pred = rf.predict(X_test)

In [243]:
confusion_matrix(logreg_pred, y_test)

array([[ 137,    0],
       [ 393, 1649]], dtype=int64)

In [244]:
confusion_matrix(rf_pred, y_test)

array([[ 161,   31],
       [ 369, 1618]], dtype=int64)

In [231]:
accuracy_score(rf_pred, y_test)

0.815052776502983

In [232]:
accuracy_score(logreg_pred, y_test)

0.8196420376319412

In [233]:
adu_df

Unnamed: 0,adu_label,ADU,essay_id,label,start_ind,end_ind,claim_type,sentence
0,T1,MajorClaim 503 575\twe should attach more impo...,essay001,train,503,575,MajorClaim,we should attach more importance to cooperatio...
1,T2,MajorClaim 2154 2231\ta more cooperative attit...,essay001,train,2154,2231,MajorClaim,a more cooperative attitudes towards life is m...
2,T3,"Claim 591 714\tthrough cooperation, children c...",essay001,train,591,714,Claim,"through cooperation, children can learn about ..."
3,T4,Premise 716 851\tWhat we acquired from team wo...,essay001,train,716,851,Premise,What we acquired from team work is not only ho...
4,T5,Premise 853 1086\tDuring the process of cooper...,essay001,train,853,1086,Premise,"During the process of cooperation, children ca..."
...,...,...,...,...,...,...,...,...
6084,T11,Premise 1275 1339\tindirectly they will learn ...,essay402,train,1275,1339,Premise,indirectly they will learn how to socialize ea...
6085,T12,Premise 1341 1388\tThat will make children get...,essay402,train,1341,1388,Premise,That will make children getting lots of friends
6086,T13,Premise 1393 1436\tthey can contribute positiv...,essay402,train,1393,1436,Premise,they can contribute positively to community
6087,T14,Premise 1448 1525\tplaying sport makes childre...,essay402,train,1448,1525,Premise,playing sport makes children getting healthy a...
