In [194]:
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import os
import re
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit

In [141]:
nlp = spacy.load("en_core_web_md")
def get_num_puncts(txt):
    return sum([tok.is_punct for tok in txt])

def get_len(txt):
    return len(txt)

Span.set_extension("num_puncts", getter = get_num_puncts, force=True)
Span.set_extension("length", getter = get_len, force=True)

def get_num_puncts2(doc):
    return [s._.num_puncts for s in doc.sents]
def get_len2(doc):
    return [s._.length for s in doc.sents]


Doc.set_extension("len2", getter=get_len2,force=True)
Doc.set_extension("punct2", getter=get_num_puncts2,force=True)
Doc.set_extension("Features", default =[], force=True)


In [23]:
adu_df = pd.read_csv("../data/output_csv/adus.csv")

# Loop through all essays

In [126]:
df = pd.DataFrame()
for file in os.listdir("../data/input/brat-project-final/"):
    
    if file.endswith(".txt"):
        with open("../data/input/brat-project-final/" +file , encoding="utf-8") as f:

            essay_text = f.read()
            
            doc = nlp(essay_text)
            sents = [sent for sent in doc.sents]
            doc._.Features = list(zip(doc._.len2, doc._.punct2))
            
            temp_df = pd.DataFrame(doc._.Features)
            temp_df.columns = ['length', 'num_puncts']
            temp_df['sentence'] = [s.text for s in sents]

            temp_df['label'] = 0
            essay_adus = adu_df['sentence'][adu_df['essay_id'] == file.strip('.txt')]
            for adu in essay_adus:
                try:
                    temp_df.loc[[adu in s.text for s in sents].index(True), 'label'] = 1
                except:
                    print(file, "has issues")
                    print(adu, file)

                
            temp_df['essay_id'] = file.strip(".txt")
            df = pd.concat([df, temp_df], axis=0)

essay126.txt has issues
We spend time together but we could never be friend essay126.txt
essay245.txt has issues
I get home at about midnight and I still need to answer some questions and to solve some routine problems essay245.txt
essay251.txt has issues
criticism is essential especially in group working but in other occasions we still need criticism essay251.txt
essay260.txt has issues
Take the most popular and prestigious technology company- Apple- for example essay260.txt
essay273.txt has issues
Every technology has it’s good and bad effects but I prefer having the technologies with their disadvantages to living like 16 century without any of modern facilities of technology essay273.txt
essay292.txt has issues
majority of the students failed in front of the computer games playing and they use their time for studying, sleeping and physical exerice to play these computer games essay292.txt
essay324.txt has issues
doing same task day and night just server to increase our psychological

In [124]:
df[df['essay_id'] == 'essay126']

Unnamed: 0,length,num_puncts,sentence,label,essay_id
0,35,1,One or two close friends vs. a large number of...,0,essay126
1,26,2,"However, I still agree that it's better to hav...",0,essay126
2,18,1,\nWe can not deny that being a friend with som...,0,essay126
3,12,2,"First, it's not easy to make friend with anyone.",0,essay126
4,13,1,I believe friends are people who have great co...,0,essay126
...,...,...,...,...,...
17,12,1,And I believe my friend will do the same to me.,1,essay126
18,12,2,"In conclusion, a friend is a person you care a...",1,essay126
19,18,2,"\nTo me, friends are the most important people...",1,essay126
20,27,2,I am the kind of person who has only one or tw...,0,essay126


In [195]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)

In [212]:
temp = df.drop(['sentence','essay_id'], axis=1).copy().reset_index(drop=True)

In [213]:
temp

Unnamed: 0,length,num_puncts,label
0,10,1,0
1,15,1,0
2,27,4,0
3,33,4,0
4,21,2,1
...,...,...,...
7256,21,2,1
7257,17,2,1
7258,16,3,1
7259,1,0,0


In [216]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(temp, temp['label']):
    strat_train_test = temp.loc[train_index]
    strat_test_set = temp.loc[test_index]

In [218]:
strat_test_set['label'].value_countss()

1    1099
0     354
Name: label, dtype: int64

In [219]:
strat_train_test['label'].value_counts()

1    4395
0    1413
Name: label, dtype: int64

In [200]:
for train, test in split.split(X, y):
    print(train, test)

[2436  295 4196 ... 4580 2845 1012] [3002  535 3536 ...   95 6876 1939]


In [130]:
X = df.drop(['sentence','label','essay_id'], axis=1)
y = df['label']


In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify =y)

In [192]:
y_train.value_counts()

1    3845
0    1237
Name: label, dtype: int64

In [193]:
y_test.value_counts()

1    1649
0     530
Name: label, dtype: int64

In [132]:
logreg = LogisticRegression()
rf = RandomForestClassifier()
logreg.fit(X_train, y_train)
rf.fit(X_train, y_train)



RandomForestClassifier()

In [133]:
logreg_pred = logreg.predict(X_test)
rf_pred = rf.predict(X_test)

In [134]:
confusion_matrix(logreg_pred, y_test)

array([[ 143,    0],
       [ 365, 1671]], dtype=int64)

In [135]:
confusion_matrix(rf_pred, y_test)

array([[ 169,   34],
       [ 339, 1637]], dtype=int64)

In [136]:
accuracy_score(rf_pred, y_test)

0.8288205598898577

In [137]:
accuracy_score(logreg_pred, y_test)

0.8324919687930243

In [175]:
test_doc = nlp(test_text)

test_doc._.Features = list(zip(test_doc._.len2, test_doc._.punct2))

In [176]:
logreg.predict(test_doc._.Features)

array([1], dtype=int64)

In [177]:
rf.predict(test_doc._.Features)

array([1], dtype=int64)

In [182]:
df.label.value_counts()

1    5494
0    1767
Name: label, dtype: int64

In [None]:
t1 = "I believe drinking water" # 0 1
t2 = "believe drinking water is " # 0 1
t3 = "drinking water is good for your" # 0 1
t4 = "water is good for your health." # 0 1

want = "drinking water is good for your health." # 1

In [None]:
test = "I believe drinking water is good for your health."  # 0 or 1
