In [3]:
import joblib
import pandas as pd
import json
import numpy as np
from create_text_error import create_typos_from_text, replace_sampled_prepositions, break_sva

In [4]:
# Load the tab-separated training essays and preview the first rows
data = pd.read_csv('ASAP7 Train Set.tsv', delimiter='\t')
data.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19441,Patience is tough to achieve sometimes. But th...,22
1,18068,Patience a helpful trait to get you through li...,24
2,19000,There were many of times when I was patient. T...,18
3,18339,A time I was patiet was when I was wating for ...,24
4,18297,Do you ever fell like your @CAPS1 is going to ...,21


In [5]:
# Work on the first 100 essays only (a head slice, not a random sample)
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19441,Patience is tough to achieve sometimes. But th...,22
1,18068,Patience a helpful trait to get you through li...,24
2,19000,There were many of times when I was patient. T...,18
3,18339,A time I was patiet was when I was wating for ...,24
4,18297,Do you ever fell like your @CAPS1 is going to ...,21


In [6]:
# Pick one essay (positional row 20 of the subset) to try the error generators on
essay = data_100['essay'].iloc[20]

def get_text_error(text):
    """Return the combined error records produced for `text`.

    The three generators are called in a fixed order -- typos, preposition
    replacements (called with 2 as second argument), then subject-verb
    errors -- and their results are concatenated in that same order.
    """
    typo_errors = create_typos_from_text(text)
    preposition_errors = replace_sampled_prepositions(text, 2)
    agreement_errors = break_sva(text)

    combined = typo_errors + preposition_errors
    combined = combined + agreement_errors
    return combined

In [7]:
# Preview the combined error records generated for the sample essay
get_text_error(essay)

[{'original': 'patient', 'error': 'aptient', 'sentence': 1},
 {'original': 'got', 'error': 'gto', 'sentence': 2},
 {'original': 'thought', 'error': 'thoguht', 'sentence': 3},
 {'original': 'wait', 'error': 'awit', 'sentence': 4},
 {'original': 'patient', 'error': 'patinet', 'sentence': 5},
 {'original': 'with', 'error': 'about', 'sentence': 5},
 {'original': 'In', 'error': 'except', 'sentence': 3},
 {'original': 'know', 'error': 'knew', 'sentence': 14},
 {'original': 'remember', 'error': 'remembers', 'sentence': 1},
 {'original': 'ask', 'error': 'asked', 'sentence': 7},
 {'original': 'work', 'error': 'wrought', 'sentence': 13},
 {'original': 'have', 'error': 'had', 'sentence': 9}]

In [8]:
# One entry per essay, in row order: the list returned by get_text_error,
# or None when processing that essay raised an exception
error_lists = []
ctr = 0

# `ctr` is the 1-based count of essays handled so far
for ctr, (index, row) in enumerate(data_100.iterrows(), start=1):
    try:
        essay = row['essay']
        error_list = get_text_error(essay)
    except Exception as e:
        # Best effort: report the failure and keep the positions aligned with the rows
        print(f"Error processing row {index}: {e}")
        error_lists.append(None)
    else:
        error_lists.append(error_list)
    print(f"Finished processing essay {ctr}")

Finished processing essay 1
Finished processing essay 2
Finished processing essay 3
Finished processing essay 4
Finished processing essay 5
Finished processing essay 6
Finished processing essay 7
Finished processing essay 8
Finished processing essay 9
Finished processing essay 10
Finished processing essay 11
Finished processing essay 12
Finished processing essay 13
Finished processing essay 14
Finished processing essay 15
Finished processing essay 16
Finished processing essay 17
Finished processing essay 18
Finished processing essay 19
Finished processing essay 20
Finished processing essay 21
Finished processing essay 22
Finished processing essay 23
Finished processing essay 24
Finished processing essay 25
Finished processing essay 26
Finished processing essay 27
Finished processing essay 28
Finished processing essay 29
Finished processing essay 30
Finished processing essay 31
Finished processing essay 32
Finished processing essay 33
Finished processing essay 34
Finished processing ess

In [9]:
# Sanity check: the loop above appends exactly one entry (a list or None) per essay
len(error_lists)

100

In [10]:
# Build a 1-D object array holding one entry (a list of error dicts, or None) per essay.
# np.array(error_lists, dtype=object) would silently produce a 2-D array whenever every
# essay happens to yield the same number of errors, so the slots are filled one by one.
error_array = np.empty(len(error_lists), dtype=object)
for slot, essay_errors in enumerate(error_lists):
    error_array[slot] = essay_errors
print(error_array.shape)

(100,)


In [11]:
# Persist the per-essay error records to disk with joblib
joblib.dump(error_array, 'files/error_lists_train_100')

['files/error_lists_train_100']

#### Get scores from the 100 sampled essays (train set)

In [12]:
# Collect the scores of the 100 sampled training-set essays and persist them with joblib
scores = data_100['domain1_score']
joblib.dump(scores, "files/scores_essay_100_train_ori")

['files/scores_essay_100_train_ori']

#### Try grammar error: subject-verb disagreement

In [18]:
import re
import random
import nltk
import json
from nltk.corpus import words, stopwords
from nltk.tokenize import sent_tokenize
import spacy
import pyinflect   

In [19]:
def print_sentences_with_numbers(text):
    """Print every sentence spaCy finds in `text`, one per line, numbered from 1.

    Uses the module-level `nlp` pipeline to split the text into sentences.
    """
    number = 0
    for sentence in nlp(text).sents:
        number += 1
        print(f"Sentence {number}: {sentence.text}")

def load_spacy_model(model_name="en_core_web_sm"):
    """Load and return the spaCy pipeline called `model_name`."""
    pipeline = spacy.load(model_name)
    return pipeline

In [20]:
# Load the spaCy model once.
# print_sentences_with_numbers and inflect_form read this module-level `nlp` when
# they are called, and break_sva captures it as a default argument when its `def`
# runs, so this cell must be executed before the cell that defines break_sva.
nlp = load_spacy_model()

In [27]:
# Minimal irregular fallback map for very common verbs.
# Keys are (lemma, Penn Treebank tag) pairs; values are the inflected form.
# VBP is listed only for "be": for the other verbs it equals the lemma.
IRREGULAR = {
    ("go", "VBZ"): "goes",
    ("go", "VBD"): "went",
    ("go", "VBN"): "gone",
    ("be", "VBZ"): "is",
    ("be", "VBP"): "are",
    ("be", "VBD"): "was",
    ("be", "VBN"): "been",
    ("have", "VBZ"): "has",
    ("have", "VBD"): "had",
    ("have", "VBN"): "had",
    ("do", "VBZ"): "does",
    ("do", "VBD"): "did",
    ("do", "VBN"): "done",
}

def _regular_verb_form(lemma, target_tag):
    """Apply spelling-aware suffix rules to a regular verb; None if the tag is unsupported."""
    if target_tag == "VBP":
        return lemma                    # base already
    # consonant + "y" becomes "ie" before the suffix (try -> tries / tried)
    consonant_y = len(lemma) > 1 and lemma.endswith("y") and lemma[-2] not in "aeiou"
    if target_tag == "VBZ":
        if consonant_y:
            return lemma[:-1] + "ies"
        if lemma.endswith(("s", "sh", "ch", "x", "z", "o")):
            return lemma + "es"         # watch -> watches, pass -> passes
        return lemma + "s"
    if target_tag == "VBD":
        if lemma.endswith("e"):
            return lemma + "d"          # like -> liked
        if consonant_y:
            return lemma[:-1] + "ied"
        return lemma + "ed"             # no consonant doubling (stop -> stoped)
    return None                         # give up


def inflect_form(lemma, target_tag):
    """
    Return `lemma` inflected to the Penn Treebank tag `target_tag`.

    Resolution order:
      1. pyinflect, through the `_.inflect` extension on a spaCy token;
      2. the IRREGULAR lookup table;
      3. suffix rules for regular verbs (VBZ, VBD and VBP only).

    Args:
        lemma: base form of the verb, e.g. "go".
        target_tag: tag to produce, e.g. "VBZ", "VBD" or "VBP".

    Returns:
        The inflected form as a string, or None when `lemma` is empty or no
        step above can produce a form for `target_tag`.
    """
    if not lemma:
        # Nothing to inflect; indexing the first token of an empty Doc would fail
        return None

    # pyinflect wants a spaCy token; make a dummy one with the module-level pipeline
    doc = nlp(lemma)
    if len(doc) > 0:
        # NOTE(review): pyinflect's extension appears to return the token text itself
        # when it finds no inflection (its `on_empty_ret_word` default), in which case
        # the fallbacks below are never reached -- confirm against the installed version.
        inflected = doc[0]._.inflect(target_tag)
        if inflected:
            return inflected

    # fallback to irregular dictionary
    if (lemma, target_tag) in IRREGULAR:
        return IRREGULAR[(lemma, target_tag)]

    # last resort: rule-based suffixes (only sensible for regular verbs)
    return _regular_verb_form(lemma, target_tag)

    
def break_sva(text, n_err=5, nlp=nlp, seed=None):
    """
    Propose up to `n_err` verb substitutions that break subject-verb agreement.

    The input text is not modified: the function only returns records describing
    which verb could be replaced by which wrong form. Candidates are ROOT verbs
    that have an `nsubj`/`nsubjpass` child on their left. Because the target
    tags include VBD, some records are tense shifts rather than strict
    agreement errors.

    Args:
        text: raw string (parsed with `nlp`) or an already parsed spaCy Doc.
        n_err: maximum number of error records to return.
        nlp: spaCy pipeline used to parse `text` when it is a string. It is not
            forwarded to `inflect_form`, which uses the module-level pipeline.
        seed: optional seed for a reproducible candidate order and tag choice;
            with None (default) the global `random` state is used.

    Returns:
        A list of dicts with keys "original" (the verb as written), "error"
        (the wrong form) and "sentence" (1-based sentence number). A warning is
        printed when fewer than `n_err` records could be produced.
    """
    # Tags each verb form may be swapped to; verbs with any other tag are skipped
    swap_targets = {
        "VBZ": ("VBP", "VBD"),
        "VBP": ("VBZ", "VBD"),
        "VBD": ("VBZ", "VBP"),
        "VBG": ("VBZ", "VBP"),
        "VBN": ("VBZ", "VBP"),
    }

    doc = nlp(text) if isinstance(text, str) else text

    # A private generator keeps seeded runs from touching the global random state
    rng = random.Random(seed) if seed is not None else random

    # Collect all candidate (verb, sentence) pairs
    candidates = []
    for sent_no, sent in enumerate(doc.sents, start=1):
        for tok in sent:
            if tok.dep_ != "ROOT" or tok.pos_ != "VERB":
                continue
            if any(w.dep_ in ("nsubj", "nsubjpass") for w in tok.lefts):
                candidates.append((tok, sent_no))

    rng.shuffle(candidates)
    errors = []

    for tok, sent_no in candidates:
        if len(errors) >= n_err:
            break

        options = swap_targets.get(tok.tag_)
        if not options:
            continue

        lemma = tok.lemma_.lower()
        original_lower = tok.text.lower()

        # Try the target tags in random order so a candidate is only dropped when
        # every alternative fails or reproduces the original spelling.
        for target_tag in rng.sample(options, len(options)):
            wrong_form = inflect_form(lemma, target_tag)
            if wrong_form and wrong_form.lower() != original_lower:
                errors.append({
                    "original": tok.text,
                    "error": wrong_form,
                    "sentence": sent_no
                })
                break

    # Fallback warning
    if len(errors) < n_err:
        print(f"[Warning] Only {len(errors)} errors could be generated (requested {n_err}).")

    return errors

In [28]:
# Take the essay at positional row 1 of the subset as test input and display it
text = data_100.iloc[1]['essay']
text

"People using computers is a good way for them to talk with distant relatives or friends, learn keyboarding skills, and research about far-away places. First, the wonder ful technology that computers have is amazing. They can surf the web in milli-seconds. This benefits people because if you were doing a research paper on china, many search engines let you surf the web at blazing fast speed, and in a little time, you could have all the information you needed. Scientists say that @PERCENT1 of computer-users have fast internet, and they reccommend it to other, first-time buyers. The computer that I use is a unique computer that was customizable to my liking. I like playing games and editing videos, so I got an outstanding graphics car that has outstanding speeds for rendering videos. This let me play games and edit videos, and my @NUM1 gb hard drive is capable of storing all my programs and projects, for future use. Doing many things on the computer can really help your hand-eye coordina

In [29]:
# Generate error records for the test essay with the default n_err=5
break_sva(text)

[{'original': 'have', 'error': 'had', 'sentence': 18},
 {'original': 'helps', 'error': 'help', 'sentence': 17},
 {'original': 'let', 'error': 'lets', 'sentence': 8},
 {'original': 'got', 'error': 'get', 'sentence': 7},
 {'original': 'say', 'error': 'says', 'sentence': 5}]

In [25]:
# NOTE(review): this cell's execution count (25) is lower than that of the cells above
# it, so the stored output may come from an earlier value of `text` -- re-run in order.
print_sentences_with_numbers(text)

Sentence 1: Computers and the @CAPS1 were a technological break through.
Sentence 2: It exposed to the average world, things that were never thought possitive.
Sentence 3: But as these things advanced over the years, they've become an addiction so bad of an addiction its begun to threaten peoples lives I've been given a choice to s'de with the addicting computers, or to offose them.
Sentence 4: The only clear choice is to offose.
Sentence 5: First off, computers have caused the world a decrease in exercise.
Sentence 6: Studies show @NUM1 out of @NUM2 people who use a computer, do not exercise with less exercise throughout the world, nations are becoming more over weight.
Sentence 7: This is a huge problem in the united states computers are main cause to why the @CAPS2.S is over weight and unhealthy by cutting down computers use, we can get our world back into great shape we can bring exercise and health back.
Sentence 8: Nextly, I'm sure you've all heard of online predators.
Sentence 9