In [1]:
import joblib
import pandas as pd
import nltk
import re
import itertools
import random
import numpy as np

In [2]:
data = pd.read_csv('ASAP7 Train Set.tsv', sep='\t')
data_100 = data[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19441,Patience is tough to achieve sometimes. But th...,22
1,18068,Patience a helpful trait to get you through li...,24
2,19000,There were many of times when I was patient. T...,18
3,18339,A time I was patiet was when I was wating for ...,24
4,18297,Do you ever fell like your @CAPS1 is going to ...,21


In [4]:
errors = joblib.load('files/error_lists_train_100')

In [4]:
len(errors)

100

In [5]:
# Function to sample 5 dictionaries from each inner list
def sample_five_from_each(inner_lists):
    return [random.sample(inner_list, min(len(inner_list), 5)) for inner_list in inner_lists]

# Sample 5 dictionaries from each inner list
sampled_errors = sample_five_from_each(errors)

print(np.array(sampled_errors).shape)

# Print the results
for idx, sampled_list in enumerate(sampled_errors):
    print(f"Sampled data for list {idx + 1}:")
    for item in sampled_list:
        print(item)
    print()


(100, 5)
Sampled data for list 1:
{'original': 'rushed', 'error': 'rush', 'sentence': 16}
{'original': 'up', 'error': 'inside', 'sentence': 14}
{'original': 'won', 'error': 'win', 'sentence': 43}
{'original': 'stood', 'error': 'stand', 'sentence': 40}
{'original': 'impatient', 'error': 'impateint', 'sentence': 2}

Sampled data for list 2:
{'original': 'of', 'error': 'among', 'sentence': 4}
{'original': 'learned', 'error': 'learn', 'sentence': 32}
{'original': 'patient', 'error': 'patinet', 'sentence': 5}
{'original': 'helpful', 'error': 'ehlpful', 'sentence': 1}
{'original': 'person', 'error': 'persno', 'sentence': 3}

Sampled data for list 3:
{'original': 'getting', 'error': 'gets', 'sentence': 3}
{'original': 'had', 'error': 'have', 'sentence': 8}
{'original': 'told', 'error': 'tells', 'sentence': 11}
{'original': 'patient', 'error': 'patinet', 'sentence': 4}
{'original': 'game', 'error': 'agme', 'sentence': 3}

Sampled data for list 4:
{'original': 'school', 'error': 'school', 'sent

In [6]:
np.array(sampled_errors).shape

(100, 5)

In [7]:
joblib.dump(sampled_errors, 'files/error_train_100essay_5err')

['files/error_train_100essay_5err']

In [8]:
# Function to apply corrections to the essay
def apply_errors(essay, errors):
    # Tokenize the essay into sentences using NLTK
    sentences = nltk.sent_tokenize(essay)

    # Create a list to store corrected sentences
    errored_sentences = []

    # Iterate through sentences
    for i, sentence in enumerate(sentences):
        # Apply corrections to the current sentence
        for error in errors:
            # Adjust the index by subtracting 1 to match the enumeration starting from 0
            if error['sentence'] == i + 1:
                # Replace all occurrences of the incorrect text with the correction
                #sentence = sentence.replace(correction['original'], correction['correction'])
                 # Use regular expression with word boundaries to replace only whole words
                sentence = re.sub(r'\b' + re.escape(error['original']) + r'\b', error['error'], sentence)
        # Add the corrected sentence to the list
        errored_sentences.append(sentence)

    # Join the corrected sentences to form the final essay
    errored_essay = ' '.join(errored_sentences)
    return errored_essay

# Function to generate all combinations of corrections and apply them
def generate_errored_versions(essay, errors):
    errored_versions = []
    max_errors = len(errors)

    # Iterate over different numbers of corrections to apply
    for num_errors in range(1, max_errors + 1):
        for combination in itertools.combinations(errors, num_errors):
            errored_essay = apply_errors(essay, combination)
            errored_versions.append(errored_essay)
  
    return errored_versions

# Example usage
essay = "This is a tess essay. The break through was amazing. The possitive effects were clear."
errors = [
    {'original': 'tess', 'error': 'test', 'type': 'Spelling', 'sentence': 1},
    {'original': 'break through', 'error': 'breakthrough', 'type': 'Spelling', 'sentence': 2},
    {'original': 'possitive', 'error': 'positive', 'type': 'Spelling', 'sentence': 3},
]

# Generate corrected versions with combinations of 1, 2, and 3 corrections
errored_versions = generate_errored_versions(essay, errors)
errored_versions

['This is a test essay. The break through was amazing. The possitive effects were clear.',
 'This is a tess essay. The breakthrough was amazing. The possitive effects were clear.',
 'This is a tess essay. The break through was amazing. The positive effects were clear.',
 'This is a test essay. The breakthrough was amazing. The possitive effects were clear.',
 'This is a test essay. The break through was amazing. The positive effects were clear.',
 'This is a tess essay. The breakthrough was amazing. The positive effects were clear.',
 'This is a test essay. The breakthrough was amazing. The positive effects were clear.']

In [9]:
error_train_100essay_5err = joblib.load('files/error_train_100essay_5err')

errored_train_100data_5err = []

for index, row in data_100.iterrows():
    try:
        errored_essays = generate_errored_versions(row['essay'], error_train_100essay_5err[index]) 
        # Append the result to the list
        errored_train_100data_5err.append(errored_essays)
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        errored_train_100data_5err.append(None)  # Append None or an empty dict if an error occurs
    print(index + 1, " finished")

1  finished
2  finished
3  finished
4  finished
5  finished
6  finished
7  finished
8  finished
9  finished
10  finished
11  finished
12  finished
13  finished
14  finished
15  finished
16  finished
17  finished
18  finished
19  finished
20  finished
21  finished
22  finished
23  finished
24  finished
25  finished
26  finished
27  finished
28  finished
29  finished
30  finished
31  finished
32  finished
33  finished
34  finished
35  finished
36  finished
37  finished
38  finished
39  finished
40  finished
41  finished
42  finished
43  finished
44  finished
45  finished
46  finished
47  finished
48  finished
49  finished
50  finished
51  finished
52  finished
53  finished
54  finished
55  finished
56  finished
57  finished
58  finished
59  finished
60  finished
61  finished
62  finished
63  finished
64  finished
65  finished
66  finished
67  finished
68  finished
69  finished
70  finished
71  finished
72  finished
73  finished
74  finished
75  finished
76  finished
77  finished
78  fini

In [10]:
np.array(errored_train_100data_5err).shape

(100, 31)

In [11]:
joblib.dump(errored_train_100data_5err, 'files/errored_train_100data_5err')

['files/errored_train_100data_5err']