In [1]:
import joblib
import pandas as pd
import nltk
import re
import itertools
import numpy as np
import random

In [2]:
# Read the tab-separated test-set file and keep its first 100 rows as the
# working subset used by the rest of the notebook.
data = pd.read_csv('ASAP7 Test Set.tsv', delimiter='\t')
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19392,My at saying your patient is. Your patient whe...,12
1,18926,Once upon a time there was a young girl named ...,18
2,18318,Patience is hard. You always @CAPS1 it now not...,14
3,18904,"Patience, @CAPS6 essential part of our world. ...",14
4,19131,One time I was pacient was when my mum said th...,8


In [3]:
# Load the stored corrections: a list with one inner list of correction dicts
# per essay.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
corrections = joblib.load('files/json_responses_test_100')

# Preview the first essay's corrections only; echoing the whole nested list
# floods the cell output.
corrections[:1]

[[{'original': 'at saying',
   'correction': 'way of saying',
   'type': 'Word Choice',
   'sentence': 1},
  {'original': 'patience',
   'correction': 'patient',
   'type': 'Word Choice',
   'sentence': 3},
  {'original': 'queiter',
   'correction': 'quarter',
   'type': 'Spelling',
   'sentence': 3},
  {'original': 'had patience',
   'correction': 'have patience',
   'type': 'Grammatical',
   'sentence': 3},
  {'original': 'patient',
   'correction': 'patience',
   'type': 'Word Choice',
   'sentence': 4},
  {'original': 'the yell',
   'correction': 'to yell',
   'type': 'Grammatical',
   'sentence': 4},
  {'original': 'their',
   'correction': "they're",
   'type': 'Grammatical',
   'sentence': 5},
  {'original': "wern't",
   'correction': "weren't",
   'type': 'Spelling',
   'sentence': 5},
  {'original': 'patient',
   'correction': 'patience',
   'type': 'Word Choice',
   'sentence': 5},
  {'original': 'Am not',
   'correction': 'I am not',
   'type': 'Grammatical',
   'sentence': 

In [4]:
# Function to sample up to k (default 5) dictionaries from each inner list
def sample_five_from_each(inner_lists, k=5, rng=None):
    """Draw a random sample (without replacement) from every inner list.

    Args:
        inner_lists: Iterable of sequences, e.g. one list of correction
            dicts per essay.
        k: Maximum number of items to draw from each inner list. An inner
            list holding fewer than ``k`` items is returned in full, in
            random order. Defaults to 5.
        rng: Optional ``random.Random`` instance to draw from, which makes
            the result reproducible. When ``None`` the module-level
            ``random`` generator is used.

    Returns:
        A list containing one sampled list per inner list, in input order.
    """
    sampler = random if rng is None else rng
    return [
        sampler.sample(inner_list, min(len(inner_list), k))
        for inner_list in inner_lists
    ]

# Seed the generator so the sampled subset (and the file saved from it) is
# the same on every Restart & Run All.
SAMPLING_SEED = 42
random.seed(SAMPLING_SEED)

# Sample 5 dictionaries from each inner list
sampled_corrections = sample_five_from_each(corrections)

# Print the results
for idx, sampled_list in enumerate(sampled_corrections, start=1):
    print(f"Sampled data for list {idx}:")
    for item in sampled_list:
        print(item)
    print()


Sampled data for list 1:
{'original': 'the yell', 'correction': 'to yell', 'type': 'Grammatical', 'sentence': 4}
{'original': 'patient', 'correction': 'patience', 'type': 'Word Choice', 'sentence': 4}
{'original': 'Am not', 'correction': 'I am not', 'type': 'Grammatical', 'sentence': 6}
{'original': "wern't", 'correction': "weren't", 'type': 'Spelling', 'sentence': 5}
{'original': 'their', 'correction': "they're", 'type': 'Grammatical', 'sentence': 5}

Sampled data for list 2:
{'original': 'sience', 'correction': 'since', 'type': 'Spelling', 'sentence': 4}
{'original': 'there cows', 'correction': 'their cows', 'type': 'Grammatical', 'sentence': 5}
{'original': 'exited', 'correction': 'excited', 'type': 'Spelling', 'sentence': 3}
{'original': 'stuburn', 'correction': 'stubborn', 'type': 'Spelling', 'sentence': 6}
{'original': 'here luckey', 'correction': 'her lucky', 'type': 'Spelling', 'sentence': 4}

Sampled data for list 3:
{'original': 'theres', 'correction': "there's", 'type': 'Gra

In [5]:
# Show the sampled corrections for the first two essays only; echoing the
# whole nested list floods the cell output.
sampled_corrections[:2]

[[{'original': 'the yell',
   'correction': 'to yell',
   'type': 'Grammatical',
   'sentence': 4},
  {'original': 'patient',
   'correction': 'patience',
   'type': 'Word Choice',
   'sentence': 4},
  {'original': 'Am not',
   'correction': 'I am not',
   'type': 'Grammatical',
   'sentence': 6},
  {'original': "wern't",
   'correction': "weren't",
   'type': 'Spelling',
   'sentence': 5},
  {'original': 'their',
   'correction': "they're",
   'type': 'Grammatical',
   'sentence': 5}],
 [{'original': 'sience',
   'correction': 'since',
   'type': 'Spelling',
   'sentence': 4},
  {'original': 'there cows',
   'correction': 'their cows',
   'type': 'Grammatical',
   'sentence': 5},
  {'original': 'exited',
   'correction': 'excited',
   'type': 'Spelling',
   'sentence': 3},
  {'original': 'stuburn',
   'correction': 'stubborn',
   'type': 'Spelling',
   'sentence': 6},
  {'original': 'here luckey',
   'correction': 'her lucky',
   'type': 'Spelling',
   'sentence': 4}],
 [{'original': 

In [6]:
# Persist the sampled corrections so later cells can reload exactly this sample.
joblib.dump(
    value=sampled_corrections,
    filename='files/correction_test_100essay_5corr',
)

['files/correction_test_100essay_5corr']

In [7]:
def _whole_word_pattern(text):
    """Return a regex that matches ``text`` literally as a whole word.

    A ``\\b`` anchor is added only on a side where ``text`` starts/ends with a
    word character. Next to a non-word character (e.g. a leading ``@`` or a
    trailing ``.``) ``\\b`` would demand an adjacent word character and so
    prevent the match in ordinary surrounding text.
    """
    leading = r'\b' if re.match(r'\w', text[0]) else ''
    trailing = r'\b' if re.match(r'\w', text[-1]) else ''
    return leading + re.escape(text) + trailing


# Function to apply corrections to the essay
def apply_corrections_grammar(essay, corrections):
    """Apply sentence-targeted corrections to an essay.

    The essay is split into sentences with ``nltk.sent_tokenize``. A
    correction is applied only to the sentence whose 1-based position equals
    ``correction['sentence']``; inside that sentence every whole-word
    occurrence of ``correction['original']`` is replaced by
    ``correction['correction']``. Corrections are applied in the order given,
    so a later correction operates on the text left by earlier ones.

    Args:
        essay: The essay text.
        corrections: Iterable of dicts providing the keys ``'original'``,
            ``'correction'`` and ``'sentence'``; other keys are ignored.

    Returns:
        The (possibly corrected) sentences joined with a single space.
    """
    # The corrections are re-scanned for every sentence, so materialise them
    # once; a one-shot iterator would be exhausted after the first sentence.
    corrections = list(corrections)

    # Tokenize the essay into sentences using NLTK
    sentences = nltk.sent_tokenize(essay)

    corrected_sentences = []

    # 'sentence' values are 1-based, hence the enumeration starts at 1
    for sentence_number, sentence in enumerate(sentences, start=1):
        for correction in corrections:
            if correction['sentence'] != sentence_number:
                continue

            original = correction['original']
            if not original:
                # An empty pattern would match at every word boundary and
                # scatter the correction text through the whole sentence.
                continue

            replacement = correction['correction']
            # A callable replacement is inserted verbatim; a plain string
            # would have its backslashes interpreted as escapes/group refs.
            sentence = re.sub(
                _whole_word_pattern(original),
                lambda _match, _text=replacement: _text,
                sentence,
            )
        corrected_sentences.append(sentence)

    # Join the corrected sentences to form the final essay
    return ' '.join(corrected_sentences)

# Function to generate all combinations of corrections and apply them
def generate_corrected_versions(essay, corrections):
    """Build one corrected essay for every non-empty subset of corrections.

    Subsets are enumerated by increasing size (1 up to ``len(corrections)``)
    and, within a size, in ``itertools.combinations`` order. Each subset is
    handed to ``apply_corrections_grammar`` together with the untouched essay.

    Args:
        essay: The original essay text.
        corrections: Sized collection of correction dicts.

    Returns:
        A list of corrected essays, one per subset, in enumeration order.
        It is empty when ``corrections`` is empty.
    """
    subset_sizes = range(1, len(corrections) + 1)
    all_subsets = itertools.chain.from_iterable(
        itertools.combinations(corrections, size) for size in subset_sizes
    )
    return [apply_corrections_grammar(essay, subset) for subset in all_subsets]

# # Example usage
# essay = "This is a tess essay. The break through was amazing. The possitive effects were clear."
# corrections = [
#     {'original': 'tess', 'correction': 'test', 'type': 'Spelling', 'sentence': 1},
#     {'original': 'break through', 'correction': 'breakthrough', 'type': 'Spelling', 'sentence': 2},
#     {'original': 'possitive', 'correction': 'positive', 'type': 'Spelling', 'sentence': 3},
# ]

# # Generate corrected versions with combinations of 1, 2, and 3 corrections
# corrected_versions = generate_corrected_versions(essay, corrections)
# corrected_versions

In [8]:
# Reload the sampled corrections saved earlier (one inner list per essay).
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
correction_test_100essay_5corr = joblib.load('files/correction_test_100essay_5corr')

corrected_test_100data_5corr = []

# Pair each essay with its corrections by position, not by DataFrame index
# label: the corrections are a plain list, so a label-based lookup is only
# right while data_100 keeps a default 0..n-1 index.
for position, essay in enumerate(data_100['essay']):
    try:
        corrected_essays = generate_corrected_versions(
            essay, correction_test_100essay_5corr[position]
        )
        # Append the result to the list
        corrected_test_100data_5corr.append(corrected_essays)
    except Exception as e:
        # Best effort: report the failure and keep a None placeholder so the
        # results stay aligned with the rows of data_100.
        print(f"Error processing row {position}: {e}")
        corrected_test_100data_5corr.append(None)

In [9]:
# Sanity check of the result dimensions: with 5 corrections per essay there
# are 2**5 - 1 = 31 non-empty combinations, i.e. 31 versions per essay.
np.shape(corrected_test_100data_5corr)

(100, 31)

In [10]:
# Persist the generated essay versions for downstream use.
joblib.dump(
    value=corrected_test_100data_5corr,
    filename='files/corrected_test_100data_5corr',
)

['files/corrected_test_100data_5corr']