In [1]:
import joblib
import pandas as pd
import nltk
import re
import itertools
import numpy as np
import random

In [2]:
# Read the tab-separated file and keep only the first 100 rows (by position)
# as the working subset for the rest of the notebook.
data = pd.read_csv('ASAP1 Test Set.tsv', sep='\t')
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,827,I think computers have a postitive affect on p...,6
1,1477,I blive that computers have a lot of effects o...,6
2,234,Many people think that computers are not a goo...,8
3,801,"Dear Newspaper people, @CAPS1 you might heard ...",9
4,780,More and more people are using computers on a ...,9


In [3]:
# Load the precomputed corrections. Per the output below this is a list of
# inner lists (presumably one per essay — confirm against the producer), each
# holding dicts with 'original', 'correction', 'type' and 'sentence' keys.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
corrections = joblib.load('files/json_responses_test_100')
corrections

[[{'original': 'postitive',
   'correction': 'positive',
   'type': 'Spelling',
   'sentence': 1},
  {'original': 'affect',
   'correction': 'effect',
   'type': 'Word Choice',
   'sentence': 1},
  {'original': 'Your',
   'correction': 'your',
   'type': 'Grammatical',
   'sentence': 3},
  {'original': 'Computer',
   'correction': 'Computers',
   'type': 'Grammatical',
   'sentence': 4},
  {'original': 'exciseses',
   'correction': 'exercises',
   'type': 'Spelling',
   'sentence': 5},
  {'original': 'it take',
   'correction': 'it takes',
   'type': 'Grammatical',
   'sentence': 6},
  {'original': 'postitive',
   'correction': 'positive',
   'type': 'Spelling',
   'sentence': 9},
  {'original': 'affect',
   'correction': 'effect',
   'type': 'Word Choice',
   'sentence': 9}],
 [{'original': 'blive',
   'correction': 'believe',
   'type': 'Spelling',
   'sentence': 1},
  {'original': 'there',
   'correction': 'their',
   'type': 'Word Choice',
   'sentence': 1},
  {'original': 'there',

In [5]:
# Function to sample 5 dictionaries from each inner list
def sample_five_from_each(inner_lists, k=5, seed=None):
    """Randomly pick up to ``k`` items (without replacement) from every inner list.

    Args:
        inner_lists: Iterable of lists to sample from.
        k: Maximum number of items drawn per inner list. Lists shorter than
            ``k`` are returned in full (in shuffled order). Defaults to 5.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the draw is reproducible and the global generator is left
            untouched; when ``None`` the module-level ``random`` generator is
            used.

    Returns:
        A list containing one sampled list per inner list, in input order.
    """
    rng = random if seed is None else random.Random(seed)
    return [rng.sample(inner_list, min(len(inner_list), k)) for inner_list in inner_lists]

# Seed the global generator so that Restart & Run All draws the same sample
# (and therefore writes the same file when the sample is dumped below).
random.seed(42)

# Sample up to 5 dictionaries from each inner list
sampled_corrections = sample_five_from_each(corrections)

# Print the results, numbering the lists from 1
for list_number, sampled_list in enumerate(sampled_corrections, start=1):
    print(f"Sampled data for list {list_number}:")
    for item in sampled_list:
        print(item)
    print()


Sampled data for list 1:
{'original': 'it take', 'correction': 'it takes', 'type': 'Grammatical', 'sentence': 6}
{'original': 'affect', 'correction': 'effect', 'type': 'Word Choice', 'sentence': 1}
{'original': 'affect', 'correction': 'effect', 'type': 'Word Choice', 'sentence': 9}
{'original': 'postitive', 'correction': 'positive', 'type': 'Spelling', 'sentence': 9}
{'original': 'Computer', 'correction': 'Computers', 'type': 'Grammatical', 'sentence': 4}

Sampled data for list 2:
{'original': 'every on', 'correction': 'everyone', 'type': 'Spelling', 'sentence': 5}
{'original': 'thinkin', 'correction': 'thinking', 'type': 'Spelling', 'sentence': 4}
{'original': 'you @MONTH1 lose', 'correction': 'you might lose', 'type': 'Grammatical', 'sentence': 2}
{'original': 'act side', 'correction': 'activities outside', 'type': 'Word Choice', 'sentence': 5}
{'original': 'there', 'correction': 'their', 'type': 'Word Choice', 'sentence': 1}

Sampled data for list 3:
{'original': 'You', 'correction'

In [6]:
# Rich display of the sampled structure: a list of lists of correction dicts.
sampled_corrections

[[{'original': 'it take',
   'correction': 'it takes',
   'type': 'Grammatical',
   'sentence': 6},
  {'original': 'affect',
   'correction': 'effect',
   'type': 'Word Choice',
   'sentence': 1},
  {'original': 'affect',
   'correction': 'effect',
   'type': 'Word Choice',
   'sentence': 9},
  {'original': 'postitive',
   'correction': 'positive',
   'type': 'Spelling',
   'sentence': 9},
  {'original': 'Computer',
   'correction': 'Computers',
   'type': 'Grammatical',
   'sentence': 4}],
 [{'original': 'every on',
   'correction': 'everyone',
   'type': 'Spelling',
   'sentence': 5},
  {'original': 'thinkin',
   'correction': 'thinking',
   'type': 'Spelling',
   'sentence': 4},
  {'original': 'you @MONTH1 lose',
   'correction': 'you might lose',
   'type': 'Grammatical',
   'sentence': 2},
  {'original': 'act side',
   'correction': 'activities outside',
   'type': 'Word Choice',
   'sentence': 5},
  {'original': 'there',
   'correction': 'their',
   'type': 'Word Choice',
   'sen

In [7]:
# Persist the sampled corrections; a later cell reloads this same path before
# generating the corrected essays.
joblib.dump(sampled_corrections, 'files/correction_test_100essay_5corr')

['files/correction_test_100essay_5corr']

In [8]:
# Function to apply corrections to the essay
def apply_corrections_grammar(essay, corrections):
    """Apply sentence-scoped corrections to an essay and return the new text.

    The essay is split into sentences with ``nltk.sent_tokenize``. A correction
    is applied only to the sentence whose 1-based position equals
    ``correction['sentence']``; inside that sentence every stand-alone
    occurrence of ``correction['original']`` is replaced by
    ``correction['correction']``.

    Args:
        essay: Essay text.
        corrections: Iterable of dicts providing the keys ``'original'``,
            ``'correction'`` and ``'sentence'`` (1-based sentence number).

    Returns:
        The (possibly corrected) sentences joined with single spaces. Whatever
        whitespace originally separated the sentences is therefore normalised.
    """
    # Group the corrections by sentence number once (keeping their relative
    # order) instead of rescanning the whole list for every sentence.
    by_sentence = {}
    for correction in corrections:
        by_sentence.setdefault(correction['sentence'], []).append(correction)

    corrected_sentences = []
    for number, sentence in enumerate(nltk.sent_tokenize(essay), start=1):
        for correction in by_sentence.get(number, ()):
            original = correction['original']
            if not original:
                # An empty pattern would match at every boundary and spray the
                # correction across the sentence; there is nothing to replace.
                continue
            # Lookarounds instead of \b: identical for originals that start and
            # end with a word character, but they also work when the original
            # starts/ends with punctuation (e.g. '@CAPS1', 'etc.'), where \b
            # can never match next to whitespace.
            pattern = r'(?<!\w)' + re.escape(original) + r'(?!\w)'
            replacement = correction['correction']
            # A callable replacement is inserted literally, so backslashes in
            # the correction are not interpreted as group references/escapes.
            sentence = re.sub(pattern, lambda _match: replacement, sentence)
        corrected_sentences.append(sentence)

    # Join the corrected sentences to form the final essay
    return ' '.join(corrected_sentences)

# Function to generate all combinations of corrections and apply them
def generate_corrected_versions(essay, corrections):
    """Build one corrected essay for every non-empty subset of ``corrections``.

    Subsets are enumerated by size (1, 2, ..., all corrections) and, within a
    size, in the order produced by ``itertools.combinations``. For ``n``
    corrections this yields ``2**n - 1`` results.

    Args:
        essay: Essay text handed unchanged to ``apply_corrections_grammar``.
        corrections: Sequence of correction dicts to combine.

    Returns:
        A list with the result of ``apply_corrections_grammar`` for each subset.
    """
    subset_sizes = range(1, len(corrections) + 1)
    every_subset = itertools.chain.from_iterable(
        itertools.combinations(corrections, size) for size in subset_sizes
    )
    return [apply_corrections_grammar(essay, subset) for subset in every_subset]

# # Example usage
# essay = "This is a tess essay. The break through was amazing. The possitive effects were clear."
# corrections = [
#     {'original': 'tess', 'correction': 'test', 'type': 'Spelling', 'sentence': 1},
#     {'original': 'break through', 'correction': 'breakthrough', 'type': 'Spelling', 'sentence': 2},
#     {'original': 'possitive', 'correction': 'positive', 'type': 'Spelling', 'sentence': 3},
# ]

# # Generate corrected versions with combinations of 1, 2, and 3 corrections
# corrected_versions = generate_corrected_versions(essay, corrections)
# corrected_versions

In [9]:
# Reload the sampled corrections that were written to disk earlier.
correction_test_100essay_5corr = joblib.load('files/correction_test_100essay_5corr')

corrected_test_100data_5corr = []

# Pair each essay with its corrections by *position*. The corrections are a
# plain list, so indexing them with the DataFrame's index label (as iterrows()
# yields) is only right while that index happens to be 0, 1, 2, ...
for position, essay in enumerate(data_100['essay']):
    try:
        corrected_essays = generate_corrected_versions(essay, correction_test_100essay_5corr[position])
        # Append the result to the list
        corrected_test_100data_5corr.append(corrected_essays)
    except Exception as e:
        # Best effort: report the failure and keep a placeholder so the output
        # list stays aligned with the rows of data_100.
        print(f"Error processing row {position}: {e}")
        corrected_test_100data_5corr.append(None)

In [10]:
# Sanity check on the result: the recorded output is (100, 31).
# NOTE(review): a rectangular 2-D shape is only possible when every entry has
# the same number of variants and none is None; otherwise the nested list is
# ragged — confirm how the installed NumPy version handles that case.
np.array(corrected_test_100data_5corr).shape

(100, 31)

In [11]:
# Write the list of generated essay variants to disk with joblib.
joblib.dump(corrected_test_100data_5corr, 'files/corrected_test_100data_5corr')

['files/corrected_test_100data_5corr']