In [1]:
import joblib
import pandas as pd
import nltk
import re
import itertools
import random
import numpy as np

In [2]:
# Read the tab-separated ASAP training set, then keep only the first 100 rows
# (selected by position) as the working subset and preview it.
data = pd.read_csv('ASAP1 Train Set.tsv', delimiter='\t')
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,176,Computers and the @CAPS1 were a technological ...,10
1,632,People using computers is a good way for them ...,10
2,1108,"Dear local newspaper, @CAPS1 you can see, abou...",9
3,1747,"Dear local Newspaper, I think computer technog...",8
4,1016,"Dear @CAPS1 @CAPS2, More and more people use c...",8


In [3]:
# Load the previously saved correction responses.
# Presumably a list with one inner list of correction dicts per essay — the
# sampling cell below iterates it as a list of lists; verify against the
# producer of this file.
# NOTE(review): joblib.load unpickles its input, so only load files from a
# trusted source.
corrections = joblib.load('files/json_responses_train_100')
#correction_train_100essay_5corr = joblib.load('correction_train_100essay_5corr')

In [6]:
# Function to sample up to k (default 5) dictionaries from each inner list
def sample_five_from_each(inner_lists, k=5, seed=None):
    """Randomly sample up to ``k`` items, without replacement, from each inner list.

    Args:
        inner_lists: Iterable of sequences (e.g. one list of correction dicts
            per essay). The inputs are not modified.
        k: Maximum number of items to draw from each inner list. Inner lists
            shorter than ``k`` are returned in full (in random order).
            Defaults to 5, the original hard-coded sample size.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global random state is
            left untouched. When ``None`` (default) the module-level ``random``
            generator is used, exactly as before.

    Returns:
        A list with one sampled list per inner list, in the same order.

    Raises:
        ValueError: If ``k`` is negative (propagated from ``random.sample``).
    """
    sampler = random if seed is None else random.Random(seed)
    return [sampler.sample(inner_list, min(len(inner_list), k)) for inner_list in inner_lists]

# Sample 5 dictionaries from each inner list
# NOTE(review): the draw is unseeded, so every re-run of this cell produces a
# different sample; seed the `random` module first if the downstream files
# must be reproducible.
sampled_corrections = sample_five_from_each(corrections)

# Report the sample dimensions without building a NumPy array: an inner list
# with fewer than 5 corrections makes the nested list ragged, and
# np.array(...) raises on ragged input in recent NumPy versions.
sample_sizes = sorted({len(sampled_list) for sampled_list in sampled_corrections})
if len(sample_sizes) == 1:
    # Uniform case: same "(n_lists, n_samples)" output as the array shape.
    print((len(sampled_corrections), sample_sizes[0]))
else:
    print(f"{len(sampled_corrections)} lists with differing sample sizes: {sample_sizes}")

# Print the results
for idx, sampled_list in enumerate(sampled_corrections):
    print(f"Sampled data for list {idx + 1}:")
    for item in sampled_list:
        print(item)
    print()


(100, 5)
Sampled data for list 1:
{'original': 'meton', 'correction': 'met on', 'type': 'Spelling', 'sentence': 10}
{'original': 'offose', 'correction': 'oppose', 'type': 'Spelling', 'sentence': 4}
{'original': 'then', 'correction': 'them', 'type': 'Spelling', 'sentence': 12}
{'original': 'main cause to', 'correction': 'main cause of', 'type': 'Word Choice', 'sentence': 7}
{'original': 'cutt', 'correction': 'cut', 'type': 'Spelling', 'sentence': 19}

Sampled data for list 2:
{'original': 'reccommend', 'correction': 'recommend', 'type': 'Spelling', 'sentence': 5}
{'original': 'milli-seconds', 'correction': 'milliseconds', 'type': 'Spelling', 'sentence': 3}
{'original': 'woh', 'correction': 'who', 'type': 'Spelling', 'sentence': 12}
{'original': 'wonder ful', 'correction': 'wonderful', 'type': 'Spelling', 'sentence': 2}
{'original': 'let', 'correction': 'lets', 'type': 'Grammatical', 'sentence': 8}

Sampled data for list 3:
{'original': 'to:', 'correction': 'too,', 'type': 'Punctuation',

In [7]:
# Persist the sampled corrections to disk; a later cell reloads this file
# before generating the corrected essays.
joblib.dump(sampled_corrections, 'files/correction_train_100essay_5corr')

['files/correction_train_100essay_5corr']

In [8]:
# Helper: build a whole-word regex for the text a correction wants to replace
def _whole_word_pattern(original):
    """Compile a regex matching ``original`` literally as a whole word or phrase.

    A word-boundary anchor is added only on a side where ``original`` begins or
    ends with a word character. Anchoring unconditionally on both sides makes
    text such as 'to:' unmatchable when it is followed by a space, because no
    word boundary exists between ':' and ' '.

    ``original`` must be a non-empty string.
    """
    prefix = r'\b' if re.match(r'\w', original[0]) else ''
    suffix = r'\b' if re.match(r'\w', original[-1]) else ''
    return re.compile(prefix + re.escape(original) + suffix)


# Function to apply corrections to the essay
def apply_corrections_grammar(essay, corrections):
    """Apply sentence-scoped corrections to an essay and return the new text.

    The essay is split into sentences with ``nltk.sent_tokenize``. Each
    correction is applied only to the sentence whose 1-based position equals
    ``correction['sentence']``, replacing every whole-word occurrence of
    ``correction['original']`` with ``correction['correction']``. Corrections
    targeting the same sentence are applied in the order given.

    Args:
        essay: The essay text.
        corrections: Iterable (list or tuple) of dicts providing the keys
            'original', 'correction' and 'sentence'.

    Returns:
        The corrected sentences joined with single spaces. Because of the
        re-join, the whitespace between sentences in the input (for example
        line breaks) is not preserved.
    """
    # Tokenize the essay into sentences using NLTK
    sentences = nltk.sent_tokenize(essay)

    corrected_sentences = []
    for sentence_number, sentence in enumerate(sentences, start=1):
        for correction in corrections:
            # 'sentence' in a correction is 1-based
            if correction['sentence'] != sentence_number:
                continue

            original = correction['original']
            if not original:
                # An empty pattern would match at every word boundary and
                # insert the replacement throughout the sentence; skip it.
                continue

            replacement = correction['correction']
            # Use a function as the replacement so the correction text is
            # inserted literally; a plain string would be treated as a
            # template in which backslashes and group references are special.
            sentence = _whole_word_pattern(original).sub(
                lambda _match, text=replacement: text, sentence
            )
        corrected_sentences.append(sentence)

    # Join the corrected sentences to form the final essay
    return ' '.join(corrected_sentences)

# Function to build one corrected essay per non-empty subset of corrections
def generate_corrected_versions(essay, corrections):
    """Return the essay corrected with every non-empty combination of corrections.

    Combinations are enumerated by increasing size (1, 2, ..., all corrections)
    and, within a size, in ``itertools.combinations`` order. Each combination
    is passed to ``apply_corrections_grammar`` as a tuple, so ``n`` corrections
    yield ``2**n - 1`` essays; an empty ``corrections`` yields an empty list.
    """
    subset_sizes = range(1, len(corrections) + 1)
    subsets = itertools.chain.from_iterable(
        itertools.combinations(corrections, size) for size in subset_sizes
    )
    return [apply_corrections_grammar(essay, subset) for subset in subsets]

# # Example usage
# essay = "This is a tess essay. The break through was amazing. The possitive effects were clear."
# corrections = [
#     {'original': 'tess', 'correction': 'test', 'type': 'Spelling', 'sentence': 1},
#     {'original': 'break through', 'correction': 'breakthrough', 'type': 'Spelling', 'sentence': 2},
#     {'original': 'possitive', 'correction': 'positive', 'type': 'Spelling', 'sentence': 3},
# ]

# # Generate corrected versions with combinations of 1, 2, and 3 corrections
# corrected_versions = generate_corrected_versions(essay, corrections)
# corrected_versions

In [9]:
# Reload the sampled corrections that were written to disk in an earlier cell.
correction_train_100essay_5corr = joblib.load('files/correction_train_100essay_5corr')

# One entry per essay: the list of corrected versions, or None if generation failed.
corrected_train_100data_5corr = []

# Pair each essay with its corrections by position rather than by DataFrame
# index label: the corrections are stored in a plain list, so a label only
# points at the right entry while the index happens to be 0..n-1.
for position, essay in enumerate(data_100['essay']):
    try:
        corrected_essays = generate_corrected_versions(
            essay, correction_train_100essay_5corr[position]
        )
    except Exception as e:
        # Best effort: report the failure and keep a None placeholder so the
        # result list stays aligned with the rows of data_100.
        print(f"Error processing row {position}: {e}")
        corrected_essays = None
    corrected_train_100data_5corr.append(corrected_essays)

In [12]:
# Dimensions of the nested result: (number of essays, corrected versions per essay).
# NOTE(review): this assumes every essay produced the same number of versions
# and none failed (no None entries) — otherwise the nested list is ragged.
np.shape(corrected_train_100data_5corr)

(100, 31)

In [13]:
# Persist the generated corrected essays: one list of versions per essay,
# with None in place of any essay whose generation raised an error.
joblib.dump(corrected_train_100data_5corr, 'files/corrected_train_100data_5corr')

['files/corrected_train_100data_5corr']