In [1]:
import joblib
import pandas as pd
import json
import numpy as np
from create_text_error import create_typos_from_text, replace_sampled_prepositions, break_sva

In [2]:
# Read the tab-separated 'ASAP7 Test Set.tsv' file into a DataFrame
# (read_table splits on tabs by default) and preview the first rows.
data = pd.read_table('ASAP7 Test Set.tsv')
data.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19392,My at saying your patient is. Your patient whe...,12
1,18926,Once upon a time there was a young girl named ...,18
2,18318,Patience is hard. You always @CAPS1 it now not...,14
3,18904,"Patience, @CAPS6 essential part of our world. ...",14
4,19131,One time I was pacient was when my mum said th...,8


In [3]:
# Working subset: the first 100 rows of the test set, taken in file order
# (a positional slice, not a random draw). The original note says this subset
# is meant for getting corrections from OpenAI.
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19392,My at saying your patient is. Your patient whe...,12
1,18926,Once upon a time there was a young girl named ...,18
2,18318,Patience is hard. You always @CAPS1 it now not...,14
3,18904,"Patience, @CAPS6 essential part of our world. ...",14
4,19131,One time I was pacient was when my mum said th...,8


In [5]:
# Pick a single essay (row position 20 of the subset) to try the generator on
essay = data_100['essay'].iloc[20]

def get_text_error(text, n_prepositions=2):
    """Collect the synthetic error records generated for one essay.

    Runs the three generators imported from ``create_text_error`` on the same,
    unmodified ``text`` and concatenates their results in a fixed order:
    typos first, then preposition replacements, then subject-verb-agreement
    breaks.

    Args:
        text: Essay text passed unchanged to each generator.
        n_prepositions: Second argument forwarded to
            ``replace_sampled_prepositions``. Defaults to 2, the value that
            used to be hard-coded here. Presumably the number of prepositions
            to replace -- TODO confirm against ``create_text_error``.

    Returns:
        The ``+``-concatenation of the three generator results. In the sample
        output of this notebook that is a list of dicts with the keys
        ``'original'``, ``'error'`` and ``'sentence'``.
    """
    typos = create_typos_from_text(text)
    preps = replace_sampled_prepositions(text, n_prepositions)
    svas = break_sva(text)
    return typos + preps + svas

In [6]:
# Run the error generator on the single sample essay and display its records.
get_text_error(essay)

[{'original': 'experience', 'error': 'experiecne', 'sentence': 1},
 {'original': 'person', 'error': 'eprson', 'sentence': 2},
 {'original': 'brother', 'error': 'borther', 'sentence': 3},
 {'original': 'dad', 'error': 'add', 'sentence': 4},
 {'original': 'brother', 'error': 'brohter', 'sentence': 5},
 {'original': 'with', 'error': 'of', 'sentence': 15},
 {'original': 'of', 'error': 'on', 'sentence': 4},
 {'original': 'think', 'error': 'thinks', 'sentence': 20},
 {'original': 'applied', 'error': 'apply', 'sentence': 7},
 {'original': 'born', 'error': 'bears', 'sentence': 9},
 {'original': 'likes', 'error': 'like', 'sentence': 19},
 {'original': 'doesn\x92t', 'error': 'doesn\x92ted', 'sentence': 18}]

In [7]:
# Generate synthetic errors for every essay in the 100-essay subset.
# error_lists gets exactly one entry per row of data_100, in row order: the
# list returned by get_text_error, or None when processing that row raised.
error_lists = []

# The essay text is bound to `essay_text`, not `essay`, so this loop does not
# overwrite the notebook-level `essay` chosen in the single-sample cell;
# re-running that cell's get_text_error(essay) call afterwards still uses the
# same sample. enumerate(start=1) replaces the hand-maintained counter.
for ctr, (index, row) in enumerate(data_100.iterrows(), start=1):
    try:
        essay_text = row['essay']
        error_lists.append(get_text_error(essay_text))
    except Exception as e:
        # Best effort: report the failure and keep the position filled so the
        # results stay aligned with the rows of data_100.
        print(f"Error processing row {index}: {e}")
        error_lists.append(None)
    print(f"Finished processing essay {ctr}")

Finished processing essay 1
Finished processing essay 2
Finished processing essay 3
Finished processing essay 4
Finished processing essay 5
Finished processing essay 6
Finished processing essay 7
Finished processing essay 8
Finished processing essay 9
Finished processing essay 10
Finished processing essay 11
Finished processing essay 12
Finished processing essay 13
Finished processing essay 14
Finished processing essay 15
Finished processing essay 16
Finished processing essay 17
Finished processing essay 18
Finished processing essay 19
Finished processing essay 20
Finished processing essay 21
Finished processing essay 22
Finished processing essay 23
Finished processing essay 24
Finished processing essay 25
Finished processing essay 26
Finished processing essay 27
Finished processing essay 28
Finished processing essay 29
Finished processing essay 30
Finished processing essay 31
Finished processing essay 32
Finished processing essay 33
Finished processing essay 34
Finished processing ess

In [8]:
# Number of collected results. Essays whose processing failed contribute a None
# entry, so this counts processed rows rather than successes.
len(error_lists)

100

In [9]:
# Pack the per-essay results into a 1-D object array with one slot per essay.
# np.array(error_lists, dtype=object) is avoided on purpose: if every inner
# list happens to have the same length, NumPy builds a 2-D
# (n_essays, n_errors) array instead and the rows are no longer Python lists.
error_array = np.empty(len(error_lists), dtype=object)
for slot, essay_errors in enumerate(error_lists):
    # Single-item assignment stores the list (or None) itself in the slot.
    error_array[slot] = essay_errors
print(error_array.shape)

(100,)


In [10]:
# Persist the object array of per-essay error lists; the returned list of
# written file names is shown as the cell output.
joblib.dump(value=error_array, filename='files/error_lists_test_100')

['files/error_lists_test_100']

In [11]:
# Print one line per entry of error_array: the number of error records when the
# entry is a list/tuple, otherwise the entry's type (e.g. the None stored for
# an essay whose processing failed).
for row_no in range(len(error_array)):
    entry = error_array[row_no]
    if not isinstance(entry, (list, tuple)):
        print(f"Row {row_no}: not a list (type={type(entry)})")
        continue
    print(f"Row {row_no}: length = {len(entry)}")


Row 0: length = 11
Row 1: length = 12
Row 2: length = 7
Row 3: length = 12
Row 4: length = 9
Row 5: length = 7
Row 6: length = 9
Row 7: length = 8
Row 8: length = 11
Row 9: length = 12
Row 10: length = 12
Row 11: length = 12
Row 12: length = 12
Row 13: length = 12
Row 14: length = 7
Row 15: length = 11
Row 16: length = 12
Row 17: length = 12
Row 18: length = 10
Row 19: length = 12
Row 20: length = 12
Row 21: length = 12
Row 22: length = 10
Row 23: length = 12
Row 24: length = 12
Row 25: length = 11
Row 26: length = 12
Row 27: length = 12
Row 28: length = 12
Row 29: length = 7
Row 30: length = 10
Row 31: length = 12
Row 32: length = 12
Row 33: length = 12
Row 34: length = 10
Row 35: length = 12
Row 36: length = 12
Row 37: length = 9
Row 38: length = 12
Row 39: length = 12
Row 40: length = 12
Row 41: length = 12
Row 42: length = 12
Row 43: length = 12
Row 44: length = 12
Row 45: length = 12
Row 46: length = 12
Row 47: length = 12
Row 48: length = 12
Row 49: length = 7
Row 50: length = 12

#### Get scores for the 100 sampled essays (test set)

In [12]:
# Pull the domain1_score column of the 100-essay subset and persist it next to
# the generated error lists.
scores = data_100.loc[:, 'domain1_score']
joblib.dump(value=scores, filename="files/scores_essay_100_test_ori")

['files/scores_essay_100_test_ori']