In [1]:
import joblib
import pandas as pd
import json
import numpy as np
from create_text_error import create_typos_from_text, replace_sampled_prepositions, break_sva

In [2]:
# Load the tab-separated ASAP1 test-set file into a DataFrame
# (read_table uses a tab delimiter by default) and preview the first rows.
data = pd.read_table('ASAP1 Test Set.tsv')
data.head()

Unnamed: 0,essay_id,essay,domain1_score
0,827,I think computers have a postitive affect on p...,6
1,1477,I blive that computers have a lot of effects o...,6
2,234,Many people think that computers are not a goo...,8
3,801,"Dear Newspaper people, @CAPS1 you might heard ...",9
4,780,More and more people are using computers on a ...,9


In [3]:
# Working subset used to get corrections from OpenAI: the first 100 rows of
# the test set, taken positionally (a head slice, not a random draw).
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,827,I think computers have a postitive affect on p...,6
1,1477,I blive that computers have a lot of effects o...,6
2,234,Many people think that computers are not a goo...,8
3,801,"Dear Newspaper people, @CAPS1 you might heard ...",9
4,780,More and more people are using computers on a ...,9


In [4]:
# Pick one essay (positional row 20 of the subset) to try get_text_error on
essay = data_100['essay'].iloc[20]

def get_text_error(text):
    """Collect the errors produced by all three generators for one essay.

    Calls, in order, ``create_typos_from_text``, ``replace_sampled_prepositions``
    and ``break_sva`` on ``text`` and joins their results with ``+``, so typo
    entries come first, then preposition entries, then subject-verb-agreement
    entries.

    Args:
        text: Essay text handed unchanged to each generator.

    Returns:
        The concatenation of the three generators' results. The sample output
        in this notebook shows a list of dicts with ``original``, ``error``
        and ``sentence`` keys.

    NOTE(review): the ``2`` passed to ``replace_sampled_prepositions`` is
    presumably the number of prepositions to replace -- confirm in
    ``create_text_error``.
    """
    error_groups = (
        create_typos_from_text(text),
        replace_sampled_prepositions(text, 2),
        break_sva(text),
    )
    typo_errors, preposition_errors, sva_errors = error_groups
    return typo_errors + preposition_errors + sva_errors

In [5]:
# Run the combined error generator on the single sample essay and display the result
get_text_error(essay)

[{'original': 'Newspaper', 'error': 'Newpsaper', 'sentence': 1},
 {'original': 'year', 'error': 'yera', 'sentence': 4},
 {'original': 'would', 'error': 'wolud', 'sentence': 5},
 {'original': 'opinion', 'error': 'poinion', 'sentence': 6},
 {'original': 'astonishing', 'error': 'astonihsing', 'sentence': 7},
 {'original': 'of', 'error': 'against', 'sentence': 37},
 {'original': 'of', 'error': 'around', 'sentence': 11},
 {'original': 'know', 'error': 'knows', 'sentence': 16},
 {'original': 'going', 'error': 'go', 'sentence': 12},
 {'original': 'seem', 'error': 'seemed', 'sentence': 4},
 {'original': 'have', 'error': 'has', 'sentence': 9},
 {'original': 'be', 'error': 'is', 'sentence': 23}]

In [6]:
# Collect one error list per essay, in the same row order as data_100.
error_lists = []

# enumerate(..., start=1) supplies the 1-based progress counter, so no
# hand-maintained counter is needed.
for ctr, (index, row) in enumerate(data_100.iterrows(), start=1):
    try:
        # Pass the essay text straight through instead of rebinding the
        # notebook-level `essay` variable that earlier cells inspect.
        error_lists.append(get_text_error(row['essay']))
    except Exception as e:
        # Best-effort: report the failure and keep a None placeholder so that
        # error_lists stays aligned one-to-one with the rows of data_100.
        print(f"Error processing row {index}: {e}")
        error_lists.append(None)
    print(f"Finished processing essay {ctr}")

Finished processing essay 1
Finished processing essay 2
Finished processing essay 3
Finished processing essay 4
Finished processing essay 5
Finished processing essay 6
Finished processing essay 7
Finished processing essay 8
Finished processing essay 9
Finished processing essay 10
Finished processing essay 11
Finished processing essay 12
Finished processing essay 13
Finished processing essay 14
Finished processing essay 15
Finished processing essay 16
Finished processing essay 17
Finished processing essay 18
Finished processing essay 19
Finished processing essay 20
Finished processing essay 21
Finished processing essay 22
Finished processing essay 23
Finished processing essay 24
Finished processing essay 25
Finished processing essay 26
Finished processing essay 27
Finished processing essay 28
Finished processing essay 29
Finished processing essay 30
Finished processing essay 31
Finished processing essay 32
Finished processing essay 33
Finished processing essay 34
Finished processing ess

In [7]:
# Sanity check: the loop appends exactly one entry (an error list or None) per row
len(error_lists)

100

In [8]:
# Build a 1-D object array with exactly one slot per essay.
# np.array(error_lists, dtype=object) would return a 2-D array instead if every
# inner list happened to have the same length, so fill a preallocated array
# element by element to keep the shape (len(error_lists),) for any data.
error_array = np.empty(len(error_lists), dtype=object)
for position, essay_errors in enumerate(error_lists):
    error_array[position] = essay_errors
print(error_array.shape)

(100,)


In [9]:
# Persist the per-essay error array to disk with joblib
joblib.dump(value=error_array, filename='files/error_lists_test_100')

['files/error_lists_test_100']

#### Get scores from the 100 sampled essays (test set)

In [10]:
# Pull the domain1_score column of the 100-essay subset and persist it with joblib
scores = data_100.loc[:, 'domain1_score']
joblib.dump(value=scores, filename="files/scores_essay_100_test_ori")

['files/scores_essay_100_test_ori']