In [1]:
import joblib
import pandas as pd
from nltk.tokenize import sent_tokenize
from openai import OpenAI
import json
from counterfactual_inter_sbert import get_corrections_from_LLM_grammar, get_corrections_from_LLM_longer,get_prompt_grammar, get_prompt_longer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the essay table from a tab-separated file. The preview below shows the
# columns: an unnamed index column, essay_id, essay (text) and domain1_score.
# NOTE(review): the path is relative to the notebook's working directory — confirm.
data = pd.read_csv('ASAP7 Train Set.tsv', sep='\t')
data.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19441,Patience is tough to achieve sometimes. But th...,22
1,18068,Patience a helpful trait to get you through li...,24
2,19000,There were many of times when I was patient. T...,18
3,18339,A time I was patiet was when I was wating for ...,24
4,18297,Do you ever fell like your @CAPS1 is going to ...,21


In [3]:
# Keep the first 100 rows (a head slice, not a random sample) as the subset
# whose essays are sent to OpenAI for corrections.
data_100 = data.iloc[:100]
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,19441,Patience is tough to achieve sometimes. But th...,22
1,18068,Patience a helpful trait to get you through li...,24
2,19000,There were many of times when I was patient. T...,18
3,18339,A time I was patiet was when I was wating for ...,24
4,18297,Do you ever fell like your @CAPS1 is going to ...,21


In [21]:
# Pull out one essay by position (row 72) to eyeball its raw text; the repr
# below shows '???' runs and a '\x92' character in place of an apostrophe.
essay = data_100['essay'].iloc[72]
essay

'I forget to grab my helmet and ??? pads out of my mom\x92s car cause she works until @NUM1 and at @NUM2 that ??? ??? ???. Then my patience showed up.'

In [22]:
# Split the currently selected essay into sentences with NLTK and list them
# with 1-based numbers. As the output below shows, each '???' run is broken
# into several tiny "sentences".
# NOTE(review): the LLM corrections carry a 'sentence' number — presumably it
# refers to this same tokenization, so these splits would shift it; confirm.
sentences = sent_tokenize(essay)

for i, sent in enumerate(sentences, start=1):
    print(f"Sentence {i}: {sent}")

Sentence 1: I forget to grab my helmet and ?
Sentence 2: ??
Sentence 3: pads out of my momÂ’s car cause she works until @NUM1 and at @NUM2 that ?
Sentence 4: ??
Sentence 5: ?
Sentence 6: ??
Sentence 7: ?
Sentence 8: ??.
Sentence 9: Then my patience showed up.


In [26]:
# ONLINE CORRECTIONS (REQUEST TO OPENAI's LLM)
# One-off request for a single essay (positional row 89). The result is kept
# in `corrections_grammar`, which a later cell writes into corrections[89].
# Note this cell overwrites the notebook-level `essay` variable.
model_name = 'gpt-4o-mini'
essay = data_100['essay'].iloc[89]
corrections_grammar = get_corrections_from_LLM_grammar(essay, model_name)
corrections_grammar

Requesting grammar corrections from LLM ... 


[{'original': 'Its',
  'correction': "It's",
  'type': 'Grammatical',
  'sentence': 4},
 {'original': 'your',
  'correction': "you're",
  'type': 'Grammatical',
  'sentence': 5},
 {'original': 'waiting for',
  'correction': 'waiting',
  'type': 'Word Choice',
  'sentence': 4},
 {'original': 'to propose, or ask',
  'correction': 'to propose or ask',
  'type': 'Punctuation',
  'sentence': 4},
 {'original': 'hard unless',
  'correction': "hard unless you're",
  'type': 'Grammatical',
  'sentence': 5}]

In [5]:
# Request grammar corrections for every essay in data_100 and collect the
# results in row order, so json_responses[k] lines up with data_100.iloc[k].
json_responses = []

# Track progress with a positional counter rather than the DataFrame index
# label: labels only equal 0..n-1 for an untouched default index, and
# `label + 1` fails outright for non-integer labels.
for position, (index, row) in enumerate(data_100.iterrows(), start=1):
    try:
        # Extract the essay text
        essay = row['essay']
        # Get the parsed corrections for this essay from the LLM helper
        json_response = get_corrections_from_LLM_grammar(essay, model_name='gpt-4o')
        # Append the result to the list
        json_responses.append(json_response)
    except Exception as e:
        # Best-effort batch: report the failing row (by index label) and store
        # a None placeholder so the list stays aligned with data_100.
        # Downstream cells must be prepared to see None entries.
        print(f"Error processing row {index}: {e}")
        json_responses.append(None)

    # Print progress every 10 iterations
    if position % 10 == 0:
        print(f"Processed {position} / {len(data_100)} essays")

Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Processed 10 / 100 essays
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Processed 20 / 100 essays
Requesting grammar corrections from LLM ... 
Req

In [6]:
# Sanity check: the loop appends exactly one entry (a result or a None
# placeholder) per essay, so this should equal the number of rows in data_100.
len(json_responses)

100

In [7]:
# Persist the batch results to disk with joblib.
# NOTE(review): assumes the 'files/' directory already exists — confirm.
joblib.dump(json_responses, 'files/json_responses_train_100')

['files/json_responses_train_100']

#### Error Corrections (e.g., essays with fewer than 5 corrections, which becomes a problem later)

In [30]:
# Reload the saved per-essay corrections list from disk.
# NOTE(review): joblib.load unpickles its input — only load files produced by
# this notebook or another trusted source.
corrections = joblib.load('files/json_responses_train_100')

In [27]:
# Inspect the stored corrections for row 89 (the output below shows only two
# entries, i.e. fewer than the 5 this section is concerned with).
corrections[89]

[{'original': 'Its',
  'correction': "It's",
  'type': 'Grammatical',
  'sentence': 4},
 {'original': 'your',
  'correction': "you're",
  'type': 'Grammatical',
  'sentence': 5}]

In [28]:
# Replace the stored result for row 89 with `corrections_grammar`, the one-off
# re-request made in the "ONLINE CORRECTIONS" cell. That cell must have been run
# in this kernel session, otherwise the name is undefined.
# NOTE(review): the re-request used 'gpt-4o-mini' while the batch loop used
# 'gpt-4o', so this entry comes from a different model — confirm that is intended.
corrections[89] = corrections_grammar

In [29]:
# Write the patched list back, overwriting the file saved after the batch run
# (same path), so the original batch output for row 89 is no longer on disk.
joblib.dump(corrections, 'files/json_responses_train_100')

['files/json_responses_train_100']

#### Check for Missing Corrections (fewer than 5)

In [32]:
# Scan every stored result and report the rows that need attention: entries
# that are not a list/tuple at all (e.g. failed requests), and entries that
# hold fewer than 5 corrections.
for row_idx, entry in enumerate(corrections):
    if isinstance(entry, (list, tuple)):
        if len(entry) < 5:
            print(f"Row {row_idx}: length = {len(entry)}")
    else:
        print(f"Row {row_idx}: not a list (type={type(entry)})")