In [1]:
import joblib
import pandas as pd
from nltk.tokenize import sent_tokenize
from openai import OpenAI
import json
from counterfactual_inter_sbert import get_corrections_from_LLM_grammar, get_corrections_from_LLM_longer,get_prompt_grammar, get_prompt_longer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the tab-separated ASAP training set and preview its first rows.
# read_table uses a tab separator by default.
data = pd.read_table('ASAP1 Train Set.tsv')
data.head()

Unnamed: 0,essay_id,essay,domain1_score
0,176,Computers and the @CAPS1 were a technological ...,10
1,632,People using computers is a good way for them ...,10
2,1108,"Dear local newspaper, @CAPS1 you can see, abou...",9
3,1747,"Dear local Newspaper, I think computer technog...",8
4,1016,"Dear @CAPS1 @CAPS2, More and more people use c...",8


In [3]:
# Keep the first 100 rows (not a random sample) as the subset sent to OpenAI
# for corrections.
data_100 = data.head(100)
data_100.head()

Unnamed: 0,essay_id,essay,domain1_score
0,176,Computers and the @CAPS1 were a technological ...,10
1,632,People using computers is a good way for them ...,10
2,1108,"Dear local newspaper, @CAPS1 you can see, abou...",9
3,1747,"Dear local Newspaper, I think computer technog...",8
4,1016,"Dear @CAPS1 @CAPS2, More and more people use c...",8


In [4]:
# Pick the essay at positional row 60 of the 100-row subset for inspection.
essay = data_100['essay'].iloc[60]
essay

'@ORGANIZATION1, @CAPS1 computers benefit our society has been a hotly debated topic among citizens in our town. Many argue that computers provide quick easy communication, help children and adults learn, and even teach balancing habits to our generation. I firmly believe that computers are necessary in our society. @CAPS2, @CAPS3! I miss you!" are the words @ORGANIZATION1 @NUM1-year-old @PERSON1. @PERSON1 has been working overseas for two years in @LOCATION2, @LOCATION3. One @ORGANIZATION1 the only ways she can contact her father is through @ORGANIZATION3. Communication is extremely important in our society; it brings families and friends closer together. Computers provide a variety @ORGANIZATION1 forms @ORGANIZATION1 communication, such as @ORGANIZATION3, @CAPS5, and instant messaging. "@ORGANIZATION3" allows people to have what can be called a "visual phone conversation." A teary-eyed @PERSON2 says, "@CAPS4 since @CAPS7 wife died, @CAPS7 daughter has been @CAPS7 only companion. With

In [5]:
# Split the selected essay into sentences with NLTK's sentence tokenizer.
sentences = sent_tokenize(essay)

# Show every sentence next to its 1-based position in the essay.
for number, sentence in enumerate(sentences, 1):
    print(f"Sentence {number}: {sentence}")

Sentence 1: @ORGANIZATION1, @CAPS1 computers benefit our society has been a hotly debated topic among citizens in our town.
Sentence 2: Many argue that computers provide quick easy communication, help children and adults learn, and even teach balancing habits to our generation.
Sentence 3: I firmly believe that computers are necessary in our society.
Sentence 4: @CAPS2, @CAPS3!
Sentence 5: I miss you!"
Sentence 6: are the words @ORGANIZATION1 @NUM1-year-old @PERSON1.
Sentence 7: @PERSON1 has been working overseas for two years in @LOCATION2, @LOCATION3.
Sentence 8: One @ORGANIZATION1 the only ways she can contact her father is through @ORGANIZATION3.
Sentence 9: Communication is extremely important in our society; it brings families and friends closer together.
Sentence 10: Computers provide a variety @ORGANIZATION1 forms @ORGANIZATION1 communication, such as @ORGANIZATION3, @CAPS5, and instant messaging.
Sentence 11: "@ORGANIZATION3" allows people to have what can be called a "visual 

In [12]:
# ONLINE CORRECTIONS (REQUEST TO OPENAI's LLM)
# Request grammar corrections for a single essay (positional row 60 of
# data_100) and display the returned corrections.
model_name = 'gpt-4o-mini'
essay = data_100['essay'].iloc[60]
corrections_grammar = get_corrections_from_LLM_grammar(essay, model_name)
corrections_grammar

Requesting grammar corrections from LLM ... 


[{'original': 'has been',
  'correction': 'have been',
  'type': 'Grammatical',
  'sentence': 1},
 {'original': 'quick easy',
  'correction': 'quick and easy',
  'type': 'Word Choice',
  'sentence': 2},
 {'original': 'One @ORGANIZATION1 the only ways',
  'correction': 'One of the only ways',
  'type': 'Grammatical',
  'sentence': 8},
 {'original': 'a variety @ORGANIZATION1 forms @ORGANIZATION1 communication',
  'correction': 'a variety of forms of communication',
  'type': 'Grammatical',
  'sentence': 10},
 {'original': '@CAPS4 since @CAPS7 wife died, @CAPS7 daughter has been @CAPS7 only companion.',
  'correction': "@CAPS4 since @CAPS7's wife died, @CAPS7's daughter has been @CAPS7's only companion.",
  'type': 'Grammatical',
  'sentence': 12},
 {'original': 'out society',
  'correction': 'our society',
  'type': 'Word Choice',
  'sentence': 15},
 {'original': 'balance, extra-curricular activities',
  'correction': 'balance and extra-curricular activities',
  'type': 'Word Choice',
  

In [6]:
# Collect one grammar-correction response per essay, in row order, so that
# json_responses[k] corresponds to the k-th row of data_100.
json_responses = []
total_essays = len(data_100)

# Track progress with an explicit 1-based position instead of deriving it from
# the DataFrame index label: the label only equals the position while the
# index is the default 0..n-1 range, and `index + 1` fails for non-integer
# labels. The label is still kept for the error message.
# NOTE(review): this batch uses 'gpt-4o', whereas the single-essay request in
# this notebook uses 'gpt-4o-mini' — confirm the difference is intended.
for position, (index, essay) in enumerate(data_100['essay'].items(), start=1):
    try:
        # Request the corrections for this essay and store them
        json_response = get_corrections_from_LLM_grammar(essay, model_name='gpt-4o')
        json_responses.append(json_response)
    except Exception as e:
        # Best-effort batch: report the failure and store a None placeholder so
        # the list stays aligned with the rows of data_100.
        print(f"Error processing row {index}: {e}")
        json_responses.append(None)

    # Print progress every 10 essays
    if position % 10 == 0:
        print(f"Processed {position} / {total_essays} essays")

Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Processed 10 / 100 essays
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Requesting grammar corrections from LLM ... 
Processed 20 / 100 essays
Requesting grammar corrections from LLM ... 
Req

In [7]:
# Sanity check: the loop appends exactly one entry (a response or a None
# placeholder) per essay, so this should equal the number of rows in data_100.
len(json_responses)

100

In [8]:
# Persist the collected responses to disk with joblib so they can be reloaded
# from 'files/json_responses_train_100'.
joblib.dump(json_responses, 'files/json_responses_train_100')

['files/json_responses_train_100']

#### Error Corrections (e.g., fewer than 5 corrections returned, which becomes a problem later)

In [13]:
# Reload the saved responses from disk under the name `corrections`.
# NOTE(review): joblib.load unpickles the file, so only load files that come
# from a trusted source.
corrections = joblib.load('files/json_responses_train_100')

In [16]:
# Overwrite the stored entry at position 60 with `corrections_grammar`, the
# response requested separately for data_100.iloc[60].
# NOTE(review): that separate request used 'gpt-4o-mini' while the batch loop
# used 'gpt-4o' — confirm mixing models in the saved file is intended.
# NOTE(review): relies on `corrections_grammar` already existing in the kernel
# from the single-essay request cell.
corrections[60] = corrections_grammar

In [19]:
# Write the patched list back to the same path, replacing the file written by
# the earlier dump of the unpatched batch responses.
joblib.dump(corrections, 'files/json_responses_train_100')

['files/json_responses_train_100']