In [1]:
import os
import re

from openai import OpenAI

In [2]:
# The API key is read from the environment rather than written into the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
# Shared client object; ask_gpt sends every request through it.
client = OpenAI(api_key=api_key)

In [3]:
def read_paragraph_files(directory):
    """Load every ``.txt`` file in ``directory`` as one stripped paragraph.

    Files are read in natural (human) order, so ``2.txt`` comes before
    ``10.txt``. A plain lexicographic sort would yield ``1, 10, 2, ...`` and
    silently break callers that compare *consecutive* paragraphs.

    Args:
        directory: Path of the folder holding the paragraph files.

    Returns:
        list[str]: One entry per ``.txt`` file, with leading and trailing
        whitespace removed (an empty file yields an empty string).
    """
    def natural_key(filename):
        # re.split with a capturing group alternates text / digit runs, so the
        # odd positions are always the numeric chunks. Comparing those as ints
        # gives numeric ordering; the raw name is a deterministic tie-breaker
        # for names that differ only in zero padding ("1.txt" vs "01.txt").
        chunks = re.split(r'(\d+)', filename)
        parsed = [int(chunk) if index % 2 else chunk
                  for index, chunk in enumerate(chunks)]
        return parsed, filename

    text_files = sorted(
        (name for name in os.listdir(directory) if name.endswith('.txt')),
        key=natural_key,
    )

    paragraphs = []
    for name in text_files:
        with open(os.path.join(directory, name), 'r', encoding='utf-8') as f:
            paragraphs.append(f.read().strip())
    return paragraphs

def generate_simple_prompt(paragraph1, paragraph2):
    """Build the bare same-author question for two texts.

    Args:
        paragraph1: First text to compare.
        paragraph2: Second text to compare.

    Returns:
        str: Both texts followed by a one-sentence instruction, with no
        scoring rubric and no output-format requirement.
    """
    sections = [
        "Given the two texts:",
        f"Paragraph 1:\n{paragraph1}",
        f"Paragraph 2:\n{paragraph2}",
        "Determine if the two texts are written by the same author.",
    ]
    # Every section, including the last one, is followed by a blank line.
    return "".join(section + "\n\n" for section in sections)

In [4]:
def generate_prompt(paragraph1, paragraph2):
    """Build the feature-by-feature authorship prompt for two texts.

    The prompt lists five linguistic feature groups, asks for a 0-1 score and
    an explanation per feature, and asks for an overall score defined as the
    average of the five, all returned as JSON.

    Compared with the earlier wording, three text defects are repaired: the
    unbalanced bold marker on feature 5, the stray ")" / missing verb in the
    averaging instruction, and "a overall".

    Args:
        paragraph1: First text to compare.
        paragraph2: Second text to compare.

    Returns:
        str: The complete prompt.
    """
    prompt = (
        "Given the two texts:\n\n"
        f"Paragraph 1:\n{paragraph1}\n\n"
        f"Paragraph 2:\n{paragraph2}\n\n"
        "On a scale of 0 to 1, with 0 indicating low confidence and 1 indicating high confidence, please provide a general assessment of the likelihood that the two texts were written by the same author. Your answer should reflect a moderate level of strictness in scoring, disregarding differences in topic and content. Focus on the following linguistic features to determine if the texts are likely written by the same author\n\n"
        "1.**Punctuation Style**: Hyphens, brackets, colons, commas, parentheses, quotation marks\n"
        "2.**Lexical and Grammatical Features**: Lexical variation and word choice; Grammatical categories and part of speech usage\n"
        "3.**Sentence Structure and Quantitative Features**: Sentence complexity, length, and arrangement; Coherence and cohesion; Word, clause, and sentence length, frequency, and distributions\n"
        "4.**Text and Discourse Features**: Narrative styles and speech events; Common expressions, idioms, tone and mood\n"
        # Closing marker was a single "*", which left the bold span unbalanced.
        "5.**Spelling and Typographical Errors**: Spelling mistakes and typographical errors\n\n"
        "In your analysis, give equal attention to identifying both the commonalities and distinctions between the texts to assess whether they share a similar writing style indicative of the same author.\n"
        "First step: Understand the problem, Give the score of each feature. Then, carry out the plan and solve the problem step by step. Finally, show the overall confidence score, which is the AVERAGE of all the 5 features confidence scores above.\n\n"
        "Respond in a standard JSON format like: for each feature (name it feature1-5), there should be two keys and values (explanation and score), and finally there should be an overall explanation and score."
    )
    return prompt

In [11]:
def generate_cot_prompt(paragraph1, paragraph2):
    """Build the compact scoring prompt for two texts.

    Despite the name, the prompt text does not request step-by-step
    reasoning: it asks for a single 0-1 same-author score plus a brief
    explanation, returned as JSON under the keys 'explanation' and 'score'.

    Args:
        paragraph1: First text to compare.
        paragraph2: Second text to compare.

    Returns:
        str: The complete prompt.
    """
    # The two texts under comparison.
    texts = f"Given the two texts:\n\nParagraph 1:\n{paragraph1}\n\nParagraph 2:\n{paragraph2}\n\n"
    # What to score and how strict to be.
    task = "On a scale of 0 to 1, with 0 indicating low confidence and 1 indicating high confidence, please provide a general assessment of the likelihood that the two texts were written by the same author. Your answer should reflect a moderate level of strictness in scoring, disregarding differences in topic and content. Focus on the linguistic features to determine if the texts are likely written by the same author\n\n"
    guidance = "In your analysis, give equal attention to identifying both the commonalities and distinctions between the texts to assess whether they share a similar writing style indicative of the same author.\n"
    # Required shape of the reply.
    reply_format = "Respond in a standard JSON format, with only a brief explanation (key 'explanation') and a score (key 'score')."
    return texts + task + guidance + reply_format

In [5]:
def ask_gpt(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.7):
    """Send ``prompt`` to the chat-completions API in JSON mode and return the reply text.

    The request goes through the module-level ``client``. The defaults
    reproduce the settings that used to be hard-coded here.

    Args:
        prompt: Text of the user message. JSON mode is always requested, and
            the OpenAI documentation states that the messages must then
            themselves instruct the model to produce JSON.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the length of the reply, in tokens.
        temperature: Sampling temperature passed to the API.

    Returns:
        The ``content`` of the first choice's message, unparsed.
    """
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': str(prompt)},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    first_choice = response.choices[0]
    # A reply that stops at the token limit is cut off mid-document, so the
    # returned JSON will most likely not parse. Surface that instead of
    # letting it pass unnoticed into the saved results.
    if first_choice.finish_reason == "length":
        print(f"Warning: reply reached the {max_tokens}-token limit; the returned JSON is probably truncated.")
    return first_choice.message.content

In [17]:
def generate_solutions(base_dir, solution_base_dir, start_index=322,
                       prompt_fn=None, skip_existing=False):
    """Query the model for every pair of consecutive paragraphs in each problem.

    For each sub-directory of ``base_dir`` (sorted by name, starting at
    ``start_index``) the paragraphs are loaded with ``read_paragraph_files``.
    Each adjacent pair in which both paragraphs are non-empty is turned into a
    prompt, sent through ``ask_gpt``, and the raw reply is stored with
    ``save_result`` as
    ``<solution_base_dir>/<problem>/para_<i>_and_<i+1>.json`` (1-based).

    Args:
        base_dir: Folder containing one sub-directory per problem.
        solution_base_dir: Folder that receives one result sub-directory per
            problem; created on demand.
        start_index: Number of leading problem directories (in sorted order)
            to skip. Defaults to 322, the offset that used to be hard-coded
            in this function; pass 0 to process every problem.
        prompt_fn: Callable ``(paragraph1, paragraph2) -> str`` that builds
            the prompt. Defaults to ``generate_cot_prompt``.
        skip_existing: When true, a pair whose result file already exists is
            not sent to the model again, so an interrupted run can be resumed
            without adjusting ``start_index``.
    """
    if prompt_fn is None:
        # Resolved at call time so this cell does not depend on the prompt
        # cell having been executed before it.
        prompt_fn = generate_cot_prompt

    problem_dirs = sorted(
        entry for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    )[start_index:]

    for problem_dir in problem_dirs:
        print(f"Processing directory: {problem_dir}")
        paragraphs = read_paragraph_files(os.path.join(base_dir, problem_dir))

        # One output folder per problem, mirroring the input layout.
        result_dir = os.path.join(solution_base_dir, problem_dir)
        os.makedirs(result_dir, exist_ok=True)

        # zip pairs every paragraph with its immediate successor.
        for i, (paragraph1, paragraph2) in enumerate(zip(paragraphs, paragraphs[1:])):
            if not (paragraph1 and paragraph2):
                continue  # nothing to compare when either side is empty

            result_filename = f"para_{i+1}_and_{i+2}.json"
            if skip_existing and os.path.exists(os.path.join(result_dir, result_filename)):
                continue

            result = ask_gpt(prompt_fn(paragraph1, paragraph2))
            save_result(result_dir, result_filename, result)

In [8]:
def save_result(directory, filename, content):
    """Write ``content`` to ``directory/filename`` as UTF-8 text.

    An existing file at that path is overwritten. ``directory`` itself is not
    created here.
    """
    destination = os.path.join(directory, filename)
    with open(destination, mode='w', encoding='utf-8') as out_file:
        out_file.write(content)

In [23]:
# Training problems in, model replies out (one sub-folder per problem).
base_dir, solution_base_dir = 'data/train_processed', 'data/train_solution'

generate_solutions(base_dir=base_dir, solution_base_dir=solution_base_dir)

Processing directory: problem-28
Processing directory: problem-280
Processing directory: problem-2800
Processing directory: problem-2801
Processing directory: problem-2802
Processing directory: problem-2803
Processing directory: problem-2804
Processing directory: problem-2805
Processing directory: problem-2806
Processing directory: problem-2807
Processing directory: problem-2808
Processing directory: problem-2809
Processing directory: problem-281
Processing directory: problem-2810
Processing directory: problem-2811
Processing directory: problem-2812
Processing directory: problem-2813
Processing directory: problem-2814
Processing directory: problem-2815
Processing directory: problem-2816
Processing directory: problem-2817
Processing directory: problem-2818
Processing directory: problem-2819
Processing directory: problem-282
Processing directory: problem-2820
Processing directory: problem-2821
Processing directory: problem-2822
Processing directory: problem-2823
Processing directory: pro

In [13]:
# Validation problems in, model replies out (one sub-folder per problem).
base_dir, solution_base_dir = 'data/validation_processed', 'data/validation_solution'

generate_solutions(base_dir=base_dir, solution_base_dir=solution_base_dir)

Processing directory: problem-294
Processing directory: problem-260
Processing directory: problem-604
Processing directory: problem-436
Processing directory: problem-409
Processing directory: problem-851
Processing directory: problem-431
Processing directory: problem-603
Processing directory: problem-267
Processing directory: problem-869
Processing directory: problem-293
Processing directory: problem-856
Processing directory: problem-258
Processing directory: problem-251
Processing directory: problem-407
Processing directory: problem-635
Processing directory: problem-438
Processing directory: problem-860
Processing directory: problem-894
Processing directory: problem-632
Processing directory: problem-400
Processing directory: problem-858
Processing directory: problem-256
Processing directory: problem-893
Processing directory: problem-269
Processing directory: problem-867
Processing directory: problem-833
Processing directory: problem-659
Processing directory: problem-692
Processing dir

### Generate solutions with CoT prompt

In [18]:
# Same training input as before, written to a separate output folder.
base_dir, solution_base_dir = 'data/train_processed', 'data/train_solution_cot'

generate_solutions(base_dir=base_dir, solution_base_dir=solution_base_dir)

Processing directory: problem-1289
Processing directory: problem-129
Processing directory: problem-1290
Processing directory: problem-1291
Processing directory: problem-1292
Processing directory: problem-1293
Processing directory: problem-1294
Processing directory: problem-1295
Processing directory: problem-1296
Processing directory: problem-1297
Processing directory: problem-1298
Processing directory: problem-1299
Processing directory: problem-13
Processing directory: problem-130
Processing directory: problem-1300
Processing directory: problem-1301
Processing directory: problem-1302
Processing directory: problem-1303
Processing directory: problem-1304
Processing directory: problem-1305
Processing directory: problem-1306
Processing directory: problem-1307
Processing directory: problem-1308
Processing directory: problem-1309
Processing directory: problem-131
Processing directory: problem-1310
Processing directory: problem-1311
Processing directory: problem-1312
Processing directory: pro