In [1]:
import pandas as pd
import os
import json
import random
from sklearn.model_selection import train_test_split
import re

In [19]:
import json
import re
from sklearn.model_selection import train_test_split

def _turkish_lower(text):
    """Lower-case ``text`` using Turkish casing for the dotless/dotted I pair."""
    # str.lower() maps "I" to "i", so "HAYIR".lower() is "hayir" and can never
    # contain "hayır". Translate the two Turkish capitals before lowering.
    return text.replace("I", "ı").replace("İ", "i").lower()


def _collapse_whitespace(text):
    """Replace every run of two or more whitespace characters with one space."""
    return re.sub(r'\s{2,}', ' ', text)


def _split_agreement(reasoning):
    """Return ``(agreement, reasoning_body)`` for a single reasoning string.

    The text before the first '.' is treated as the first sentence. If it
    contains "evet" or "hayır" (checked in that order, case-insensitively),
    the matching label is returned and the first sentence is dropped from the
    reasoning. Otherwise the label is "Unknown" and the reasoning is returned
    unchanged.
    """
    first_sentence, _, remainder = reasoning.partition('.')
    normalized = _turkish_lower(first_sentence.strip())
    for keyword, label in (("evet", "Evet"), ("hayır", "Hayır")):
        if keyword in normalized:
            # NOTE: when the reasoning has no '.', the remainder is empty and
            # the whole text is consumed as the "first sentence".
            return label, remainder.strip()
    return "Unknown", reasoning


def _dump_json(records, output_path):
    """Write ``records`` to ``output_path`` as UTF-8 JSON, creating the parent directory."""
    parent = os.path.dirname(output_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


def process_training_data(input_path, train_output_path, test_output_path, test_size=0.2):
    """Flatten per-article claims into labelled records and write a stratified split.

    The input JSON is a list of articles. Each article must provide the keys
    "article_id", "claims", "reasonings", "contradicting_claims" and
    "contradicting_reasonings". Claims are paired with reasonings by position;
    ``zip`` silently drops unpaired trailing items when the lists differ in
    length.

    Every record has the keys "article_id", "claim_id" (1-based running index
    over the whole file), "claim", "agreement" ("Evet", "Hayır" or "Unknown")
    and "reasoning".

    Args:
        input_path: Path of the JSON file to read.
        train_output_path: Destination of the training split (JSON).
        test_output_path: Destination of the testing split (JSON).
        test_size: Passed through to ``train_test_split``.

    Raises:
        KeyError: If an article lacks one of the required keys.
        Whatever ``train_test_split`` raises when the stratified split is not
        feasible (presumably when a label has too few members; see the
        scikit-learn documentation).
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    flattened_claims = []

    for article in data:
        article_id = article["article_id"]

        # Regular claims come first, then contradicting claims, so claim_id
        # numbering keeps the same order within each article.
        claim_groups = (
            (article["claims"], article["reasonings"]),
            (article["contradicting_claims"], article["contradicting_reasonings"]),
        )

        for claims, reasonings in claim_groups:
            for claim, reasoning in zip(claims, reasonings):
                claim = _collapse_whitespace(claim)
                agreement, reasoning = _split_agreement(_collapse_whitespace(reasoning))

                flattened_claims.append({
                    "article_id": article_id,
                    "claim_id": len(flattened_claims) + 1,
                    "claim": claim,
                    "agreement": agreement,
                    "reasoning": reasoning
                })

    # Stratify on the agreement label so both splits keep the label balance.
    agreements = [claim["agreement"] for claim in flattened_claims]
    train_data, test_data = train_test_split(flattened_claims, test_size=test_size, stratify=agreements, random_state=42)

    _dump_json(train_data, train_output_path)
    _dump_json(test_data, test_output_path)

    print(f"Training data saved to {train_output_path}")
    print(f"Testing data saved to {test_output_path}")

In [24]:
# Function to merge test files, shuffle, and save filtered claims
def merge_test_files(columnists, merged_output_path, evet_output_path,
                     base_dir="../../../finetune_data/claim_reasoning", seed=None):
    """Merge the per-columnist test files, shuffle them, and save an "Evet"-only subset.

    For each columnist the file ``<base_dir>/<columnist>/<columnist>_test.json``
    is loaded and every record is tagged with a "claim_owner" key holding the
    columnist name. The combined list is shuffled and written to
    ``merged_output_path``. The records whose "agreement" equals "Evet" are
    written, in the same shuffled order, to ``evet_output_path``.

    Args:
        columnists: Iterable of columnist names (used as directory and file prefix).
        merged_output_path: Destination of the merged, shuffled test set (JSON).
        evet_output_path: Destination of the "Evet"-only subset (JSON).
        base_dir: Directory that holds one sub-directory per columnist.
        seed: Optional shuffle seed. ``None`` (the default) shuffles with the
            global ``random`` state, as before; any other value makes the
            order reproducible without touching the global state.
    """
    def _save(records, output_path):
        # Create the parent directory when there is one; dirname is "" for a
        # bare file name and os.makedirs("") would raise FileNotFoundError.
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)

    merged_test_data = []

    for columnist in columnists:
        test_file = os.path.join(base_dir, columnist, f"{columnist}_test.json")

        with open(test_file, 'r', encoding='utf-8') as f:
            test_data = json.load(f)

        # Record which columnist each claim came from before the sources are mixed.
        for claim in test_data:
            claim["claim_owner"] = columnist
            merged_test_data.append(claim)

    if seed is None:
        random.shuffle(merged_test_data)
    else:
        random.Random(seed).shuffle(merged_test_data)

    _save(merged_test_data, merged_output_path)
    print(f"Merged test data saved to {merged_output_path}")

    # .get() keeps a record without an "agreement" key from aborting the export.
    evet_claims = [claim for claim in merged_test_data if claim.get("agreement") == "Evet"]
    _save(evet_claims, evet_output_path)
    print(f"Evet-only test data saved to {evet_output_path}")


In [25]:
# Example usage: build the train/test files for every columnist, then merge
# the per-columnist test sets into one shuffled evaluation file.
columnists = ["mehmettezkan", "hilalkaplan", "ismailsaymaz"]
for columnist in columnists:
    input_json = f"../../../columnist_data/claim_reasoning/{columnist}.json"
    train_output_json = f"../../../finetune_data/claim_reasoning/{columnist}/{columnist}_train.json"
    test_output_json = f"../../../finetune_data/claim_reasoning/{columnist}/{columnist}_test.json"
    process_training_data(input_json, train_output_json, test_output_json)

# merge_test_files shuffles with the global `random` module. Seed it here so a
# Restart & Run All regenerates the merged files in the same order (42 matches
# the random_state used for the train/test split).
random.seed(42)

# Merge all test files
merged_test_output_json = "../../../finetune_data/claim_reasoning/all_test.json"
evet_test_output_json = "../../../finetune_data/claim_reasoning/evet_test.json"

merge_test_files(columnists, merged_test_output_json, evet_test_output_json)

Training data saved to ../../../finetune_data/claim_reasoning/mehmettezkan/mehmettezkan_train.json
Testing data saved to ../../../finetune_data/claim_reasoning/mehmettezkan/mehmettezkan_test.json
Training data saved to ../../../finetune_data/claim_reasoning/hilalkaplan/hilalkaplan_train.json
Testing data saved to ../../../finetune_data/claim_reasoning/hilalkaplan/hilalkaplan_test.json
Training data saved to ../../../finetune_data/claim_reasoning/ismailsaymaz/ismailsaymaz_train.json
Testing data saved to ../../../finetune_data/claim_reasoning/ismailsaymaz/ismailsaymaz_test.json
Merged test data saved to ../../../finetune_data/claim_reasoning/all_test.json
Evet-only test data saved to ../../../finetune_data/claim_reasoning/evet_test.json
