In [1]:
import json
from sklearn.model_selection import train_test_split
import random

In [3]:
def extract_claims(data, seed=None):
    """Flatten each article's claims into a shuffled list of labelled records.

    Entries under the article's "Claims" key become records with
    ``"reference": "Yes"`` and a ``claim_id`` of the form ``1.<n>``; entries
    under "Contradicting Claims" become records with ``"reference": "No"``
    and a ``claim_id`` of the form ``2.<n>``. ``n`` is the 1-based position of
    the claim in its source list. The combined records are then shuffled
    within each article.

    Args:
        data: Iterable of article dicts. Each must contain "Article Id".
            "Claims" and "Contradicting Claims" may be missing or ``None``;
            either way they contribute no records.
        seed: Optional seed for the shuffle. When given, a private
            ``random.Random(seed)`` is used, so the result is reproducible
            and the module-level generator is left untouched. When ``None``
            (the default) the module-level ``random`` generator is used.

    Returns:
        A list with one ``{"article_id": ..., "claims": [...]}`` dict per
        article, in input order.

    Raises:
        KeyError: If an article has no "Article Id" key.
    """
    # Both random.Random instances and the random module expose .shuffle().
    shuffler = random if seed is None else random.Random(seed)

    # (claim_id prefix, source key in the article, label written to "reference")
    claim_sources = (
        ("1", "Claims", "Yes"),
        ("2", "Contradicting Claims", "No"),
    )

    output = []
    for article in data:
        article_id = article["Article Id"]
        claims_list = []

        for prefix, key, label in claim_sources:
            # `or []` also covers a key that is present but null in the JSON,
            # which .get(key, []) alone would pass through as None.
            for idx, claim in enumerate(article.get(key) or [], start=1):
                claims_list.append({
                    "claim_id": f"{prefix}.{idx}",
                    "claim": claim,
                    "reference": label
                })

        # Mix supporting and contradicting claims so their order carries no label signal.
        shuffler.shuffle(claims_list)

        output.append({
            "article_id": article_id,
            "claims": claims_list
        })

    return output

def read_json_file(file_path):
    """Open ``file_path`` as UTF-8 text and return its parsed JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def write_json_file(data, output_path):
    """Serialize ``data`` as JSON into ``output_path``.

    The file is opened for writing as UTF-8 (replacing any existing content),
    non-ASCII characters are written as-is rather than escaped, and the
    output is indented by four spaces.
    """
    with open(output_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

def split_dataset(data, test_size=0.2, random_state=42):
    """Split the dataset into training and testing sets.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Args:
        data: The dataset to split; passed straight to ``train_test_split``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``
            (default 0.2).
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
            Defaults to 42, the value that was previously hard-coded, so
            existing callers get the same split as before.

    Returns:
        A ``(train_data, test_data)`` tuple.
    """
    train_data, test_data = train_test_split(
        data,
        test_size=test_size,
        random_state=random_state,
    )
    return train_data, test_data

### Convert Claims to JSON

In [9]:
# Specify the input and output file paths
input_file = '../../../columnist_data/claims_json/claims_400.json'
output_file = '../../../columnist_data/claims_json/hilalkaplan_claims.json'

# extract_claims shuffles each article's claims with the `random` module.
# Seed it here so re-running this cell writes the same file every time.
random.seed(42)

# Read the input data
data = read_json_file(input_file)

# Process the data to extract claims
processed_data = extract_claims(data)

# Write the processed data to an output file
write_json_file(processed_data, output_file)

print(f"Processed data has been written to {output_file}")

Processed data has been written to ../../../columnist_data/claims_json/hilalkaplan_claims.json


### Split Dataset

In [6]:
# NOTE(review): the "Convert Claims to JSON" cell writes hilalkaplan_claims.json
# under columnist_data/claims_json/, while this cell reads it from
# finetune_data/hilalkaplan_claims/. Presumably the file is copied by hand in
# between - confirm, otherwise a fresh run may split a stale file.
input_file = '../../../finetune_data/hilalkaplan_claims/hilalkaplan_claims.json'
train_output_file = '../../../finetune_data/hilalkaplan_claims/hilalkaplan_claims_train.json'
test_output_file = '../../../finetune_data/hilalkaplan_claims/hilalkaplan_claims_test.json'

data = read_json_file(input_file)

# Split the data into train and test sets
train_data, test_data = split_dataset(data, test_size=0.2)

# Sanity check: every article must land in exactly one of the two sets.
assert len(train_data) + len(test_data) == len(data), "train/test split lost or duplicated articles"

# Write the train and test datasets to JSON files
write_json_file(train_data, train_output_file)
write_json_file(test_data, test_output_file)

print(f"Training data has been written to {train_output_file}")
print(f"Testing data has been written to {test_output_file}")

Training data has been written to ../../../finetune_data/hilalkaplan_claims/hilalkaplan_claims_train.json
Testing data has been written to ../../../finetune_data/hilalkaplan_claims/hilalkaplan_claims_test.json
