In [None]:
import json
import ast
import re

In [None]:
def read_jsonl(filepath):
    """Read a JSON Lines file and return its parsed records as a list.

    The file is opened as UTF-8 text. Lines that are empty after stripping
    whitespace are skipped. A line that is not valid JSON is reported with
    ``print`` and skipped; it does not abort the read.

    Args:
        filepath: Path of the JSONL file to read.

    Returns:
        A list with one parsed object per valid, non-blank line, in file order.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Whitespace-only line: nothing to parse.
                continue
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {stripped}. Error: {e}")
    return records

def extract_sources(json_string):
    """Extract a list from text that may wrap it in Markdown code fences.

    Steps:
      1. Strip surrounding whitespace and remove Markdown code-fence markers.
      2. Keep everything from the first ``[`` onward.
      3. If there are more ``[`` than ``]``, append the missing ``]``
         characters (a simple count; brackets inside strings are counted too).
      4. Parse as JSON; if that fails, fall back to ``ast.literal_eval`` so
         Python-style lists (e.g. single-quoted strings) are still accepted.

    Args:
        json_string: Raw text expected to contain a list literal.

    Returns:
        The parsed list.

    Raises:
        ValueError: If the text contains no ``[`` or the content cannot be
            parsed by either parser. The underlying parser error is chained.
    """
    cleaned_string = json_string.strip()
    # MULTILINE lets '^' match at every line start, so any fence marker that
    # begins a line (opening or closing) is removed here.
    cleaned_string = re.sub(r'^```[a-zA-Z]*\s*\n?', '', cleaned_string, flags=re.MULTILINE)
    cleaned_string = re.sub(r'\n?```\s*$', '', cleaned_string)

    start_idx = cleaned_string.find('[')
    if start_idx == -1:
        # Without this guard, -1 would be used as a slice index and the
        # parser would be handed only the last character of the text.
        raise ValueError("No '[' found in input; nothing to extract")

    json_content = cleaned_string[start_idx:].strip()

    # Append every missing closing bracket, not just one, so truncated
    # nested lists such as '[[1, 2], [3, 4' can still be recovered.
    missing_brackets = json_content.count('[') - json_content.count(']')
    if missing_brackets > 0:
        json_content += ']' * missing_brackets

    # Prefer a real JSON parser: it understands true/false/null, which
    # ast.literal_eval rejects.
    try:
        return json.loads(json_content)
    except ValueError:
        pass  # json.JSONDecodeError is a ValueError; try the Python-literal parser.

    try:
        return ast.literal_eval(json_content)
    except Exception as exc:
        raise ValueError("Could not parse extracted content as a list") from exc

In [None]:
# Read the two JSONL files and combine their records into one list,
# with the records of the first file ahead of those of the second.
path1 = '../data/sources_1.jsonl'
path2 = '../data/sources_2.jsonl'
file_data1 = read_jsonl(path1)
file_data2 = read_jsonl(path2)

data = [*file_data1, *file_data2]
print(len(data))

In [None]:
# Build one article entry per loaded record: the sources list parsed from the
# first choice's message content, plus the full original record as metadata.
articles = []
failures = 0
for point in data:
    raw_string = point['response']['body']['choices'][0]['message']['content']
    try:
        sources = extract_sources(raw_string)
    except Exception:
        # Best-effort: count records whose content could not be parsed and
        # move on. Catching Exception (not a bare except) keeps a kernel
        # interrupt from being counted as a failed extraction.
        failures += 1
        continue
    news_article = {
        'sources': sources,
        'sources_metadata': point,
    }
    articles.append(news_article)

print("failed extraction: ", failures)
print("len articles: ", len(articles))

In [None]:
# Persist the extracted articles as indented JSON.
output_path = "../data/article_sources.json"
with open(output_path, "w") as out_file:
    json.dump(articles, out_file, indent=4)