In [1]:
import re
import ast
import json
import os

In [4]:
# Dump the raw model response of every record in one shard for eyeballing.
# The context manager closes the handle when the loop ends (the original
# left it open for the lifetime of the kernel).
with open("v2_narr_raw/v2_sources_summarized__narr__10050_10100.jsonl") as file:
    for line in file:
        article = json.loads(line)
        print(article['response'])

Here is the list of source summaries:

[
    {'Name': 'Court Documents', 'Original Name': 'court documents', 'Narrative function': 'Provides official records of the lawsuit and its withdrawal.', 'Perspective': 'Authoritative', 'Centrality': 'High', 'Justification': 'Court documents are the primary source of information about the lawsuit and its status, making them highly central to the article. Their perspective is authoritative as they provide factual information.'},
    {'Name': 'Tesla', 'Original Name': 'Tesla', 'Narrative function': 'Provides information about the company\'s actions and statements.', 'Perspective': 'Informative', 'Centrality': 'High', 'Justification': 'Tesla is the main subject of the article, and their actions and statements are central to the narrative. Their perspective is informative as they provide information about their own actions.'},
    {'Name': 'Elon Musk', 'Original Name': 'Elon Musk', 'Narrative function': 'Provides quotes and announcements about Tesla

In [17]:
def robust_json(x):
    """Parse ``x`` as JSON, returning ``None`` instead of raising on failure.

    Args:
        x: The text to parse.

    Returns:
        The decoded value, or ``None`` when ``json.loads`` raises. A valid
        JSON ``null`` also yields ``None``, so the two cases cannot be told
        apart by the caller.
    """
    try:
        return json.loads(x)
    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit still propagate and a long parsing loop stays interruptible.
    except Exception:
        return None

def robust_ast(x):
    """Evaluate ``x`` as a Python literal, returning ``None`` on failure.

    Args:
        x: The text to evaluate with ``ast.literal_eval``.

    Returns:
        The evaluated literal, or ``None`` when ``ast.literal_eval`` raises.
        The literal ``None`` itself also yields ``None``.
    """
    try:
        return ast.literal_eval(x)
    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit still propagate and a long parsing loop stays interruptible.
    except Exception:
        return None

def split_curly_braces(input_string):
    """Return the text inside every brace pair that contains no other brace.

    The surrounding ``{`` and ``}`` are not included in the returned strings.
    For nested braces only the innermost groups match, because the captured
    text may contain neither ``{`` nor ``}``.
    """
    return [found.group(1) for found in re.finditer(r'\{([^{}]*)\}', input_string)]

def robust_parser(f_path: str, seen_urls: set):
    """Parse one raw JSONL shard of responses into per-article source lists.

    Every non-blank line of ``f_path`` is decoded as a JSON object that must
    have ``'url'`` and ``'response'`` keys. Only the last chunk of the
    response (the text after its final blank line) is scanned for ``{...}``
    groups. Each group is parsed as JSON first and as a Python literal
    second; groups that parse with neither, or that do not parse to a
    non-empty dict, are skipped.

    The keys of a parsed source are ignored. Its values are relabelled by
    position as ``'Name'``, ``'Original name'``, ``'Narrative function'``,
    ``'Perspective'``, ``'Centrality'`` and ``'Justification'``; values
    beyond the sixth are dropped.

    Args:
        f_path: Path of the JSONL file to read.
        seen_urls: URLs that were already handled. Articles whose URL is in
            the set are skipped, and every newly visited URL is added to it,
            so the caller's set is mutated.

    Returns:
        A list of ``{'url': ..., 'sources': [...]}`` dicts. An article is
        included only if at least one source parsed. Sources holding any
        non-string value are then removed, which can leave an included
        article with an empty ``'sources'`` list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'url'`` or ``'response'``.
    """
    field_names = (
        'Name',
        'Original name',
        'Narrative function',
        'Perspective',
        'Centrality',
        'Justification',
    )

    res = []
    # Context manager so the handle is closed even if a line fails to decode.
    with open(f_path) as file:
        for line in file:
            # A blank line (e.g. a trailing newline) carries no record.
            if not line.strip():
                continue

            article = json.loads(line)
            url = article['url']
            if url in seen_urls:
                continue
            seen_urls.add(url)

            raw_sources = split_curly_braces(article['response'].split("\n\n")[-1])

            parsed_sources = []
            for raw_source in raw_sources:
                # split_curly_braces strips the braces; put them back to parse.
                text = "{" + raw_source + "}"

                parsed = robust_json(text) or robust_ast(text)
                # literal_eval can yield a set for text like "{'a', 'b'}",
                # which has no key/value pairs to relabel.
                if not parsed or not isinstance(parsed, dict):
                    continue

                # zip stops at the shorter input: missing trailing fields are
                # simply absent and surplus values are ignored.
                parsed_sources.append(dict(zip(field_names, parsed.values())))

            if parsed_sources:
                res.append({'url': url, 'sources': parsed_sources})

    # Drop sources with any non-string value. A new list is built instead of
    # calling remove() on the list being iterated, which skipped the element
    # after each removal and could raise ValueError on a second removal.
    for article in res:
        article['sources'] = [
            source
            for source in article['sources']
            if all(isinstance(value, str) for value in source.values())
        ]

    return res

In [18]:
# Parse a single raw shard, starting from an empty set of already-seen URLs.
f_name = "v2_narr_raw/v2_sources_summarized__narr__10050_10100.jsonl"
res = robust_parser(f_path=f_name, seen_urls=set())

In [19]:
# Sanity check: number of parsed articles and the type of one entry.
print(len(res), type(res[0]), sep="\n")

48
<class 'dict'>


In [20]:
# Write the parsed articles to disk as indented JSON for manual inspection.
with open('test.json', 'w') as f:
    f.write(json.dumps(res, indent=4))

In [22]:
# Gather the URL of every article in the parsed test and train sets.
test_path = os.path.join("v2_info_parsed", "v2_test_set.json")
train_path = os.path.join("v2_info_parsed", "v2_train_set.json")

# Context managers close each handle as soon as its JSON is loaded (the
# original left both files open for the lifetime of the kernel).
with open(test_path) as test_f:
    test_data = json.load(test_f)

with open(train_path) as train_f:
    train_data = json.load(train_f)

# A set, so a URL present in both splits is only counted once.
all_urls = set()
for article in test_data:
    all_urls.add(article['url'])

for article in train_data:
    all_urls.add(article['url'])

In [23]:
# Number of distinct article URLs across the test and train sets.
print(len(all_urls))

58098


In [10]:
from datasets import load_from_disk
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import json
import logging
import os
import sys
from io import StringIO


# Load at most the first 100 source-scored records, plus the coref-resolved
# article dataset from disk.
source_df = pd.read_json("../data/full-source-scored-data.jsonl", nrows=100, lines=True)
article_d = load_from_disk('../all-coref-resolved')

# Restrict the article dataset to URLs that occur in the loaded source records.
a_urls_lookup = set(source_df['article_url'])
filtered_article_d = article_d.filter(lambda x: x['article_url'] in a_urls_lookup, num_proc=10)

# Join the retained articles with their source records on the shared URL column.
all_articles = filtered_article_d.to_pandas().merge(source_df, on='article_url')


Filter (num_proc=10): 100%|██████████| 496380/496380 [01:49<00:00, 4539.94 examples/s]
