In [1]:
import pandas as pd
import json

In [2]:
# Load the raw article export. Later cells read the ARTICLE_ID, ARTICLE_TITLE
# and COMPONENTS columns from this frame.
ARTICLES_CSV = 'schibsted_articles_uib_media_futures_2024.csv'
df = pd.read_csv(ARTICLES_CSV)

In [None]:
def _stripped_value(node):
    """Return ``node['value']`` stripped of whitespace, or ``""`` if it is unusable."""
    if not isinstance(node, dict):
        return ""
    value = node.get('value')
    return value.strip() if isinstance(value, str) else ""


def _as_list(value):
    """Return ``value`` unchanged when it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def extract_combined_text(component):
    """Flatten an article's JSON component list into one plain-text string.

    Parameters
    ----------
    component : str
        JSON document expected to decode to a list of component dicts.
        Anything that cannot be decoded (including non-string values such
        as a float NaN) yields ``""``.

    Returns
    -------
    str
        The collected entries joined by a blank line (``"\\n\\n"``):

        * ``type == 'text'``: every non-empty paragraph of block type
          ``'paragraph'`` becomes an entry. A ``'heading'`` paragraph is
          held back and emitted as ``||heading||`` on the line above the
          next non-empty paragraph, then cleared. A pending heading is
          overwritten by a later heading and is dropped if no paragraph
          follows it; it is not reset between components.
        * ``type == 'fact'``: skipped entirely when the lower-cased title
          contains one of the anonymous-sources phrases. Otherwise the
          unordered-list item values and paragraph texts are joined with a
          single space and wrapped as ``{{ ... }}``.

        Returns ``""`` when the input is not valid JSON or does not decode
        to a list.

    Notes
    -----
    Malformed fragments (JSON ``null`` where an object is expected, list
    items without a ``"value"`` key, non-string values, non-dict entries)
    are skipped individually instead of raising or discarding the whole
    article.
    """
    # Only the decoding step is guarded, so a malformed fragment further
    # down can no longer blank out an otherwise usable article.
    try:
        component_list = json.loads(component)
    except (json.JSONDecodeError, TypeError):
        return ""

    if not isinstance(component_list, list):
        return ""

    combined_text = []
    current_subtitle = ""

    for item in component_list:
        if not isinstance(item, dict):
            continue
        item_type = item.get('type')

        if item_type == 'text':
            for paragraph in _as_list(item.get('paragraphs')):
                if not isinstance(paragraph, dict):
                    continue
                text_value = _stripped_value(paragraph.get('text'))
                block_type = paragraph.get('blockType')

                if block_type == 'heading':
                    # Remember the heading; it is attached to the next paragraph.
                    current_subtitle = text_value
                elif block_type == 'paragraph' and text_value:
                    if current_subtitle:
                        paragraph_text = f"||{current_subtitle}||\n{text_value}"
                        current_subtitle = ""  # A heading is used at most once.
                    else:
                        paragraph_text = text_value
                    combined_text.append(paragraph_text)

        elif item_type == 'fact':
            title = _stripped_value(item.get('title')).lower()
            if "og anonyme kilder" in title or "bruk av anonyme kilder i" in title:
                continue  # Skip these kinds of fact boxes

            fact_content = []
            for sub_item in _as_list(item.get('paragraphs')):
                if not isinstance(sub_item, dict):
                    continue
                block_type = sub_item.get('blockType')

                if block_type == 'list:unordered':
                    for list_item in _as_list(sub_item.get('items')):
                        list_value = _stripped_value(list_item)
                        if list_value:
                            fact_content.append(list_value)
                elif block_type == 'paragraph':
                    text_value = _stripped_value(sub_item.get('text'))
                    if text_value:
                        fact_content.append(text_value)

            if fact_content:
                fact_text = ' '.join(fact_content)
                combined_text.append(f"{{{{ {fact_text} }}}}")

    # Every collected entry is non-empty, so a plain join is sufficient.
    return '\n\n'.join(combined_text)

In [4]:
# Build one output record per CSV row. 'oral_version' is an empty
# placeholder; this cell never fills it in.
processed_articles = [
    {
        'unique_id': row["ARTICLE_ID"],
        'title': row["ARTICLE_TITLE"],
        'original_text': extract_combined_text(row["COMPONENTS"]),
        'oral_version': "",  # Placeholder
    }
    for _row_label, row in df.iterrows()
]


In [5]:

# Serialise the records as JSON Lines: one JSON object per line, UTF-8,
# with non-ASCII characters written as-is rather than \u-escaped.
# NOTE(review): json.dumps emits a bare NaN token for float NaN values
# (e.g. a missing title cell), which strict JSON parsers reject; confirm
# the input has no missing values or clean them before writing.
OUTPUT_PATH = 'articles_preprocessed.jsonl'

with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in processed_articles
    )

print("File successfully written")

File successfully written
