# Purpose

This notebook documents the steps we took to process the raw data, which is delivered as zipped XML files.

## Connect with Google drive to access data

To access the data, you first need to add a shortcut to the shared data folder in your own Google Drive. If you have been granted editing rights, you will be able to edit the contents of the folder, i.e. add, move and delete data, create and rename folders, etc.

In [1]:
# Connect this Colab session with Google Drive: drive.mount makes the drive
# available under /content/drive (see the "Mounted at" output below)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Redirect the working directory of this notebook to the data folder, so that
# the relative paths used in the following cells resolve inside it
# (%cd is an IPython magic, not plain Python)
%cd /content/drive/MyDrive/data/
# Disabled alternative path to a data folder, kept for reference
#%cd /content/drive/MyDrive/Work/Frontline/data

/content/drive/MyDrive/data


In [3]:
import os

# Number of entries (os.listdir returns files and sub-folders alike) directly
# inside the current working directory
folder_entries = os.listdir(os.curdir)
num_files = len(folder_entries)
print("Number of files in the folder: ", num_files)

Number of files in the folder:  6


## Unzip the files 

The first step is to unzip the files in the "Raw" folder, extracting the XML files they contain into a separate folder called "unzipped". Next, we check whether any zip file in the "Raw" folder is missing a corresponding XML file in the "unzipped" folder.

In [None]:
# Extract the contents of every zip archive in the raw folder

import zipfile
import shutil

# Folder holding the zipped raw data
raw_folder = "./Raw"

# Folder the archive contents are extracted into
unzipped_folder = "./unzipped"

for filename in os.listdir(raw_folder):
    # Entries without a .zip extension are ignored silently
    if not filename.endswith(".zip"):
        continue

    zip_path = os.path.join(raw_folder, filename)

    # A .zip extension does not guarantee a readable archive: report and skip
    if not zipfile.is_zipfile(zip_path):
        print(f"File {filename} is not a valid zip file and will not be extracted.")
        continue

    # Unpack everything the archive contains into the unzipped folder
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(unzipped_folder)

In [None]:
# Check whether every zip file in the raw folder has at least one extracted
# xml file in the unzipped folder; zip files without one are reported.

# List the unzipped folder once up front: its content does not change while
# we loop over the zip files, so there is no need to re-read it per zip file
xml_filenames = [f for f in os.listdir(unzipped_folder) if f.endswith(".xml")]

# Loop through all the files in the Raw folder
for filename in os.listdir(raw_folder):
    if not filename.endswith(".zip"):
        continue

    # Get the first part of the filename before the .zip extension
    name = filename.split(".")[0]

    # An xml file belongs to this zip file if it is called "<name>.xml" or
    # "<name>_....xml". Requiring the "_" separator (the same separator the
    # conversion step splits on) keeps a short name such as "AA" from being
    # satisfied by the files of another source such as "AAN_...".
    has_xml = any(
        xml_filename == f"{name}.xml" or xml_filename.startswith(f"{name}_")
        for xml_filename in xml_filenames
    )

    if not has_xml:
        # A corresponding XML file doesn't exist
        print(f"No corresponding XML file exists for {filename}")

## Convert xml files to json files 

For further analysis, we decided to convert the xml files to json files for the following reasons: 

- we are dealing with big datasets 
- CSVs are slow to query and difficult to store efficiently 
- JSON supports more complex data structures

In [5]:
# load packages and set directories 
import os 
import pandas as pd
import xml.etree.ElementTree as ET 
from tqdm import tqdm # for progress bar 
import json

# Set up paths for input and output directories (relative to the working directory)
# input_dir: folder with the xml files that the conversion loop reads
input_dir = "unzipped"
# output_dir: folder the conversion loop writes the json files to
output_dir = "json2"


In [None]:
# Convert the xml files in input_dir to json files in output_dir.
#
# Every xml file contributes one dict per 'artikel' element. All xml files
# sharing the same name prefix (the part before the first "_") are collected
# in one json file "<prefix>.json", which holds a list of these dicts. The json
# file is rewritten after every xml file, so an interrupted run leaves complete
# data for all xml files processed so far.

# Position (in the alphabetically sorted listing of input_dir) of the first
# file to process. 0 converts everything; a larger value resumes a previous
# run that already handled the files before that position.
start_index = 695


def _find_text(parent, path):
    """Return the text of the first element found at `path` below `parent`.

    Returns None if no such element exists (or if the element has no text).
    """
    elem = parent.find(path)
    return elem.text if elem is not None else None


def _artikel_to_dict(artikel):
    """Collect metadata, titles and paragraph texts of one 'artikel' element.

    Missing elements are stored as None; 'text' is the list of the non-empty
    direct texts of the 'p' elements below 'inhalt/text' (an empty list if the
    article has no 'inhalt/text' element).
    """
    text_elem = artikel.find('inhalt/text')
    # Explicit check instead of a bare "except: pass", so that unrelated
    # errors are no longer silently swallowed
    p_elems = text_elem.findall('p') if text_elem is not None else []

    return {
        # Static data
        'artikel_id': _find_text(artikel, 'metadaten/artikel-id'),
        'name': _find_text(artikel, 'metadaten/quelle/name'),
        'jahrgang': _find_text(artikel, 'metadaten/quelle/jahrgang'),
        'datum': _find_text(artikel, 'metadaten/quelle/datum'),
        # Variable data
        'ressort': _find_text(artikel, 'inhalt/titel-liste/ressort'),
        'titel': _find_text(artikel, 'inhalt/titel-liste/titel'),
        'untertitel': _find_text(artikel, 'inhalt/titel-liste/untertitel'),
        # Only keep paragraphs that actually carry text
        'text': [p_elem.text for p_elem in p_elems if p_elem.text is not None],
    }


# Make sure the output folder exists before the first json file is written
os.makedirs(output_dir, exist_ok=True)

# The json data of the prefix currently being processed is kept in memory, so
# the (large) json file is only read from disk when the prefix changes instead
# of once per xml file
cached_prefix = None
json_data = []

# Sort first, THEN skip the already processed files: os.listdir returns the
# entries in arbitrary order, so slicing before sorting would skip an
# arbitrary set of files
for xml_file in tqdm(sorted(os.listdir(input_dir))[start_index:]):
    # Only xml files are converted
    if not xml_file.endswith(".xml"):
        continue

    # Parse the xml file and convert each article to a dict
    root = ET.parse(os.path.join(input_dir, xml_file)).getroot()
    json_file = [_artikel_to_dict(artikel) for artikel in root.findall('artikel')]

    # Extract the prefix of the file name and derive the output file name
    prefix = xml_file.split("_")[0]
    output_file = os.path.join(output_dir, f"{prefix}.json")

    if prefix != cached_prefix:
        # New prefix: continue from the existing json file if there is one,
        # otherwise start with an empty list
        if os.path.exists(output_file):
            with open(output_file, "r", encoding="utf-8") as f:
                json_data = json.load(f)
        else:
            json_data = []
        cached_prefix = prefix

    # Append the articles of this xml file to the json data of its prefix
    json_data.extend(json_file)

    # Write the updated JSON data to the output file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f)
  


  0%|          | 1/5239 [01:20<117:31:27, 80.77s/it]

### Testing Data Extraction

In [None]:
import re

In [None]:
# Dictionary mapping each prefix to its number of articles (filled in the next cell)
art_per_src={}

In [None]:
# Create a dictionary saving the number of articles per prefix, using the xml
# file names, e.g. MIB_250001_260000.xml. The second number in a file name is
# read as the number of the last article in that file, so the largest such
# number of a prefix is taken as its total number of articles
# (NOTE: this relies on the file naming scheme - confirm it holds for all sources).

# list of all prefixes
prefixes = sorted(i.split(".")[0] for i in os.listdir("Raw"))
# list of all xml files (other entries cannot be parsed for an article number)
xmls = [xml for xml in os.listdir(input_dir) if xml.endswith(".xml")]
for prefix in prefixes:
    # last article number of every xml file of this prefix; matching on
    # prefix + "_" keeps e.g. "AA" from also picking up the "AAN_..." files
    n_art = [
        int(re.split(r"_|\.", xml)[-2])
        for xml in xmls
        if xml.startswith(prefix + "_")
    ]
    # save the largest number (total number of articles of that prefix) or 0 if no xml of that prefix present
    n_art = max(n_art, default=0)
    art_per_src[prefix] = n_art
# e.g. prefix AAN has 352'684 articles
art_per_src["AAN"]

352684

In [None]:
# Open every json file written by the conversion step and check that it
# contains all articles, i.e. as many entries as art_per_src expects for its
# prefix. The files are listed in and read from the same folder (output_dir).
for json_file in tqdm(os.listdir(output_dir)):
    if not json_file.endswith(".json"):
        continue

    prefix = json_file.split(".")[0]

    # A json file without a known prefix cannot be checked: report it instead
    # of aborting the whole loop with a KeyError
    expected = art_per_src.get(prefix)
    if expected is None:
        print(f"No expected number of articles known for {prefix}, skipping.")
        continue

    # open json file
    df = pd.read_json(os.path.join(output_dir, json_file))
    # compare size of dataframe to number of articles
    if len(df) != expected:
        print(f"Number of articles in {prefix} json should be {expected} but is {len(df)}.")

100%|██████████| 2/2 [00:08<00:00,  4.13s/it]


things to do: 
- convert xml to json file
- do some descriptive analysis: number of articles per newspaper, number of newspapers, number of topics per newspaper, etc. 
- do filtering: German newspapers only, DA-related topics only 
- topic analysis: run the filtered dataset through a generic topic model 