# Purpose

This notebook documents the steps we took to process the raw data, which is delivered as zipped XML files.

### Connect with Google drive to access data

In order to access the data, you first need to create a shortcut to the data folder in your own Google Drive. If you have been granted editing rights, you can edit the contents of the folder, i.e. add, move and delete data, create and rename folders, etc.

In [11]:
# Connect with Google Drive: mount the drive at /content/drive so that the
# data folder can be reached through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Make the data folder the working directory: every relative path used below
# ("Raw", "unzipped", FILTERED_PATH) is resolved against it.
# Alternative location of the data folder, kept for reference:
#%cd /content/drive/MyDrive/data/
%cd /content/drive/MyDrive/Work/Frontline/data

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [6]:
import os
# Count the entries of the "Raw" folder (relative to the working directory).
# os.listdir() returns every entry, so files of any type are included in
# the count.
num_files = len(os.listdir("Raw"))
print("Number of files in the folder: ", num_files)

Number of files in the folder:  257


## Unzip the files 

The first step is to unzip the files and to place the extracted XML files in a separate folder called "unzipped". Next, we check whether any zip file in the "Raw" folder is missing a corresponding XML file in the "unzipped" folder.

In [None]:
# unzip all files

import zipfile
import shutil

# Path to the raw folder
raw_folder = "./Raw"

# Path to the unzipped folder
unzipped_folder = "./unzipped"

# Loop through all the files in the raw folder. The listing is sorted because
# os.listdir() returns entries in arbitrary order; a fixed order makes the
# run reproducible.
for filename in sorted(os.listdir(raw_folder)):
    if not filename.endswith(".zip"):
        continue

    zip_path = os.path.join(raw_folder, filename)

    # Check if the file is a valid zip file
    if not zipfile.is_zipfile(zip_path):
        print(f"File {filename} is not a valid zip file and will not be extracted.")
        continue

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        for member in zip_ref.infolist():
            target = os.path.join(unzipped_folder, member.filename)
            # Skip members that are already present with their full
            # uncompressed size, so that re-running the cell only extracts
            # what is missing or was left incomplete by an interrupted run.
            already_extracted = (
                not member.is_dir()
                and os.path.isfile(target)
                and os.path.getsize(target) == member.file_size
            )
            if not already_extracted:
                zip_ref.extract(member, unzipped_folder)

In [None]:
# check whether a zip file is missing a xml file

# List the extracted XML files once instead of once per zip file.
xml_filenames = [
    entry for entry in os.listdir(unzipped_folder) if entry.endswith(".xml")
]

# Loop through all the files in the Raw folder
for filename in os.listdir(raw_folder):
    if filename.endswith(".zip"):
        # Get the first part of the filename before the .zip extension
        name = filename.split(".")[0]

        # Require a separator right after the name. A bare startswith(name)
        # would let the files of a longer prefix (e.g. "NZZS_...") count as
        # files of a shorter one (e.g. "NZZ").
        expected_starts = (name + "_", name + ".")

        if not any(entry.startswith(expected_starts) for entry in xml_filenames):
            # A corresponding XML file doesn't exist
            print(f"No corresponding XML file exists for {filename}")

## Reading XMLS and filtering
In this step we loop through all prefixes. Before the XML files of a prefix are parsed, we check whether a CSV file for that prefix already exists. If not, all XML files with that prefix are read and filtered: articles that contain words or phrases associated with domestic violence are exported to a CSV file named after the prefix.

#### Constants

In [1]:
# Folder (relative to the working directory) in which the filtered articles
# are stored as one CSV file per journal prefix.
# Alternative folder name, kept for reference:
#FILTERED_PATH="filtered2"
FILTERED_PATH="filtered_4_20"

In [20]:
# Keyword combinations, variant 2 (participle-oriented trigger words).
#
# Every entry is a regular expression that is matched against lower-cased
# article text. Inside an entry, " und " separates sub-patterns that must ALL
# occur somewhere in the text (this is how filter_strings() splits entries).
#
# All patterns are raw strings: in a normal string literal "\b" is the
# backspace character, not the regex word boundary, and "\s" is an invalid
# escape sequence.
comb2 = [
    r"häuslicher? gewalt",
    r"partnerschaftsgewalt",
    r"partnergewalt",
    r"femizid",
    r"beziehungstat",
    r"liebesdrama",
    r"ehedrama",
    r"liebestragödie",
    r"eheliche gewalt",
    r"ehekrieg",
    r"innerfamiliäre gewalt",
    r"innerhäusliche gewalt",
    r"\bgewalt\b und \behe\b",
    r"\behe\b und hölle",
    # Trigger word combined with a mention of a partner. The trailing \s
    # requires whitespace directly after the partner word.
    *(
        trigger + r" und (freund(in)?|partner(in)?|mann|frau)\s"
        for trigger in (
            r"vergewaltigt",
            r"missbraucht",
            r"übergriffig",
            r"gewalttätig",
            # r"verletzt",
            r"\b(ge)?schlagen\b",
            r"missbrauchen",
            r"missbrauch",
            r"verletzung",
            r"gedroht",
            r"gezwungen",  # was listed twice; one entry is sufficient
            r"bedroht",
            r"beleidigt",
            r"beleidigung",
            r"gaslighting",
            r"manipuliert",
            r"manipulieren",
            r"eingesperrt",
            r"gestalkt",
            r"einsperren",
            r"stalken",
            r"kontrollieren",
            r"kontrolliert",
            r"isolieren",
            r"isoliert",
            r"\bhauen\b",
            r"ohrfeige",
        )
    ),
]

In [21]:
# Keyword combinations, variant 1 (noun/infinitive-oriented trigger words).
#
# Every entry is a regular expression that is matched against lower-cased
# article text. Inside an entry, " und " separates sub-patterns that must ALL
# occur somewhere in the text (this is how filter_strings() splits entries).
#
# All patterns are raw strings: in a normal string literal "\b" is the
# backspace character, not the regex word boundary, and "\s" is an invalid
# escape sequence.
comb1 = [
    r"häuslicher? gewalt",
    r"partnerschaftsgewalt",
    r"partnergewalt",
    r"femizid",
    r"beziehungstat",
    r"liebesdrama",
    r"ehedrama",
    r"liebestragödie",
    r"eheliche gewalt",
    r"ehekrieg",
    r"innerfamiliäre gewalt",
    r"innerhäusliche gewalt",
    r"gewalt und \behe\b",
    # A comma was missing after this entry, which silently merged it with
    # the following pattern into a single, useless one.
    r"\behe\b und hölle",
    # Trigger word combined with a mention of a partner. The trailing \s
    # requires whitespace directly after the partner word.
    *(
        trigger + r" und (freund(in)?|partner(in)?|mann|frau)\s"
        for trigger in (
            r"gewalt",
            r"vergewaltigen",
            r"vergewaltigung",
            r"missbrauch",
            r"missbräuchlich",
            r"gewalttätig",
            r"verletzung",
            r"verletzen",
            r"übergriffig",
            r"drohung",
            r"drohen",
            r"manipulation",
            r"manipulieren",
            r"beleidigen",
            r"beleidigung",
            r"gaslighting",
            r"\b(ge)?schlagen\b",
            r"zwingen",
            r"gezwungen",
            r"zwang",
            r"einsperren",
            r"stalking",
            r"stalken",
            r"kontrollieren",
            r"kontrolle",
            r"isolieren",
            r"isolation",
            r"\bhauen\b",
            r"ohrfeige",
        )
    ),
]

In [32]:
# Keyword combinations, variant 3 (the set passed to filter_strings() below).
#
# Every entry is a regular expression; " und " separates sub-patterns that
# must all occur in the text. The patterns are matched against lower-cased
# text, so they have to be written in lower case themselves. Raw strings keep
# regex escapes such as "\(" intact.
comb3 = [
    r"häuslicher? gewalt",
    r"ehedrama",
    r"familienstreit und partner(in)?",
    r"gewalttätige[rn]? (ex-)?partner(in)?",
    # was written with a capital "Ehe" and therefore never matched -> ok
    r"gewalttätige[rn]? ehe(partner|mann|frau)?(in)?",
    r"partnerschaftsgewalt",  # -> good
    r"femizid",  # -> ok
    r"gewalt in (der )?(\(ex-\))?(ex)?partnerschaft(en)?",
    r"beziehungsgewalt",  # -> good
    r"beziehungstat",
    r"gewalttätige[rn]? (ex(-)?)?freund(in)?",
    r"beziehungsdrama",  # -> good
    # Candidates that were evaluated and rejected:
    # gewalt gegen (männer|frauen) -> not good, politics
    # partnergewalt c häusliche gewalt
    # bluttat und partner(in)? -> not
    # frauenhaus -> ok
    # liebesdrama, tragödie -> not good, books
    # eheliche gewalt -> no hits
    # familiäre gewalt -> not good
    # ehekrieg -> not good
    # beziehungskrieg -> not good
]

# Patterns that disqualify an article even if one of the patterns above hits.
comb_no_match = [r"filme?", r"putin", r"schauspielbühnen?"]

### Filtering and Exporting

#### Methods
For this step, the following methods are defined:
* parsing xml files
* testing if csv was exported already
* checking if text contains DV-related words


In [3]:
def parse_xml_file(xml_file, base_dir=None):
  """ function to parse one xml file and return its articles as a list of dicts
  Parameters:
    - xml_file (str): file name of the xml file inside the input folder
    - base_dir (str, optional): folder that contains xml_file; if omitted,
      the module-level input_dir is used
  Returns:
    - list of dict: one dict per <artikel> element with the keys artikel_id,
      name, jahrgang, datum, ressort, titel, untertitel (each a str, or None
      if the element is missing) and text (list of str, one entry per
      non-empty <p> element; empty list if the article has no text)
  Notes:
    - artikel_id is always passed through str(), so a missing id is stored
      as the string "None"
    - errors raised by ET.parse (missing or malformed file) are not caught
  """
  folder = input_dir if base_dir is None else base_dir
  tree = ET.parse(os.path.join(folder, xml_file))
  # Get the root
  root = tree.getroot()

  def _text_of(parent, path):
    # Text of the first element matching path, or None if there is none
    elem = parent.find(path)
    return elem.text if elem is not None else None

  # Create list for converted output
  json_file = []
  # Loop through each article, get the data and append it to the output list
  for artikel in root.findall('artikel'):
    # Collect the text of all direct <p> children of <inhalt>/<text>.
    # Paragraphs without text are skipped; an article without a <text>
    # element simply gets an empty list.
    text = []
    text_elem = artikel.find('inhalt/text')
    if text_elem is not None:
      text = [p_elem.text for p_elem in text_elem.findall('p')
              if p_elem.text is not None]

    # The key order determines the column order of the exported csv files
    json_file.append({
        'artikel_id': str(_text_of(artikel, 'metadaten/artikel-id')),
        # a missing <name> used to raise AttributeError; it is now None
        'name': _text_of(artikel, 'metadaten/quelle/name'),
        'jahrgang': _text_of(artikel, 'metadaten/quelle/jahrgang'),
        'datum': _text_of(artikel, 'metadaten/quelle/datum'),
        'ressort': _text_of(artikel, 'inhalt/titel-liste/ressort'),
        'titel': _text_of(artikel, 'inhalt/titel-liste/titel'),
        'untertitel': _text_of(artikel, 'inhalt/titel-liste/untertitel'),
        'text': text,
    })
  return json_file


In [37]:
def filter_strings(text, list_occur, list_not_occur):
  """ function to check if a text matches any of a list of keyword combinations
  Parameters:
    - text (str or list of str): text which is checked; a list is joined
      with single spaces before matching
    - list_occur (list of str): regular expressions to look for; inside an
      entry, " und " separates sub-patterns that must all occur in the text
    - list_not_occur (list of str): regular expressions that must not occur
      in the text
  Returns:
    - boolean: True if at least one entry of list_occur matches and no entry
      of list_not_occur matches, False otherwise
  Matching ignores case. The first matching entry of list_occur is printed.
  """
  # A plain string must be wrapped: joining it directly would insert a space
  # between every character and no multi-letter pattern could match.
  if isinstance(text, str):
    text = [text]
  text = " ".join(text).lower()

  def _found(pattern):
    # IGNORECASE lets patterns containing capital letters match the
    # lower-cased text as well
    return re.search(pattern, text, re.IGNORECASE) is not None

  for comb in list_occur:
    if all(_found(part) for part in comb.split(" und ")):
      if any(_found(comb_not) for comb_not in list_not_occur):
        return False
      print(comb)
      return True
  return False

In [5]:
def check_if_exported(prefix):
  """ function to tell whether a csv file for a journal already exists in FILTERED_PATH
  Parameters:
    - prefix (str): prefix of the journal that is looked up
  Returns:
    - boolean: True (after printing a notice) if a csv file whose name up to
      the first dot equals prefix exists in FILTERED_PATH, False otherwise
  """
  exported_prefixes = []
  for entry in os.listdir(FILTERED_PATH):
    # only csv files count as exports; compare on the part before the first dot
    if entry.endswith(".csv"):
      exported_prefixes.append(entry.split(".")[0])

  if prefix not in exported_prefixes:
    return False

  print(f"{prefix} csv already exported")
  return True

In [6]:
def occurs( comb, text):
  """ function to check if a regular expression matches anywhere in a text
  Parameters:
    - comb (str): regular expression to search for
    - text (str): text that is searched
  Returns:
    - boolean: True if comb matches at least once in text, False otherwise
  """
  return re.search(comb, text) is not None


In [7]:
# load packages and set directories 
import os 
import pandas as pd
import xml.etree.ElementTree as ET 
from tqdm import tqdm # for progress bar 
import json
import re

# Set up paths for input and output directories (relative to the working
# directory).
# input_dir is the folder parse_xml_file() reads the xml files from.
input_dir = "unzipped"
# NOTE(review): output_dir is not referenced by any code visible in this
# notebook - confirm whether it is still needed.
output_dir = "json"


#### Filtering on Country
In this step all journals that are not Germany-based are excluded.

In [21]:
# Journal prefixes: the part of every file name in "Raw" before the first
# dot, in alphabetical order.
all_prefixes = sorted(entry.split(".")[0] for entry in os.listdir("Raw"))
# Mapping prefix -> journal name; starts out empty.
all_journals = dict()

In [None]:
# creating a dictionary with all prefixes and their corresponding journal names

# List the input folder once instead of once per prefix.
input_files = os.listdir(input_dir)

for prefix in all_prefixes:
  # look at the xmls of the prefix until one of them yields a journal name
  for xml in input_files:
    if xml.startswith(prefix+"_"):
      articles = parse_xml_file(xml)
      # An xml file without any article cannot provide the name (indexing
      # [0] would raise IndexError), so continue with the next file.
      if articles:
        all_journals[prefix] = articles[0]["name"]
        break

#### Filtering based on key-words

In [38]:
# Prefixes of journals that are excluded from the analysis because they are
# not from Germany (Switzerland, Austria, etc.).
prefixes_to_exclude = [
    "AGZ", "ANEW", "APZT", "AWP", "AWPO", "AWPU", "BAZ", "BERN", "BEOB",
    "BLI", "BUND", "BWAI", "DIEW", "DOL", "ELNA", "HZ", "HZO", "KLEI",
    "KRON", "KUR", "LUXT", "LZLZ", "NECH", "NLZ", "NVB", "NVT", "NZZ",
    "NZZS", "OOEN", "PBN", "PRE", "PROF", "RVZ", "SBLI", "SN", "STA",
    "STG", "TAG", "TAS", "THTA", "TITA", "VN", "WEWO", "WZ", "ZSAS",
]

In [39]:
# all prefixes: the part of every file name in "Raw" before the first dot
prefixes = {entry.split(".")[0] for entry in os.listdir("Raw")}
# remove the non-German journals
# The result is sorted because converting a set to a list yields an arbitrary
# order, which would make any later slicing or resuming of the list differ
# from run to run.
prefixes = sorted(prefixes - set(prefixes_to_exclude))

In [None]:
# looping through all prefixes
# Prefixes that already have a csv file are skipped, so the loop can be
# interrupted and resumed without redoing finished journals.

# The output folder has to exist before check_if_exported() lists it and
# before the csv files are written.
os.makedirs(FILTERED_PATH, exist_ok=True)

# List the input folder once instead of once per prefix.
input_files = os.listdir(input_dir)

for prefix in tqdm(prefixes):
  if check_if_exported(prefix):
    continue

  # list all xmls of a prefix
  xmls = [i for i in input_files if i.startswith(prefix+"_")]
  # create an empty list for the DV-related articles of this prefix
  DV_art = []

  # loop through all xml files
  for xml in xmls:
    # parse each xml
    parsed_xml = parse_xml_file(xml)
    # keep the articles that are associated with DV
    for article in parsed_xml:
      if filter_strings(article["text"], comb3, comb_no_match):
        DV_art.append(article)

  # A csv file is written even if no article matched; it marks the prefix
  # as processed for check_if_exported().
  pd.DataFrame(DV_art).to_csv(os.path.join(FILTERED_PATH, prefix+".csv"))

#### Testing combinations

In [None]:
# Trial run of the keyword filter on the xml files of one journal
# (prefix "BADZ").
xmls = [fname for fname in os.listdir(input_dir) if fname.startswith("BADZ_")]

# articles accepted by the filter are collected here
DV_art = []

for xml in xmls:
  # parse the file and keep every article that passes the filter
  parsed_xml = parse_xml_file(xml)
  DV_art.extend(
      article
      for article in parsed_xml
      if filter_strings(article["text"], comb3, comb_no_match)
  )

In [None]:
# Show how many articles are in DV_art, then display the articles
# themselves (the bare expression on the last line is rendered as the cell
# output).
print(len(DV_art))
DV_art

## to do list:

* **do filtering: DA related topics only**

* do descriptive analysis: number of articles per newspaper, number of newspapers, number of topics per newspaper, etc.

* do a collocation analysis: see GitHub repo "newspaper" under scripts

* topic analysis: run the filtered dataset through a generic topic model
