<a href="https://colab.research.google.com/github/blue-create/langlens/blob/main/analyses/descriptive_analysis_article_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# packages
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from ast import literal_eval
import plotly.graph_objects as go
import plotly.express as px
import json


In [17]:
# load packages and set directories
import os
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm # for progress bar
import json
import re


In [18]:
# connect with google drive
# Colab-only step: mounts Google Drive at /content/drive; the working
# directory used by the rest of the notebook lives on that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# change cwd
%cd drive/MyDrive/Work/Frontline/data
#%cd /content/drive/MyDrive/data/

[Errno 2] No such file or directory: 'drive/MyDrive/Work/Frontline/data'
/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


### Methods


In [20]:
# Regular expressions for wording related to domestic violence.
# - Raw strings, so that "\(" reaches the regex engine as written instead of
#   being an invalid string escape sequence.
# - All lower-case, because filter_text_by_list lower-cases the article text
#   before searching; a capital letter in a pattern could never match.
comb3 = [
    r"häuslicher? gewalt",
    r"ehedrama",
    r"gewalttätige[rn]? (ex-)?partner(in)?",
    r"gewalttätige[rn]? ehe(partner|mann|frau)?(in)?",
    r"partnerschaftsgewalt",
    r"gewalt in (der )?(\(ex-\))?(ex)?partnerschaft(en)?",
    r"beziehungsgewalt",
    r"beziehungstat",
    r"gewalttätige[rn]? (ex(-)?)?freund(in)?",
    r"beziehungsdrama",
]

In [21]:
# Folder locations, relative to the working directory set above
input_dir = "unzipped"  # folder the XML files are parsed from
FILTERED_PATH = "filtered/filtered_06_15"  # folder for the per-prefix CSVs of keyword-filtered articles

In [22]:
def _find_text(parent, path):
  """Return the text of the first element found at path below parent, or None if there is no such element."""
  elem = parent.find(path)
  return elem.text if elem is not None else None


def parse_xml_file(xml_file, base_dir=None):
  """ function to parse one xml file and return its articles as a list of dicts
  Parameters:
    - xml_file (str): file name of the xml file, relative to base_dir
    - base_dir (str, optional): folder that contains xml_file; defaults to the
      module-level input_dir
  Returns:
    - list of dict: one dict per <artikel> element with the keys artikel_id,
      name, jahrgang, datum, ressort, titel, untertitel (each the element text,
      or None when the element is missing) and text (list of str, one entry per
      <p> element whose text is not None)
  """
  if base_dir is None:
    base_dir = input_dir
  root = ET.parse(os.path.join(base_dir, xml_file)).getroot()

  # Create list for converted output
  json_file = []
  # Loop through each article, get the data and append it to the output list
  for artikel in root.findall('artikel'):
    # Paragraphs of the article body; an article without a text element
    # simply yields an empty list
    text_elem = artikel.find('inhalt/text')
    text = []
    if text_elem is not None:
      text = [p_elem.text for p_elem in text_elem.findall('p') if p_elem.text is not None]

    json_file.append({
      # str() is kept for backward compatibility: a missing id is stored as
      # the string 'None', exactly as before
      'artikel_id': str(_find_text(artikel, 'metadaten/artikel-id')),
      # name is now looked up like every other field, so an article without
      # a source name no longer aborts the whole file
      'name': _find_text(artikel, 'metadaten/quelle/name'),
      'jahrgang': _find_text(artikel, 'metadaten/quelle/jahrgang'),
      'datum': _find_text(artikel, 'metadaten/quelle/datum'),
      'ressort': _find_text(artikel, 'inhalt/titel-liste/ressort'),
      'titel': _find_text(artikel, 'inhalt/titel-liste/titel'),
      'untertitel': _find_text(artikel, 'inhalt/titel-liste/untertitel'),
      'text': text,
    })
  return json_file


In [23]:
def occurs(word, text):
  """ function to check if a regular expression matches anywhere in a text
  Parameters:
    - word (str): regular expression that is searched for
    - text (str): text that is searched in
  Returns:
    - boolean: returns True if word matches somewhere in text, False otherwise
  """
  # re.search stops at the first hit instead of collecting every match
  return re.search(word, text) is not None


In [24]:
def filter_text_by_list(text, list_match):
  """ function to check if any entry of a list of patterns occurs in a text or list of texts
  Parameters:
    - text (str or list of str): text which is checked; a list is joined with
      single spaces before searching
    - list_match (list of str): regular expressions to look for. An entry may
      combine several expressions with " und "; such an entry only counts as a
      match when every one of its parts is found
  Returns:
    - boolean: True if at least one entry of list_match matches, False otherwise
  """
  # A plain string must not go through " ".join(...): that would insert a
  # space between every character and break every multi-letter pattern
  if not isinstance(text, str):
    text = " ".join(text)
  text = text.lower()

  # IGNORECASE: the text is lower-cased above, so a pattern that contains a
  # capital letter could otherwise never match
  return any(
    all(re.search(word, text, flags=re.IGNORECASE) is not None for word in comb.split(" und "))
    for comb in list_match
  )

In [25]:
def check_if_exported(prefix, filtered_path):
  """ function to check if the relevant articles of a journal have been exported already
  Parameters:
    - prefix (str): prefix of the journal that is checked
    - filtered_path (str): folder that holds the exported csv files
  Returns:
    - boolean: returns True if filtered_path contains a csv file whose name up
      to the first "." equals prefix, False otherwise (also when filtered_path
      does not exist)
  """
  # A missing output folder means nothing has been exported yet
  if not os.path.isdir(filtered_path):
    return False
  exported = {name.split(".")[0] for name in os.listdir(filtered_path) if name.endswith(".csv")}
  if prefix in exported:
    print(f"{prefix} csv already exported")
    return True
  return False

### How many German articles in total?

In [26]:
# Source prefixes that are left out of the counts below
# (presumably the sources outside the "DE" subset shown in the funnel - TODO confirm)
prefixes_to_exclude = [
    "AGZ", "ANEW", "APZT", "AWP", "AWPO", "AWPU", "BAZ", "BERN", "BEOB", "BLI",
    "BUND", "BWAI", "DIEW", "DOL", "ELNA", "HZ", "HZO", "KLEI", "KRON", "KUR",
    "LUXT", "LZLZ", "NECH", "NLZ", "NVB", "NVT", "NZZ", "NZZS", "OOEN", "PBN",
    "PRE", "PROF", "RVZ", "SBLI", "SN", "STA", "STG", "TAG", "TAS", "THTA",
    "TITA", "VN", "WEWO", "WZ", "ZSAS",
    "NBPC", "NBPC_part1", "NBPC_part2", "NBPC_part3", "NBPC_part4",
    "FALT", "RTAL",
]

In [27]:
# Source prefixes = the part of each file name in "Raw" before the first ".",
# minus the excluded prefixes.
# dict.fromkeys keeps the listing order but drops repeated stems, so a prefix
# can never be counted twice; empty stems (e.g. from dot-files) are skipped
# because an empty prefix would match every file name.
excluded_prefixes = set(prefixes_to_exclude)
raw_stems = dict.fromkeys(raw_name.split(".")[0] for raw_name in os.listdir("Raw"))
prefixes = [stem for stem in raw_stems if stem and stem not in excluded_prefixes]
# All files in the XML folder; input_dir is the same folder parse_xml_file reads from
all_files = os.listdir(input_dir)

In [28]:
# Sum, over all prefixes, the highest number found at the end of the prefix's
# file names (the part after the last "_", before the first ".").
# NOTE(review): this assumes that number is a running article count - confirm.
total_num = 0
for pre in prefixes:
  # Match on pre + "_" (as the export loop does) so that a prefix which is the
  # start of a longer one, like "AWP" / "AWPO", does not pick up the other's files
  file_numbers = [int(f.split(".")[0].split("_")[-1]) for f in all_files if f.startswith(pre + "_")]
  if not file_numbers:
    # No file for this prefix: report it and move on; max() of an empty
    # list would raise ValueError
    print(pre)
    continue
  total_num += max(file_numbers)

In [29]:
# show the sum computed in the loop above
total_num

50964697

### How many articles after keyword search?

In [None]:
# Keys of the dicts returned by parse_xml_file; used so that a prefix without
# any matching article still produces a CSV with a proper header
ARTICLE_COLUMNS = ["artikel_id", "name", "jahrgang", "datum", "ressort", "titel", "untertitel", "text"]

# The folder listing does not change while the loop runs, so read it once
input_files = os.listdir(input_dir)
# Make sure the output folder exists before checking it or writing to it
os.makedirs(FILTERED_PATH, exist_ok=True)

# looping through all prefixes
for prefix in tqdm(prefixes):
  # skip prefixes whose articles were exported already
  if check_if_exported(prefix, FILTERED_PATH):
    continue

  # list all xmls of a prefix
  xmls = [f for f in input_files if f.startswith(prefix + "_")]

  # collect every article whose text matches one of the keyword patterns
  DV_art = []
  for xml in xmls:
    for art in parse_xml_file(xml):
      if filter_text_by_list(art["text"], comb3):
        DV_art.append(art)

  DV_art = pd.DataFrame(DV_art)
  if DV_art.empty:
    # no match for this prefix: keep the column layout instead of failing
    DV_art = pd.DataFrame(columns=ARTICLE_COLUMNS)
  else:
    # some articles show up multiple times with different id, remove those.
    # The text column holds lists, which are unhashable and make
    # drop_duplicates("text") raise TypeError, so compare them as tuples and
    # keep the first occurrence.
    DV_art = DV_art[~DV_art["text"].map(tuple).duplicated()]

  DV_art.to_csv(FILTERED_PATH+"/"+prefix+".csv")

### How many articles after keyword search contain references to help hotlines/organizations?

In [46]:
from scripts import filtering

In [44]:

# Read every CSV in FILTERED_PATH. The "text" column is passed through
# literal_eval so that each cell is evaluated from its string form back into
# a Python object (the next cell joins it as a list of strings).
dfs = [
    pd.read_csv(
        os.path.join(FILTERED_PATH, csv_name),
        index_col=0,
        converters={"text": literal_eval},
    )
    for csv_name in os.listdir(FILTERED_PATH)
    if csv_name.endswith(".csv")
]

# stack the per-file frames into one frame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

In [52]:
# Join each article's list of paragraphs into a single string.
# Values that already are strings are left untouched, so running this cell a
# second time does not insert a space between every character of the text.
df["text"] = [t if isinstance(t, str) else " ".join(t) for t in df["text"]]

In [56]:
# NOTE(review): regex_filter is defined in the project's scripts.filtering
# module, not in this notebook. Judging by how `filtered` is used in the
# following cells, it presumably returns the rows of df that do NOT reference
# support services - confirm against the module.
filtered=filtering.regex_filter(df,"text", stats=False)

In [61]:
# number of rows of df that are not in filtered (plotted in the pie chart
# as the articles with references to support services)
df.shape[0]-filtered.shape[0]

37966

In [65]:

# Pie chart of the keyword-filtered articles: rows dropped by the regex filter
# are labelled as "with references", the remaining rows as "without references"
n_without_reference = filtered.shape[0]
n_with_reference = df.shape[0] - n_without_reference

fig = px.pie(
    names=[
        "Artikel mit Verweisen zu Unterstützungsdiensten",
        "Artikel ohne Verweisen zu Unterstützungsdiensten",
    ],
    values=[n_with_reference, n_without_reference],
    title='Artikel zu Häuslicher Gewalt (nur Keyword Filtering)',
)
# fully transparent paper and plot background
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
fig.show()

### How many articles after all filtering?

In [42]:

# Read every file in "elinor" whose name starts with "annotation_test_05_22"
# NOTE: this rebinds dfs and df, replacing the keyword-filtered frame loaded earlier
annotation_dir = "elinor"
dfs = [
    pd.read_csv(os.path.join(annotation_dir, annotation_name), index_col=0, sep=",")
    for annotation_name in os.listdir(annotation_dir)
    if annotation_name.startswith("annotation_test_05_22")
]

# stack the per-file frames into one frame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

In [43]:
# (rows, columns) of the combined frame; the row count is the
# "Artikel nach allen Filtern" value used in the funnel diagram
df.shape

(25465, 10)

### Funnel Diagram

In [33]:
# Funnel from the full corpus down to the keyword-filtered articles.
# The counts are hard-coded; 50964697 is the total_num value shown earlier.
corpus_stages = [
    "Alle Artikel",
    "Alle Artikel (DE)",
    "Artikel nach Keyword-Filter",
]
corpus_counts = [60866588, 50964697, 71256]

fig = go.Figure(go.Funnel(y=corpus_stages, x=corpus_counts))
fig.show()

In [34]:
# Funnel from the keyword-filtered articles down to the annotated paragraphs.
# The counts are hard-coded; 25465 is the row count shown by df.shape earlier.
annotation_stages = [
    "Artikel nach Keyword-Filter",
    "Artikel nach allen Filtern",
    "Annotierte Artikel \n (jeweils 1 Paragraph)",
    "Annotierte Paragraphen zu Häuslicher Gewalt",
    "Annotierte Paragraphen der 3 schädlichen Kategorien",
]
annotation_counts = [71256, 25465, 3497, 1268, 127]

fig = go.Figure(go.Funnel(y=annotation_stages, x=annotation_counts))
fig.show()