### Imports and Paths

In [2]:
# imports
import os
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm 
import re


In [3]:
# set paths
# NOTE: the target is a relative path, so this cell depends on the working directory
# at the time it runs (the recorded output shows a location under /content/drive).
%cd drive/MyDrive/Work/Frontline/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [4]:
# connect with google drive
# from google.colab import drive
# drive.mount('/content/drive')

### Methods and Constants

In [5]:
# methods

def filter_strings(text, combination_list):
  """ function to check if any keyword combination occurs in a text or list of texts

  Every entry of combination_list is split on the literal separator " und ";
  the entry counts as a hit when ALL of its parts are found in the lower-cased
  text. A part is found when it occurs as a plain substring or when it matches
  as a regular expression (re.search), so that entries containing an
  alternation such as '(mann|frau)' can match.

  Parameters:
    - text (str or list of str): text which is checked; a list of texts is
      concatenated without separator before matching
    - combination_list (list of str): combinations which are checked; they are
      compared against the lower-cased text, so they should be lower case
  Returns:
    - boolean: True if at least one combination occurs in text, False otherwise
  """

  haystack = "".join(text).lower()

  def part_found(part):
    # a literal hit keeps plain keywords working exactly as before
    if part in haystack:
      return True
    try:
      return re.search(part, haystack) is not None
    except re.error:
      # not a valid regular expression: only the literal comparison applies
      return False

  return any(
      all(part_found(part) for part in comb.split(" und "))
      for comb in combination_list
  )

In [6]:
# Keyword combinations related to domestic violence.
# Parts joined by " und " must all occur in a text (see filter_strings).
# The partner patterns are written as regular expressions; raw strings keep
# the "\s" from being read as a string escape.
comb1 = ['häusliche gewalt',
        'partnerschaftsgewalt',
        'partnergewalt',
        'femizid',
        'beziehungstat',
        'liebesdrama',
        'ehedrama',
        'liebestragödie',
        'eheliche gewalt',
        'ehekrieg',
        'innerfamiliäre gewalt',
        'innerhäusliche gewalt',
        'gewalt und ehe',
        # the comma after this entry was missing, which silently merged it with the next one
        'ehe und hölle',
        r'gewalt und (freund(in)?|partner(in)?|mann|frau)\s',
        r'vergewaltigen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'vergewaltigung und (freund(in)?|partner(in)?|mann|frau)\s',
        r'missbrauch und (freund(in)?|partner(in)?|mann|frau)\s',
        r'missbräuchlich und (freund(in)?|partner(in)?|mann|frau)\s',
        r'gewalttätig und (freund(in)?|partner(in)?|mann|frau)\s',
        r'verletzung und (freund(in)?|partner(in)?|mann|frau)\s',
        r'verletzen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'übergriffig und (freund(in)?|partner(in)?|mann|frau)\s',
        r'drohung und (freund(in)?|partner(in)?|mann|frau)\s',
        r'drohen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'manipulation und (freund(in)?|partner(in)?|mann|frau)\s',
        r'manipulieren und (freund(in)?|partner(in)?|mann|frau)\s',
        r'beleidigen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'beleidigung und (freund(in)?|partner(in)?|mann|frau)\s',
        r'gaslighting und (freund(in)?|partner(in)?|mann|frau)\s',
        r'schlagen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'zwingen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'gezwungen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'zwang und (freund(in)?|partner(in)?|mann|frau)\s',
        r'einsperren und (freund(in)?|partner(in)?|mann|frau)\s',
        r'stalking und (freund(in)?|partner(in)?|mann|frau)\s',
        r'stalken und (freund(in)?|partner(in)?|mann|frau)\s',
        r'kontrollieren und (freund(in)?|partner(in)?|mann|frau)\s',
        r'kontrolle und (freund(in)?|partner(in)?|mann|frau)\s',
        r'isolieren und (freund(in)?|partner(in)?|mann|frau)\s',
        r'isolation und (freund(in)?|partner(in)?|mann|frau)\s',
        r'hauen und (freund(in)?|partner(in)?|mann|frau)\s',
        r'ohrfeige und (freund(in)?|partner(in)?|mann|frau)\s'

        ]

In [7]:
def check_if_complete(prefix):
  """ function to check if the json of a journal contains all available articles

  Reads json/<prefix>.json and compares its number of rows with the expected
  number of articles stored in the module-level dictionary art_per_src.

  Parameters:
    - prefix (str): prefix of the journal that is checked for completeness
  Returns:
    - tuple (boolean, DataFrame)
      - boolean indicates whether or not the json file is complete
      - DataFrame returns the json data if it is complete and None if incomplete
        or if the file could not be read/checked
  """
  try:
    df = pd.read_json(os.path.join("json", prefix + ".json"))
    expected = art_per_src[prefix]
    # compare size of dataframe to number of articles
    if len(df) == expected:
      return (True, df)
    print(f"Number of articles in {prefix} json should be {expected} but is {len(df)}.")
    return (False, None)
  except Exception as err:
    # Exception (not a bare except) so that KeyboardInterrupt/SystemExit still
    # stop a long-running loop; the cause is reported instead of being hidden
    print(f"Error while parsing {prefix}: {err!r}")
    return (False, None)

In [8]:
def check_if_exported(prefix):
  """ function to tell whether a csv for the given journal already exists in "filtered"

  A file counts as an export of the journal when its name ends with ".csv" and
  the part of the name before the first dot equals the prefix.

  Parameters:
    - prefix (str): prefix of the journal that is checked
  Returns:
    - boolean: True if such a csv file is present in the "filtered" directory, False otherwise
  """
  return any(
      name.split(".")[0] == prefix
      for name in os.listdir("filtered")
      if name.endswith(".csv")
  )


### Testing Data Extraction

In [9]:
# create a dictionary saving the number of articles per prefix using the xml names, e.g. MIB_250001_260000.xml

art_per_src={}
# list of all prefixes (entry names in "Raw" up to their first dot)
prefixes= sorted([i.split(".")[0] for i in os.listdir("Raw")])
# list of all xml files
xmls=os.listdir("unzipped")
for prefix in prefixes:
  # second-to-last token of each matching file name after splitting on "_" and "."
  # (the upper article number in MIB_250001_260000.xml); raw string so "\." is a regex escape
  # NOTE(review): startswith(prefix) also matches files whose longer prefix begins with the
  # same characters - confirm that no prefix is the beginning of another one
  n_art=[int(re.split(r"_|\.",xml)[-2]) for xml in xmls if xml.startswith(prefix)]
  # save the largest number (total number of articles of that prefix) or 0 if no xml of that prefix is present
  n_art = max(n_art, default=0)
  art_per_src[prefix]=n_art
# e.g. prefix AAN has 352'684 articles
art_per_src["AAN"]

352684

### Filter based on key words

In this step, all json files are examined. A file is only considered in the filtering if it is complete, i.e. it contains all available articles; otherwise the file is skipped. The filtering is based on a list of word combinations related to domestic violence. If an article contains any of these combinations, it is added to a dictionary. 

Note: If a json file has been previously filtered and its relevant articles were already exported, the file is skipped as well.


In [12]:
# empty dictionary for DV articles: prefix -> list of matching article rows
DV_art={}
for file in tqdm(sorted(os.listdir("json"))):
  # looping through all json files; everything else in the directory is ignored
  if not file.endswith(".json"):
    continue
  prefix=file.split(".")[0]
  # checking if the json file is complete i.e. contains all available articles
  is_complete, df_temp=check_if_complete(prefix)
  # incomplete files and journals that were exported before are skipped
  if not is_complete or check_if_exported(prefix):
    continue
  # register the prefix before looking at the rows, so that a journal without
  # any rows still gets an (empty) entry and therefore an empty csv on export
  DV_art.setdefault(prefix, [])
  # iterate over the rows directly instead of label lookups via range(len(df))
  for _, article in df_temp.iterrows():
    if filter_strings(article["text"],comb1):
      # DV articles are added to the dictionary
      DV_art[prefix].append(article)

100%|██████████| 1/1 [00:12<00:00, 12.96s/it]


### Exporting relevant Articles
In this step all articles that are related to domestic violence are exported as csv, using their prefix as file name. Journals with no relevant articles are exported as well, with an empty csv to keep track of the filtering. 

In [23]:
# write one csv per journal prefix into the "filtered" directory
for key, articles in DV_art.items():
  target = "filtered/" + key + ".csv"
  pd.DataFrame(articles).to_csv(target)