# Purpose

This file shows the steps we took to sample and create the annotation dataset.

## Connect with Google drive to access data 

In order to access the data, you first need to create a shortcut of the data folder to your own Gdrive. If you've been granted editing rights, you should be able to edit the content of the folder, i.e. add, move and delete data, create and rename folders, etc.

In [74]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
# redirect the working directory of this script to the data folder
%cd /content/drive/MyDrive/Work/Frontline/data/
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


## Load data

In [76]:
import tqdm as tqdm
from collections import Counter
import os
import pandas as pd
import re 
from ast import literal_eval
import statistics
import matplotlib.pyplot as plt
import re
import pandas as pd

folder_path = "filtered_4_26"

### Method 1: get csv files 

In [77]:

dfs = []

# loop through files 
for filename in os.listdir(folder_path):
    # if csv file, load and add to dfs  
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # import csv with text as list object 
        df = pd.read_csv(file_path, index_col=0, converters={"text":literal_eval})
        dfs.append(df)
# combine files in df
df_filtered = pd.concat(dfs, ignore_index=True)

#### Create a random subset of the data 

In [78]:
# size of subset we want 
number = 5000

In [79]:
df_sample = df_filtered.sample(number,)# random_state=42)

### Method 2: get a csv file

In [80]:
df_sample_file = pd.read_csv('sample.csv', encoding='utf-8', index_col=0)

### Method 3: Manually selected dataset of relevant articles
--> ensuring that the dataset only contains relevant articles, espscially for testing

In [81]:
subset_dv=df_filtered.loc[(11483,14044,62494,49199,11047,14948,10565,31059,58890,20347,55396,56389,5528,18532,59435,8035,27119,12788,59992,21477,10331,26314,45356,61023,31865,48960,44587,17992,14763,60043,20540,4563,13213,6751,43374,41018,38770,24654,21936,29297,1869,33163,60220,61232,57613,48979,33785,51576,8300,7675),:]

### Select Data
--> specify which data set of the three above methods should be used in the following analysis

In [82]:
# uncomment for full data set
df_subset=df_filtered

# uncomment for random data of 100 from full data set
#df_subset=df_sample

# uncomment for sample data set from csv file
# df_subset= df_sample_file

# uncomment for manually selected articles 
# df_subset = subset_dv

## Adjust format for export

### Methods

In [83]:
def reformat_article(art, min_words=5, max_words=125):
  # remove genios styles 
  art = [re.sub(r'<.*?>', '', x) for x in art]

  # remove new line characters and preceeding whitespaces
  art = [x.strip() for x in art]

  #remove empty paragraphs
  art = [x for x in art if x.strip()]

  #remove paragraphs that are too long
  art = [x for x in art if len(x.split()) < max_words]

  #remove paragraphs that are too short, ie. by default 3 or fewer words
  art = [x for x in art if len(x.split()) >= min_words]  

  return art

In [84]:
def occurs(text,word):
  """ function to check if a words occurs in a text
  Parameters:
    - word (str): word that is searched for
    - text (str): text that is searched in 
  Returns:
    - boolean: returns True if word occurs in text, False otherwise
  """
  if type(text)!=str:
    return False
  text=text.lower()
  word=word.lower()
  if len(re.findall(word,text))>0:
    return True
  else:
    return False

In [85]:
def filter_title(title, to_exclude):
  """ function to filter article by title
  Parameters:
    - title (str): title that is checked
  Returns:
    - boolean: returns True if either
        - there is no title
        - the title does not contain any of the words in the list to_exclude
  """
  if type(title)!=str:
    return True
  title=title.strip()
  for ex in to_exclude:
    if ex.lower() == title.lower(): 
      return False
  return True

In [86]:
def first_words_filter(text,to_exclude, number_of_words=3):
  """ function to filter article by its first words
  Parameters:
    - text (str): title that is checked
  Returns:
    - boolean: returns True if the first n words do not contain any of the words in the list to_exclude
  """
  if type(text)==list:
    text=text[0]
  # remove special characters
  text=re.sub("[/\-!@#$%^&*:.]", " ", text)
  first_words=text.split()[:number_of_words]

  # remove whitespace and convert to lower case
  first_words=[word.strip().lower() for word in first_words]
  for ex in to_exclude:
    if ex.lower() in first_words: 
      return False
  return True

In [87]:
def regex_search(text, regex):
  if len(re.findall(regex,text))>0:
    return False
  else:
    return True


## Cleaning text:
- remove newline characters
- remove paragraphs if too long or short
- remove genios styles 
- remove empty paragraphs
- remove duplicate articles

In [88]:
df_subset["text"] = [ reformat_article(art) for art in df_subset["text"]]
df_subset.shape

(63359, 8)

In [89]:
# remove "empty" articles, that wereremove in the previous step
df_subset=df_subset[df_subset['text'].notna()] 
df_subset=df_subset[df_subset['text'].apply(len)!=0]
df_subset.shape

(59658, 8)

In [90]:
df_subset=df_subset.drop_duplicates("text", keep="first")
df_subset.shape

(46288, 8)

In [178]:
df_subset_clean=df_subset

## Filter by content: 

In [179]:
# temporaray: step is part of format_data
df_subset_clean=df_subset_clean[df_subset_clean["name"]!="Falter (APA)"]
df_subset_clean=df_subset_clean[df_subset_clean["name"]!="Der Rheintaler"]

#### Duplicates: Filter out dupilcates across newspapers

In [186]:
df_subset_clean.artikel_id=df_subset_clean.artikel_id.astype(str)

In [191]:
#example of duplicates
df_subset_clean[df_subset_clean.duplicated(keep=False, subset=["artikel_id", "titel"])].sort_values("artikel_id").head(4)

Unnamed: 0,artikel_id,name,jahrgang,datum,ressort,titel,untertitel,text
52989,14054480,Münsterland Zeitung,2018.0,20180109,Nachrichten,Soldat schweigt nach tödlichem Drama,Ex-Freundin im Streit erwürgt,[Münster. Nach einem tödlichen Beziehungsdrama...
11737,14054480,Ruhr Nachrichten,2018.0,20180109,Nachrichten,Soldat schweigt nach tödlichem Drama,Ex-Freundin im Streit erwürgt,[Nach einem tödlichen Beziehungsdrama hat am M...
52990,14081543,Münsterland Zeitung,2018.0,20180126,Titelseite,Neues Konzept für Frauenhäuser,Düsseldorf. Die NRW-Landesregierung testet neu...,[Von Kirsten Bialdiga Um den Bedarf an Plätze...
11711,14081543,Ruhr Nachrichten,2018.0,20180126,Titelseite,Neues Konzept für Frauenhäuser,Düsseldorf. Die NRW-Landesregierung testet neu...,[Um den Bedarf an Plätzen in Frauenhäusern lan...


In [198]:
df_subset_clean=df_subset_clean.drop_duplicates(keep="first", subset=["artikel_id"])
df_subset_clean.shape

(45734, 8)

### Filter Articles by Title

In [199]:
exclude_titles=["Beratungsstellen", "Termine","Hilfe","Hier_finden_Sie_Hilfe_2sp","was - wann - wo","IN KÜRZE","Kurz notiert :","Dienstbereit - die Woche im Überblick","Was - wann -wo",
                "Beratung + Hilfe", "Beratung", "Nachrichten","Hilfe bei häuslicher Gewalt","Termine heute","kurz & bündig","tipps der redaktion","WAS, WANN, WO","service","Hier gibt es Hilfe","kurZNotiert",
                "CORONA-NACHRICHTEN","Kalenderblatt"
]

In [200]:
# only keep articles with titles not in the exclude list (exact matches)
df_subset_clean=df_subset_clean[df_subset_clean["titel"].apply(filter_title,args=[exclude_titles])]
df_subset_clean.shape

(43536, 8)

In [201]:
# filter based on keyword search
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["beratungsstelle"])]
# articles usually contain contact info, funding, etc
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["frauenhaus"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["kampagne gegen"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["aktion gegen"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["statistik"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["corona-krise"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["fallzahlen"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["corona-krise"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["weißer ring"])]
df_subset_clean=df_subset_clean[~df_subset_clean.titel.apply(occurs, args=["Dienstbereit"])]
df_subset_clean.shape

(42107, 8)

### Filter by Ressort

In [202]:
#exclude certain type of articles
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="Termine"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="TV"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="tipps und termine"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="Tipps und Termine"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="Fernsehen"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="Leserbriefe"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="Gottesdienste"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="Termine & Service"]
df_subset_clean=df_subset_clean[df_subset_clean.ressort!="SERVICE"]
df_subset_clean.shape

(41043, 8)

### Filter by Subtitle

In [203]:
df_subset_clean=df_subset_clean[df_subset_clean.untertitel!="TERMINE"]
df_subset_clean=df_subset_clean[df_subset_clean.untertitel!="Klatsch & Tratsch"]
df_subset_clean.shape

(41026, 8)

In [99]:
#keyword serach to exclude articles about statistics, victim protection programs
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["hotline"])]
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["beratungsstelle"])]
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["weißer ring"])]
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["Fernsehserie"])]
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["Hilfsangebote"])]
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["Kriminalitätsstatistik"])]
df_subset_clean=df_subset_clean[~df_subset_clean.untertitel.apply(occurs, args=["Kriminalstatistik"])]

df_subset_clean.shape

(40593, 8)

### Filter by Text

In [204]:
art_words_to_exclude=[# Notufe, Beratungen
                        "Bereitschaftsdienst", "Hotline", "Notruf", "Hilfetelefon","behindertenfahrdienst","Polizeiinspektion", 
                        "Feuerwehr","rettungsdienst", "Notdienst","Bereitschaftspraxis","Öffnungszeiten","Vergiftungen",
                        "Ärztehaus","Selbsthilfegruppe","Leitstelle","Tel","Aids","Ambulante","ACE", 
                        "Club","Interventionsstelle","Frauenberatungsstelle","Rufnummer","Rufnummern", "apotheke", "hilfsangebot","hilfsangebote", 
                        "opferhilfe","Berufsbildungszentrum","opferschutz",
                        # Kampagnen, Akitonen
                        "kampagne", "aktion", "ring","initiative",


]

In [101]:
# Filter out if keywords occur in the first 3 words of a text
df_subset_clean=df_subset_clean[df_subset_clean["text"].apply(first_words_filter,args=[art_words_to_exclude])]
df_subset_clean.shape

(31836, 8)

### Filter by Paragraph

In [205]:
df_exploded=df_subset_clean

In [206]:
# Explode "text" column
df_exploded= df_exploded.explode("text")
# Create "artikel_order" column
df_exploded["artikel_order"] = df_exploded.groupby("artikel_id").cumcount() + 1
df_exploded.shape

(658416, 9)

In [207]:
# remove duplicated paragraphs
df_exploded=df_exploded.drop_duplicates("text")
df_exploded.shape

(251367, 9)

In [208]:
# remove hotlines, etc, if keywords contained in the first 3 words
df_exploded=df_exploded[df_exploded["text"].apply(first_words_filter,3,args=[art_words_to_exclude])]
df_exploded.shape

(238308, 9)

In [209]:
# remove paragraphs by keywords if container in the first 3 words
par_words_to_exclude=["statistik", "kriminalstatistik", "landeskriminalamt"]
df_exploded=df_exploded[df_exploded["text"].apply(first_words_filter, args=[par_words_to_exclude])]
df_exploded.shape

(238055, 9)

In [210]:
# phone numbers usually occur in paragraphs with hotlines/ help services
phone_regex ='\(?\d{4,5}\)?[/\s]*\d{1,5}\s*\d{1,5}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[phone_regex])]
df_exploded.shape

(209844, 9)

In [211]:
# times usually indicate opening hours/ events 
time_regex = '\d{1,2}[:\/.]\d{1,2}[:\/.]\d{2,4}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[time_regex])]
df_exploded.shape

(208456, 9)

In [212]:
# times usually indicate opening hours/ events 
weekday_regex = '(Mo|Di|Mi|Do|Fr|Sa|So)[-\\,s–\ ./]+((Mo|Di|Mi|Do|Fr|Sa|So)[-–./ ]+)?\d{1,2}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[weekday_regex])]
df_exploded.shape

(204902, 9)

In [213]:
# exclude stats
stats_regex="\d*([\.,]\d*)?[ ](Prozent|%)"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[stats_regex])]
df_exploded.shape

(196582, 9)

In [214]:
# exclude stats
stats_regex2="\d+ (Fälle|Opfer|Frauen|Kinder|Personen|Betroffene)"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[stats_regex2])]
df_exploded.shape

(191390, 9)

In [215]:
# street names usually indicate events/ locations/ opening hours
street_regex="[A-z]+(str|straße|weg| Str|allee|gasse| Gasse|platz)[. ]+\d+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[street_regex])]
df_exploded.shape

(184944, 9)

In [216]:
# very short paragraphs usually are not part of the article
df_exploded.loc[:,"chars"]=df_exploded["text"].apply(len)
df_exploded=df_exploded[df_exploded["chars"]>60]
df_exploded.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exploded.loc[:,"chars"]=df_exploded["text"].apply(len)


(163366, 10)

In [217]:
# hours usually indicate the pararaph is containing opening hours, movie schedules, etc
hours_regex="(\d{1,2}.\d{2}, \d{1})+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[hours_regex])]
df_exploded.shape

(162802, 10)

In [218]:
# email addresses are usually in paragraphs with contact info
email_regex="[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[email_regex])]
df_exploded.shape

(161895, 10)

In [219]:
# links are usually in paragraphs with contact info
link_regex="(www\.)\S+\.\S+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[link_regex])]
df_exploded.shape

(160700, 10)

In [220]:
df_exploded=df_exploded[~df_exploded["text"].apply(occurs, args=["Stadtteiltreff"])]
df_exploded=df_exploded[~df_exploded["text"].apply(occurs, args=["Plakataktion"])]
df_exploded=df_exploded[~df_exploded["text"].apply(occurs, args=["One Billion Rising"])]
df_exploded=df_exploded[~df_exploded["text"].apply(occurs, args=["Gewalt kommt nicht in die Tüte"])]
df_exploded=df_exploded[~df_exploded["text"].apply(occurs, args=["opferschutzorganisation"])]
df_exploded=df_exploded[~df_exploded["text"].apply(occurs, args=["Frauen helfen Frauen"])]
df_exploded.shape

(159473, 10)

### Excluding annotated Paragraphs

In [118]:
import json

In [119]:
#read json data
json_data=json.load(open("annotated/annotations_05_18.json",encoding='utf-8'))

In [120]:
#convert to dataframe
data=pd.DataFrame(json_data["documents"])

In [121]:
#for now: filter out paragraphs that have not been annotated 
data=data[data["annotations"].apply(len)>0]
#reset index
data=data.reset_index(drop=True)
data.head()

Unnamed: 0,id,text,annotations,attributes_flat
0,4572dea4-6a08-4f1e-b312-5821112bb5f5,Ein Mann (25) ist jetzt vom Schöffengericht am...,"[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
1,0bcada32-8dc5-41cf-b83b-67d2e742bada,Als Zeugin trat die Ex-Lebensgefährtin des Syr...,"[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
2,a30791b9-522e-45c1-8b33-79d4165282af,"Zunächst leugnete der Angeklagte, dass es über...","[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
3,043e3909-bcdd-4c6b-a54f-f947d46ad18e,Das Schöffengericht hatte es in diesem Fall of...,"[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
4,08cef91c-6d73-472c-8349-07a5b72009d1,"""Gewalt in der Familie ist weder Privatsache n...","[{'str_start': None, 'str_stop': None, 'annota...","{'artikel_id': 'IRA-82182598', 'name': 'SÜDWES..."


In [None]:
data["artikel_id"]=[entr[1].attributes_flat["artikel_id"] for entr in data.iterrows()]
data["artikel_order"]=[entr[1].attributes_flat["artikel_order"] for entr in data.iterrows()]
data["name"]=[entr[1].attributes_flat["name"] for entr in data.iterrows()]

In [123]:
data["artikel_order"]=data["artikel_order"].astype(int)

In [124]:
data=data[["artikel_id","name","annotations","artikel_order"]]

In [222]:
df_merged=pd.merge(data,df_exploded,left_on=["artikel_id","artikel_order","name"],right_on=["artikel_id","artikel_order","name"], how="right")

In [223]:
# remove paragraphs that already have annotations
df_exploded=df_merged[df_merged.annotations.isna()]

### Randomly select one paragraph per article

In [225]:
df_subset_elinor1=df_exploded

In [128]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [226]:
def count_sentences(text):
    return len(nltk.sent_tokenize(text))

In [227]:
df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)


In [228]:
# Define a function to randomly select one row from each group
def select_random_row(group):
    if group['num_sentences'].max() > 1:
        return group[group['num_sentences'] > 1].sample(n=1)
    else:
        return group.head(1)

In [229]:
# Apply the function to each group and combine the results
random_rows = df_subset_elinor1.groupby('artikel_id').apply(select_random_row).reset_index(drop=True)

## Export as csv

In [133]:
output_path = "elinor"

#### Export Files in chunks of 2000 paragraphs

In [236]:
batches=round(random_rows.shape[0]/2000+0.5)

In [251]:
for i in range(1,batches+1):
  chunk=random_rows.iloc[((i-1)*2000):(i*2000),:]
  chunk.to_csv(output_path+f"/annotation_test_05_22_part{i}.csv", index=False, header = True,encoding = 'utf-8')

#### Export as one File

In [134]:
#random_rows.to_csv(output_path+"/annotation_test_05_18.csv", index=False, header = True,encoding = 'utf-8')