# Purpose

This file shows the steps we took to sample and create the annotation dataset.

## Connect with Google drive to access data 

To access the data, first create a shortcut to the shared data folder in your own Google Drive. If you have been granted editing rights, you can edit the content of the folder, e.g. add, move and delete data, or create and rename folders.

In [None]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# redirect the working directory of this script to the data folder
%cd /content/drive/MyDrive/Work/Frontline/data/
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


## Load data

In [None]:
import tqdm as tqdm
from collections import Counter
import os
import pandas as pd
import re 
from ast import literal_eval
import statistics
import matplotlib.pyplot as plt
import re
import pandas as pd

folder_path = "filtered_4_26"

### Method 1: get csv files 

In [None]:

# collects one DataFrame per csv file found in folder_path
dfs = []

# loop through all entries of the data folder
# NOTE(review): os.listdir returns entries in arbitrary order, so the row
# labels assigned by ignore_index=True below depend on the listing order;
# this matters wherever rows are later selected by label
for filename in os.listdir(folder_path):
    # only csv files are loaded; everything else in the folder is skipped
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # import csv; the "text" column is parsed with literal_eval so that it
        # holds a Python object (a list) instead of its string representation
        df = pd.read_csv(file_path, index_col=0, converters={"text":literal_eval})
        dfs.append(df)
# combine all files into one DataFrame with a fresh 0..n-1 index
df_filtered = pd.concat(dfs, ignore_index=True)

ParserError: ignored

#### Create a random subset of the data 

In [None]:
# size of subset we want 
number = 1000

In [None]:
df_sample = df_filtered.sample(number,)# random_state=42)

### Method 2: get a csv file

In [None]:
df_sample_file = pd.read_csv('sample.csv', encoding='utf-8', index_col=0)

### Method 3: Manually selected dataset of relevant articles
--> ensuring that the dataset only contains relevant articles, especially for testing

In [None]:
subset_dv=df_filtered.loc[(11483,14044,62494,49199,11047,14948,10565,31059,58890,20347,55396,56389,5528,18532,59435,8035,27119,12788,59992,21477,10331,26314,45356,61023,31865,48960,44587,17992,14763,60043,20540,4563,13213,6751,43374,41018,38770,24654,21936,29297,1869,33163,60220,61232,57613,48979,33785,51576,8300,7675),:]

### Select Data
--> specify which data set of the three above methods should be used in the following analysis

In [None]:
# exactly one of the assignments below should be active

# full data set (all csv files of method 1)
df_subset=df_filtered

# uncomment for the random sample of `number` rows drawn from the full data set
#df_subset=df_sample

# uncomment for sample data set from csv file
# df_subset= df_sample_file

# uncomment for manually selected articles
# df_subset = subset_dv

## Adjust format for export

### Methods

In [None]:
def reformat_article(art, min_words=5, max_words=125):
  """Clean the paragraphs of one article and drop unusable ones.

  Each paragraph has its markup tags removed (anything between "<" and ">"
  without a line break in between, the "genios styles") and its surrounding
  whitespace, including newline characters, stripped. A paragraph is kept
  only if it is non-empty afterwards and its number of whitespace-separated
  words n satisfies min_words <= n < max_words.

  Parameters:
    - art (list of str): paragraphs of the article
    - min_words (int): smallest word count that is still kept
    - max_words (int): word count from which on a paragraph is dropped
  Returns:
    - list of str: cleaned paragraphs that passed all filters, in their
      original order
  """
  kept = []
  for paragraph in art:
    cleaned = re.sub(r'<.*?>', '', paragraph).strip()
    # paragraphs that consisted only of markup/whitespace are dropped
    if not cleaned:
      continue
    word_count = len(cleaned.split())
    if min_words <= word_count < max_words:
      kept.append(cleaned)
  return kept

In [None]:
def occurs(word, text):
  """ function to check if a pattern occurs in a text
  Parameters:
    - word (str): pattern that is searched for; it is handed to the re module
      unchanged, so regex metacharacters keep their special meaning
    - text (str): text that is searched in
  Returns:
    - boolean: returns True if the pattern matches at least once in text,
      False otherwise
  """
  # re.search stops at the first hit instead of collecting every match
  return re.search(word, text) is not None


In [None]:
def filter_title(title, to_exclude):
  """ function to filter article by title
  Parameters:
    - title (str): title that is checked; any non-string value (e.g. a
      missing title) is treated as "no title"
    - to_exclude (list of str): titles that mark an article for removal
  Returns:
    - boolean: returns True if either
        - there is no title
        - the title, ignoring case and surrounding whitespace, is not
          identical to any entry of to_exclude
      and False otherwise
  """
  # isinstance also recognises str subclasses, which an exact type
  # comparison would let pass unfiltered
  if not isinstance(title, str):
    return True
  normalised = title.strip().lower()
  return all(ex.lower() != normalised for ex in to_exclude)

In [None]:
def first_words_filter(text,to_exclude, number_of_words=3):
  """ function to filter a text by its first words
  Parameters:
    - text (str or list of str): text that is checked; for a list only its
      first element is inspected
    - to_exclude (list of str): words that mark a text for removal
    - number_of_words (int): how many leading words are inspected
  Returns:
    - boolean: returns True if none of the first number_of_words words equals
      (ignoring case) an entry of to_exclude. Also returns True when there is
      nothing to inspect, i.e. for an empty list or a non-string value.
  """
  if isinstance(text, list):
    if not text:
      return True
    text = text[0]
  if not isinstance(text, str):
    return True
  # replace special characters by spaces so that e.g. "Notruf:" yields the
  # word "notruf"
  text = re.sub(r"[/\-!@#$%^&*:.]", " ", text)
  # split() already removes all whitespace, only lower-casing is left to do
  first_words = {word.lower() for word in text.split()[:number_of_words]}
  return not any(ex.lower() in first_words for ex in to_exclude)

In [None]:
def regex_search(text, regex):
  """ function to check that a text is free of matches for a regex
  The result is inverted relative to a plain search so that it can be used
  directly as a keep-mask: True means "no match".
  Parameters:
    - text (str): text that is searched in
    - regex (str): regular expression that is searched for
  Returns:
    - boolean: returns False if regex matches anywhere in text, True otherwise
  """
  return re.search(regex, text) is None


## Cleaning text:
- remove newline characters
- remove paragraphs if too long or short
- remove genios styles 
- remove empty paragraphs
- remove duplicate articles

In [None]:
df_subset["text"] = [ reformat_article(art) for art in df_subset["text"]]
df_subset.shape

In [None]:
# remove articles that have no text at all or whose text list is empty,
# e.g. because all paragraphs were removed in the previous step
df_subset=df_subset[df_subset['text'].notna()] 
df_subset=df_subset[df_subset['text'].apply(len)!=0]
df_subset.shape

In [None]:
df_subset=df_subset.drop_duplicates("text", keep="first")
df_subset.shape

In [None]:
df_subset_clean=df_subset

## Filter by content: 

### Filter Articles by Title

In [None]:
exclude_titles=["Beratungsstellen", "Termine","Hilfe","Hier_finden_Sie_Hilfe_2sp","was - wann - wo","IN KÜRZE","Kurz notiert :","Dienstbereit - die Woche im Überblick","Was - wann -wo",
                "Beratung + Hilfe", "Beratung", "Nachrichten","Hilfe bei häuslicher Gewalt","Termine heute","kurz & bündig","tipps der redaktion",
]

In [None]:
# only keep articles with titles not in the exclude list
df_subset_clean=df_subset_clean[df_subset_clean["titel"].apply(filter_title,args=[exclude_titles])]
df_subset_clean.shape

In [None]:
#just for testing
#df_subset_clean[~df_subset_clean["titel"].apply(filter_title,args=[exclude_titles])]

### Filter by Text

In [None]:
# words that cause a text to be dropped when they appear among its first words
art_words_to_exclude=[# emergency numbers, counselling services
                        "Bereitschaftsdienst", "Hotline", "Notruf", "Hilfetelefon","behindertenfahrdienst","Polizeiinspektion", 
                        "Feuerwehr","rettungsdienst", "Notdienst","Bereitschaftspraxis","Öffnungszeiten","Vergiftungen",
                        "Ärztehaus","Selbsthilfegruppe","Leitstelle","Tel","Aids","Ambulante","ACE", 
                        "Club","Interventionsstelle","Frauenberatungsstelle","Rufnummer","Rufnummern", "apotheke", "hilfsangebot","hilfsangebote", 
                        "opferhilfe","Berufsbildungszentrum","opferschutz",
                        # campaigns, initiatives
                        "kampagne", "aktion", "ring","initiative",


]

In [None]:
df_subset_clean=df_subset_clean[df_subset_clean["text"].apply(first_words_filter,args=[art_words_to_exclude])]
df_subset_clean.shape

### Filter by Paragraph

In [None]:
# df_subset_clean_all=df_subset_clean
# df_subset_clean=df_subset_clean.iloc[:10000,:]

In [None]:
# restore the full data set after a test run on a slice; the backup
# df_subset_clean_all only exists if the (commented-out) slicing cell above
# was executed, so skip the restore instead of failing with a NameError
if "df_subset_clean_all" in globals():
    df_subset_clean = df_subset_clean_all

In [None]:
# temporary: step is part of format_data
# The sources are removed from the article-level frame because df_exploded
# is only created in the next cell; the exploded paragraphs carry the "name"
# column of their article, so the result is the same rows being dropped.
excluded_sources = ["Falter (APA)", "Der Rheintaler"]
df_subset_clean = df_subset_clean[~df_subset_clean["name"].isin(excluded_sources)]

In [None]:
# Explode "text" column
df_exploded= df_subset_clean.explode("text")
# Create "artikel_order" column
df_exploded["artikel_order"] = df_exploded.groupby("artikel_id").cumcount() + 1
df_exploded.shape

In [None]:
# remove duplicated paragraphs
df_exploded=df_exploded.drop_duplicates("text")
df_exploded.shape

In [None]:
# remove hotlines, etc., if a keyword is contained in the first 3 words
# number_of_words has to be passed by keyword: the second positional argument
# of Series.apply is convert_dtype and is not forwarded to the function
df_exploded = df_exploded[
    df_exploded["text"].apply(first_words_filter, args=[art_words_to_exclude], number_of_words=3)
]
df_exploded.shape

In [None]:
# remove paragraphs by keywords if container in the first 3 words
par_words_to_exclude=["statistik", "kriminalstatistik", "landeskriminalamt"]
df_exploded=df_exploded[df_exploded["text"].apply(first_words_filter, args=[par_words_to_exclude])]
df_exploded.shape

In [None]:
# phone numbers usually occur in paragraphs with hotlines/ help services
# raw string: the backslashes reach the regex engine unchanged and Python
# raises no invalid-escape warning
phone_regex = r'\(?\d{4,5}\)?[/\s]*\d{1,5}\s*\d{1,5}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[phone_regex])]
df_exploded.shape

In [None]:
# three number groups separated by ":", "/" or "." (dates such as 12.05.2021,
# times with seconds) usually indicate opening hours/ events
# raw string: the backslashes reach the regex engine unchanged and Python
# raises no invalid-escape warning
time_regex = r'\d{1,2}[:\/.]\d{1,2}[:\/.]\d{2,4}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[time_regex])]
df_exploded.shape

In [None]:
# weekday abbreviations followed by a number (optionally a weekday range
# such as "Mo-Fr 9") usually indicate opening hours/ events
# raw string: same pattern as before, without relying on Python passing
# unknown escape sequences through
# NOTE(review): the first character class contains a literal "s"; possibly
# "\s" was intended - confirm
weekday_regex = r'(Mo|Di|Mi|Do|Fr|Sa|So)[-\,s–\ ./]+((Mo|Di|Mi|Do|Fr|Sa|So)[-–./ ]+)?\d{1,2}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[weekday_regex])]
df_exploded.shape

In [None]:
# exclude stats: paragraphs containing " Prozent" or " %", typically
# preceded by a (decimal) number
# raw string: the backslashes reach the regex engine unchanged and Python
# raises no invalid-escape warning
stats_regex = r"\d*([\.,]\d*)?[ ](Prozent|%)"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[stats_regex])]
df_exploded.shape

In [None]:
# exclude stats: a number directly followed by " Fälle", " Opfer",
# " Frauen" or " Kinder"
# raw string: the backslash reaches the regex engine unchanged and Python
# raises no invalid-escape warning
stats_regex2 = r"\d+( Fälle| Opfer| Frauen| Kinder)"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[stats_regex2])]
df_exploded.shape

In [None]:
# street names followed by a house number usually indicate events/ locations/
# opening hours
# The letter class is spelled out: "[A-z]" also matched the ASCII characters
# between "Z" and "a" ([ \ ] ^ _ `) and did not cover German umlauts or "ß".
# raw string: the backslash reaches the regex engine unchanged and Python
# raises no invalid-escape warning
street_regex = r"[A-Za-zÄÖÜäöüß]+(str|straße|weg| Str|allee|gasse| Gasse|platz)[. ]+\d+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[street_regex])]
df_exploded.shape

In [None]:
# very short paragraphs usually are not part of the article
df_exploded.loc[:,"chars"]=df_exploded["text"].apply(len)
df_exploded=df_exploded[df_exploded["chars"]>60]
df_exploded.shape

In [None]:
# hours usually indicate that the paragraph contains opening hours, movie
# schedules, etc. (e.g. "14.30, 1...")
# raw string: the backslashes reach the regex engine unchanged and Python
# raises no invalid-escape warning
hours_regex = r"(\d{1,2}.\d{2}, \d{1})+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[hours_regex])]
df_exploded.shape

In [None]:
# email addresses are usually in paragraphs with contact info
# raw string: the backslash reaches the regex engine unchanged and Python
# raises no invalid-escape warning
email_regex = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[email_regex])]
df_exploded.shape

In [None]:
# links (anything starting with "www.") are usually in paragraphs with
# contact info
# raw string: the backslashes reach the regex engine unchanged and Python
# raises no invalid-escape warning
link_regex = r"(www\.)\S+\.\S+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[link_regex])]
df_exploded.shape

In [None]:
#exclude events
df_exploded=df_exploded[df_exploded.ressort!="Termine"]
df_exploded.shape

In [None]:
#df_exploded.text.apply(occurs,args=["statistisch"])#[df_exploded["chars"]<100].sort_values("chars",ascending=False)

### Excluding annotated Paragraphs

In [None]:
import json

In [None]:
# read json data; the context manager closes the file handle again once the
# content has been parsed
with open("annotated/annotations_05_18.json", encoding='utf-8') as annotation_file:
    json_data = json.load(annotation_file)

In [None]:
#convert to dataframe
data=pd.DataFrame(json_data["documents"])

In [None]:
#for now: filter out paragraphs that have not been annotated 
data=data[data["annotations"].apply(len)>0]
#reset index
data=data.reset_index(drop=True)
data.head()

In [None]:
# copy the identifying fields out of each document's "attributes_flat" entry
# into columns of their own; iterating the column directly avoids building a
# row Series per document for every field
for field in ("artikel_id", "artikel_order", "name"):
    data[field] = [attrs[field] for attrs in data["attributes_flat"]]

In [None]:
data["artikel_order"]=data["artikel_order"].astype(int)

In [None]:
data=data[["artikel_id","name","annotations","artikel_order"]]

In [None]:
# attach the annotations to the paragraphs, matched on article id, paragraph
# position and source name; the right join keeps every row of df_exploded
# NOTE(review): annotated paragraphs are only marked here (non-null
# "annotations"), not dropped - confirm whether they should be excluded
merge_keys = ["artikel_id", "artikel_order", "name"]
df_exploded = data.merge(df_exploded, how="right", left_on=merge_keys, right_on=merge_keys)

In [None]:
df_exploded

Unnamed: 0,artikel_id,name,annotations,artikel_order,jahrgang,datum,ressort,titel,untertitel,text,chars,num_sentences
0,F3A4578D33A8603DF0573D3DE3CB2666_3080763,Neue Ruhr/Rhein Zeitung,"[{'str_start': None, 'str_stop': None, 'annota...",1,2022.0,20220128,Lokales,Mann (25) schlägt Ex-Freundin,Amtsgericht hat einen Beziehungsstreit im März...,Ein Mann (25) ist jetzt vom Schöffengericht am...,331,2
1,F3A4578D33A8603DF0573D3DE3CB2666_3080763,Neue Ruhr/Rhein Zeitung,"[{'str_start': None, 'str_stop': None, 'annota...",3,2022.0,20220128,Lokales,Mann (25) schlägt Ex-Freundin,Amtsgericht hat einen Beziehungsstreit im März...,Als Zeugin trat die Ex-Lebensgefährtin des Syr...,581,5
2,F3A4578D33A8603DF0573D3DE3CB2666_3080763,Neue Ruhr/Rhein Zeitung,"[{'str_start': None, 'str_stop': None, 'annota...",4,2022.0,20220128,Lokales,Mann (25) schlägt Ex-Freundin,Amtsgericht hat einen Beziehungsstreit im März...,"Zunächst leugnete der Angeklagte, dass es über...",506,2
3,F3A4578D33A8603DF0573D3DE3CB2666_3080763,Neue Ruhr/Rhein Zeitung,"[{'str_start': None, 'str_stop': None, 'annota...",5,2022.0,20220128,Lokales,Mann (25) schlägt Ex-Freundin,Amtsgericht hat einen Beziehungsstreit im März...,Das Schöffengericht hatte es in diesem Fall of...,717,6
4,IRA-82182598,SÜDWEST PRESSE,"[{'str_start': None, 'str_stop': None, 'annota...",2,2021.0,20210412,ULM und NEU-ULM,Mehr häusliche Gewalt im Corona-Jahr registriert,,"""Gewalt in der Familie ist weder Privatsache n...",510,2
5,omp0000006023765,Münchner Merkur,"[{'str_start': None, 'str_stop': None, 'annota...",2,2020.0,20200615,Lokalteil,Diskrete und direkte Hilfe in Notsituationen,,"Sylvia Braun, Geschäftsleiterin des Frauen- un...",485,2
6,omp0000006023765,Münchner Merkur,"[{'str_start': None, 'str_stop': None, 'annota...",5,2020.0,20200615,Lokalteil,Diskrete und direkte Hilfe in Notsituationen,,"Das Angebot ist nicht nur kostenfrei, sondern ...",454,3
7,Poly_spon-00000000-0002-0001-0000-000160834454,SPIEGEL Plus,"[{'str_start': None, 'str_stop': None, 'annota...",1,2018.0,20181116,,"""Unvorstellbare Zahlen""",,"Frau Giffey, jeden zweiten bis dritten Tag wir...",156,2
8,Poly_spon-00000000-0002-0001-0000-000160834454,SPIEGEL Plus,"[{'str_start': None, 'str_stop': None, 'annota...",9,2018.0,20181116,,"""Unvorstellbare Zahlen""",,In Spanien gingen Frauen massenweise auf die S...,130,2
9,Poly_spon-00000000-0002-0001-0000-000160834454,SPIEGEL Plus,"[{'str_start': None, 'str_stop': None, 'annota...",16,2018.0,20181116,,"""Unvorstellbare Zahlen""",,"Das stimmt, und es wurde auch schon viel getan...",539,4


### Randomly select one paragraph per article

In [None]:
df_subset_elinor1=df_exploded

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def count_sentences(text, language="german"):
    """Return the number of sentences in a text.

    Parameters:
      - text (str): text that is split into sentences
      - language (str): name of the Punkt model used by nltk.sent_tokenize.
        Defaults to "german" because the paragraphs processed here are
        German; nltk's own default would be "english".
    Returns:
      - int: number of sentences found in text
    """
    return len(nltk.sent_tokenize(text, language=language))

In [None]:
df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)

In [None]:
# Define a function to randomly select one row from each group
def select_random_row(group, random_state=None):
    """Pick one row from a group, preferring rows with several sentences.

    If the group has rows whose 'num_sentences' value is greater than 1, one
    of them is drawn at random. Otherwise the first row of the group is
    returned.

    Parameters:
      - group (pd.DataFrame): rows of one group; needs a 'num_sentences'
        column
      - random_state: forwarded to DataFrame.sample; pass a fixed seed to make
        the draw reproducible. The default None keeps the unseeded behaviour.
    Returns:
      - pd.DataFrame: frame holding the selected row (empty if group is empty)
    """
    multi_sentence = group[group['num_sentences'] > 1]
    if multi_sentence.empty:
        return group.head(1)
    return multi_sentence.sample(n=1, random_state=random_state)

In [None]:
# Apply the function to each group and combine the results
random_rows = df_subset_elinor1.groupby('artikel_id').apply(select_random_row).reset_index(drop=True)

In [None]:
#artikel id & order are uniquely identifying
random_rows.duplicated(["artikel_id","artikel_order"]).sum()

0

In [None]:
random_rows.shape

(27945, 11)

## Export as csv

In [None]:
output_path = "elinor"

In [None]:
random_rows.to_csv(output_path+"/annotation_test_05_18.csv", index=False, header = True,
                  encoding = 'utf-8')