# Purpose

This file shows the steps we took to sample and create the annotation dataset.

## Connect with Google drive to access data 

To access the data, you first need to add a shortcut to the shared data folder in your own Google Drive. If you have been granted editing rights, you can edit the content of the folder, i.e. add, move and delete data, create and rename folders, etc.

In [79]:
# Connect with Google Drive: mount it into the runtime under /content/drive.
# The google.colab package is used, so this cell is meant to run in Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
# redirect the working directory of this script to the data folder
%cd /content/drive/MyDrive/Work/Frontline/data/
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


## Load data

In [81]:
import tqdm as tqdm
from collections import Counter
import os
import pandas as pd
import re 
from ast import literal_eval
import statistics
import matplotlib.pyplot as plt
import re
import pandas as pd

# Folder (relative to the current working directory) that is scanned for the
# CSV files to load.
folder_path = "filtered_4_26"

### Method 1: get csv files 

In [82]:

# Collects one DataFrame per CSV file found directly inside `folder_path`.
dfs = []

# loop through files
# NOTE(review): os.listdir() returns entries in arbitrary order, so the row
# numbering created by `ignore_index=True` below may differ between runs or
# machines. Any code that selects rows by hard-coded index label depends on
# this order -- confirm it is stable in your environment.
for filename in os.listdir(folder_path):
    # if csv file, load and add to dfs; every other entry is ignored
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # The first CSV column is used as the index. The "text" column is
        # parsed with literal_eval, i.e. it is expected to hold the string
        # representation of a Python literal (treated as a list of paragraphs
        # by the cleaning steps further down).
        df = pd.read_csv(file_path, index_col=0, converters={"text":literal_eval})
        dfs.append(df)
# combine files in one frame with a fresh 0..n-1 index.
# pd.concat raises ValueError when `dfs` is empty (no CSV file was found).
# NOTE: after the loop, `df` stays bound to the last file that was read.
df_filtered = pd.concat(dfs, ignore_index=True)

#### Create a random subset of the data 

In [284]:
# Number of rows to draw for the random subset.
number = 1000

In [285]:
# Draw `number` rows from the full data set. No random_state is passed, so
# every run yields a different sample; passing random_state=42 (see the
# trailing comment) would make the draw reproducible.
df_sample = df_filtered.sample(number,)# random_state=42)

### Method 2: get a csv file

In [85]:
# Load a previously exported sample; the first CSV column becomes the index.
# NOTE(review): unlike Method 1, no literal_eval converter is applied here, so
# a "text" column (if present) stays a plain string instead of a list of
# paragraphs -- confirm this is what the downstream cleaning expects.
df_sample_file = pd.read_csv('sample.csv', encoding='utf-8', index_col=0)

### Method 3: Manually selected dataset of relevant articles
--> ensures that the dataset only contains relevant articles, especially for testing

In [86]:
# Hand-picked rows, selected by their index label in df_filtered.
# NOTE(review): these labels refer to the 0..n-1 index created when the CSV
# files were concatenated, so they only identify the same articles if the
# files are loaded in the same order with the same content -- verify before
# reusing this list.
subset_dv=df_filtered.loc[(11483,14044,62494,49199,11047,14948,10565,31059,58890,20347,55396,56389,5528,18532,59435,8035,27119,12788,59992,21477,10331,26314,45356,61023,31865,48960,44587,17992,14763,60043,20540,4563,13213,6751,43374,41018,38770,24654,21936,29297,1869,33163,60220,61232,57613,48979,33785,51576,8300,7675),:]

### Select Data
--> specify which of the data sets created by the three methods above should be used in the following analysis

In [356]:
# Exactly one of the assignments below should be active.
# NOTE: the assignment binds df_subset to the same object as the chosen frame
# (no copy is made), so in-place column assignments on df_subset also change
# that frame.

# option 1: full data set
df_subset=df_filtered

# option 2: random sample of `number` rows drawn from the full data set
#df_subset=df_sample

# option 3: sample data set loaded from the csv file
# df_subset= df_sample_file

# option 4: manually selected articles
# df_subset = subset_dv

## Adjust format for export

### Methods

In [88]:
def reformat_article(art, min_words=5, max_words=125):
  """ function to clean the paragraphs of one article
  Parameters:
    - art (list of str): paragraphs of the article
    - min_words (int): paragraphs with fewer words than this are dropped
    - max_words (int): paragraphs with this many words or more are dropped
  Returns:
    - list of str: the cleaned paragraphs that were kept, in their original order
  """
  tag_pattern = r'<.*?>'
  kept = []
  for paragraph in art:
    # strip markup tags (genios styles), then surrounding whitespace/newlines
    cleaned = re.sub(tag_pattern, '', paragraph).strip()

    # skip paragraphs that are empty after cleaning
    if not cleaned:
      continue

    # keep the paragraph only if min_words <= number of words < max_words
    word_count = len(cleaned.split())
    if min_words <= word_count < max_words:
      kept.append(cleaned)

  return kept

In [89]:
def occurs(word, text):
  """ function to check whether a pattern matches anywhere in a text
  Parameters:
    - word (str): what to look for; it is handed to the re module and is
                  therefore interpreted as a regular expression
    - text (str): text that is searched
  Returns:
    - boolean: True if there is at least one match, False otherwise
  """
  return re.search(word, text) is not None


In [90]:
def filter_title(title, excluded=None):
  """ function to decide whether an article is kept, based on its title
  Parameters:
    - title (str or other): title that is checked; any non-string value
                            (e.g. NaN for a missing title) counts as "no title"
    - excluded (iterable of str, optional): titles to reject; defaults to the
                            module-level list exclude_titles
  Returns:
    - boolean: returns True if either
        - there is no title
        - the stripped title is not equal (ignoring case) to any excluded title
      and False otherwise. Titles that merely contain an excluded title are kept.
  """
  # missing titles cannot match anything, so the article is kept
  if not isinstance(title, str):
    return True
  # fall back to the notebook-level list only when no list is passed in
  if excluded is None:
    excluded = exclude_titles
  normalized = title.strip().lower()
  return all(ex.lower() != normalized for ex in excluded)

## Cleaning text:
- remove newline characters
- remove paragraphs if too long or short
- remove genios styles 
- remove empty paragraphs
- remove duplicate articles

In [457]:
# Clean every article's paragraph list with reformat_article (default word
# limits). This overwrites the "text" column in place.
df_subset["text"] = [ reformat_article(art) for art in df_subset["text"]]

In [458]:
# Remove "empty" articles: rows without a text value and rows whose paragraph
# list became empty in the previous cleaning step.
df_subset=df_subset[df_subset['text'].notna()] 
df_subset=df_subset[df_subset['text'].apply(len)!=0]
# display the remaining (rows, columns)
df_subset.shape

(46288, 8)

In [459]:
# Drop articles whose "text" equals that of an earlier row; the first
# occurrence is kept.
# NOTE(review): pandas has to hash the values to detect duplicates. If "text"
# holds Python lists this may raise "TypeError: unhashable type: 'list'" --
# confirm, and deduplicate on a hashable form (e.g. a tuple or a joined
# string) if it does.
df_subset=df_subset.drop_duplicates("text", keep="first")
df_subset.shape

(46288, 8)

In [460]:
# Name under which the cleaned data is used by the content filters below
# (same object as df_subset, not a copy).
df_subset_clean=df_subset

## Filter by content: 

### Filter Articles by Title

In [461]:
df_subset_clean.shape

(46288, 8)

In [462]:
# Titles of entries that should be dropped. filter_title() compares each of
# them with the whole (stripped) title, ignoring case -- not as a substring.
exclude_titles=["Beratungsstellen", "Termine","Hilfe","Hier_finden_Sie_Hilfe_2sp","was - wann - wo","IN KÜRZE","Kurz notiert :","Dienstbereit - die Woche im Überblick"
]

In [463]:
# only keep articles whose title (column "titel") is missing or not in the
# exclude list
df_subset_clean=df_subset_clean[df_subset_clean["titel"].apply(filter_title)]
# display the remaining (rows, columns)
df_subset_clean.shape

(44467, 8)

### Filter by Text

In [478]:
def first_words_filter(text, number_of_words=3, excluded=None):
  """ function to decide whether an article is kept, based on how it starts
  Parameters:
    - text (list of str): paragraphs of the article; only the first paragraph
                          is inspected
    - number_of_words (int): how many leading words of that paragraph are checked
    - excluded (iterable of str, optional): words that disqualify an article;
                          defaults to the module-level list first_words_to_exclude
  Returns:
    - boolean: returns False if one of the first n words equals (ignoring case)
               one of the excluded words, True otherwise. An article without
               paragraphs has no first words and is therefore kept.
  """
  # fall back to the notebook-level list only when no list is passed in
  if excluded is None:
    excluded = first_words_to_exclude

  # nothing to inspect -> nothing to exclude (avoids an IndexError on text[0])
  if not text:
    return True

  # replace special characters by spaces so that e.g. "Notruf:" or "Tel."
  # are split into clean words
  first_paragraph = re.sub(r"[/\-!@#$%^&*:.]", " ", text[0])

  # lower-cased leading words; split() already removes surrounding whitespace
  first_words = {word.lower() for word in first_paragraph.split()[:number_of_words]}
  return not any(ex.lower() in first_words for ex in excluded)

In [514]:
# Keep only articles whose first paragraph does not start with an excluded word.
# NOTE: first_words_to_exclude must already be defined when this cell runs;
# its defining cell appears below this one in the file.
df_subset_clean=df_subset_clean[df_subset_clean["text"].apply(first_words_filter)]
# display the remaining (rows, columns)
df_subset_clean.shape

(35457, 8)

In [512]:
# Words that disqualify an article when one of them is among its first words
# (compared case-insensitively by first_words_filter).
first_words_to_exclude=[# emergency numbers, counselling services
                        "Bereitschaftsdienst", "Hotline", "Notruf", "Hilfetelefon","behindertenfahrdienst","Polizeiinspektion", 
                        "Feuerwehr","rettungsdienst", "Notdienst","Bereitschaftspraxis","Öffnungszeiten","Vergiftungen",
                        "Ärztehaus","Selbsthilfegruppe","Leitstelle","Tel","Aids","Ambulante","ACE", 
                        "Club","Interventionsstelle","Frauenberatungsstelle","Rufnummer","Rufnummern", "apotheke", "hilfsangebot","hilfsangebote", 
                        "opferhilfe","Berufsbildungszentrum","opferschutz",
                        # campaigns, actions
                        "kampagne", "aktion", "ring","initiative",


]

In [96]:
# Define regular expressions for weekdays, times, telephone numbers, and street names

# Two German weekday abbreviations joined by a hyphen or en dash, e.g. "Mo-Fr".
weekday_regex = re.compile(r'(Mo|Di|Mi|Do|Fr|Sa|So)\s*[-–]\s*(Mo|Di|Mi|Do|Fr|Sa|So)')
# A time span followed by a weekday and a date, in the form
# "<H:MM> bis <H:MM> , <weekday> , <day>.<word>" (":" or "." as time separator).
time_regex = re.compile(r'\d{1,2}[:\.]\d{2}\s*bis\s*\d{1,2}[:\.]\d{2}\s*,\s*(Mo|Di|Mi|Do|Fr|Sa|So)\s*,\s*\d{1,2}\.\w+')
# Telephone numbers: five digits (optionally in parentheses), optional "/" or
# whitespace, then two further digit groups of 1-5 digits each (the groups may
# be adjacent).
phone_regex = re.compile(r'\(?\d{5}\)?[/\s]*\d{1,5}\s*\d{1,5}')
# Street name + house number: a word starting with an ASCII capital letter
# (optionally hyphenated) followed by digits.
# NOTE(review): only ASCII letters are matched (no umlauts or "ß"), and any
# capitalised word followed by a number matches as well (e.g. "Mai 2023") --
# confirm this is acceptable.
street_regex = re.compile(r'[A-Z][a-z]*-?[A-Za-z]+-?[A-Za-z]*\s*\d+\w*')


In [97]:
# Define a function to remove the unwanted text snippets from a string
def clean_text(text):
    """Strip weekday ranges, time spans, phone numbers and street names from a string.

    Every match of weekday_regex, time_regex, phone_regex and street_regex is
    removed, in that order; afterwards leading and trailing whitespace is
    trimmed.

    Parameters:
      - text (str): a single string; other types (e.g. a list of paragraphs)
                    raise a TypeError
    Returns:
      - str: the text without the matched snippets
    """
    for pattern in (weekday_regex, time_regex, phone_regex, street_regex):
        text = pattern.sub('', text)
    return text.strip()

In [98]:
# Apply the clean_text function to every value of the "text" column and store
# the result in a new "clean_text" column.
# NOTE(review): df_subset_elinor1 is not defined in the visible part of this
# file, and the recorded output of this cell is a TypeError. clean_text()
# expects a string, so this fails if "text" holds lists of paragraphs or
# missing values -- confirm the column has been exploded to one paragraph per
# row first.
df_subset_elinor1['clean_text'] = df_subset_elinor1['text'].apply(clean_text)

TypeError: ignored

### Explode by paragraphs

In [None]:
# Explode "text" column: one row per paragraph instead of one row per article
# NOTE(review): `df` is the loop variable of the CSV loader, i.e. the last
# file that was read -- not the cleaned and filtered df_subset_clean. Confirm
# which frame is meant to be exploded.
df_exploded= df.explode("text")
# Create "artikel_order" column: 1-based position of each row within its
# "artikel_id" group
df_exploded["artikel_order"] = df_exploded.groupby("artikel_id").cumcount() + 1


### Randomly select one paragraph per article

In [None]:
import nltk
nltk.download('punkt')

In [None]:
def count_sentences(text, language="german"):
    """Count the sentences in a text.

    Parameters:
      - text (str): text that is split into sentences
      - language (str): name of the Punkt model that nltk.sent_tokenize uses.
                        Defaults to "german" because the title and keyword
                        lists in this file are German; without this argument
                        NLTK falls back to its English model, which does not
                        know German abbreviations. Pass "english" to get the
                        previous behaviour.
    Returns:
      - int: number of sentences found by nltk.sent_tokenize
    """
    return len(nltk.sent_tokenize(text, language=language))

In [None]:
# Store the sentence count of every "text" value in a new "num_sentences" column.
# NOTE(review): df_subset_elinor1 is not defined in the visible part of this
# file; count_sentences() is applied to each value as-is -- presumably each
# row holds one paragraph string at this point, confirm.
df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)

In [None]:
# Define a function to randomly select one row from each group
def select_random_row(group, random_state=None):
    """Pick one row from a group of rows.

    Parameters:
      - group (pd.DataFrame): rows of one group; needs a 'num_sentences' column
      - random_state (optional): forwarded to DataFrame.sample so the draw can
                                 be made reproducible; None (default) keeps the
                                 draw unseeded as before
    Returns:
      - pd.DataFrame: a one-row frame. If the group has rows with more than one
                      sentence, one of them is drawn at random; otherwise the
                      first row of the group is returned (an empty frame for an
                      empty group).
    """
    # candidates are the rows with more than one sentence
    multi_sentence = group[group['num_sentences'] > 1]
    if multi_sentence.empty:
        return group.head(1)
    return multi_sentence.sample(n=1, random_state=random_state)

In [None]:
# Apply select_random_row to each "artikel_id" group and combine the results
# into one frame with a fresh 0..n-1 index (one row per group).
# NOTE: the draw inside select_random_row is not seeded, so the selection
# differs between runs.
random_rows = df_subset_elinor1.groupby('artikel_id').apply(select_random_row).reset_index(drop=True)

In [None]:
# Keep only the "text" column for the export (double brackets keep it a DataFrame).
final = random_rows[['text']]

## Export as csv

In [None]:
# Folder (relative to the current working directory) the export is written to.
output_path = "elinor"

In [None]:
# Write the selected rows to <output_path>/annotation_test_05_14.csv as UTF-8,
# with a header row and without the index column.
# NOTE: to_csv does not create missing folders, so output_path must exist.
final.to_csv(output_path+"/annotation_test_05_14.csv", index=False, header = True,
                  encoding = 'utf-8')