# Purpose

This file shows the steps we took to sample and create the annotation dataset.

## Connect with Google drive to access data 

In order to access the data, you first need to create a shortcut of the data folder to your own Gdrive. If you've been granted editing rights, you should be able to edit the content of the folder, i.e. add, move and delete data, create and rename folders, etc.

In [1]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# redirect the working directory of this script to the data folder
%cd /content/drive/MyDrive/Work/Frontline/data/
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


## Load data

In [3]:
import tqdm as tqdm
from collections import Counter
import os
import pandas as pd
import re 
from ast import literal_eval
import statistics
import matplotlib.pyplot as plt
import re
import pandas as pd

folder_path = "filtered_4_26"

### Method 1: get csv files 

In [4]:

dfs = []

# loop through files 
for filename in os.listdir(folder_path):
    # if csv file, load and add to dfs  
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # import csv with text as list object 
        df = pd.read_csv(file_path, index_col=0, converters={"text":literal_eval})
        dfs.append(df)
# combine files in df
df_filtered = pd.concat(dfs, ignore_index=True)

#### Create a random subset of the data 

In [5]:
# size of subset we want 
number = 1000

In [6]:
df_sample = df_filtered.sample(number,)# random_state=42)

### Method 2: get a csv file

In [None]:
df_sample_file = pd.read_csv('sample.csv', encoding='utf-8', index_col=0)

### Method 3: Manually selected dataset of relevant articles
--> ensuring that the dataset only contains relevant articles, espscially for testing

In [None]:
subset_dv=df_filtered.loc[(11483,14044,62494,49199,11047,14948,10565,31059,58890,20347,55396,56389,5528,18532,59435,8035,27119,12788,59992,21477,10331,26314,45356,61023,31865,48960,44587,17992,14763,60043,20540,4563,13213,6751,43374,41018,38770,24654,21936,29297,1869,33163,60220,61232,57613,48979,33785,51576,8300,7675),:]

### Select Data
--> specify which data set of the three above methods should be used in the following analysis

In [69]:
# uncomment for full data set
df_subset=df_filtered

# uncomment for random data of 100 from full data set
#df_subset=df_sample

# uncomment for sample data set from csv file
# df_subset= df_sample_file

# uncomment for manually selected articles 
# df_subset = subset_dv

## Adjust format for export

### Methods

In [8]:
def reformat_article(art, min_words=5, max_words=125):
  # remove genios styles 
  art = [re.sub(r'<.*?>', '', x) for x in art]

  # remove new line characters and preceeding whitespaces
  art = [x.strip() for x in art]

  #remove empty paragraphs
  art = [x for x in art if x.strip()]

  #remove paragraphs that are too long
  art = [x for x in art if len(x.split()) < max_words]

  #remove paragraphs that are too short, ie. by default 3 or fewer words
  art = [x for x in art if len(x.split()) >= min_words]  

  return art

In [9]:
def occurs(word, text):
  """ function to check if a words occurs in a text
  Parameters:
    - word (str): word that is searched for
    - text (str): text that is searched in 
  Returns:
    - boolean: returns True if word occurs in text, False otherwise
  """
  if len(re.findall(word,text))>0:
    return True
  else:
    return False


In [59]:
def filter_title(title, to_exclude):
  """ function to filter article by title
  Parameters:
    - title (str): title that is checked
  Returns:
    - boolean: returns True if either
        - there is no title
        - the title does not contain any of the words in the list to_exclude
  """
  if type(title)!=str:
    return True
  title=title.strip()
  for ex in to_exclude:
    if ex.lower() == title.lower(): 
      return False
  return True

In [51]:
def first_words_filter(text,to_exclude, number_of_words=3):
  """ function to filter article by its first words
  Parameters:
    - text (str): title that is checked
  Returns:
    - boolean: returns True if the first n words do not contain any of the words in the list to_exclude
  """
  if type(text)==list:
    text=text[0]
  # remove special characters
  text=re.sub("[/\-!@#$%^&*:.]", " ", text)
  first_words=text.split()[:number_of_words]

  # remove whitespace and convert to lower case
  first_words=[word.strip().lower() for word in first_words]
  for ex in to_exclude:
    if ex.lower() in first_words: 
      return False
  return True

In [144]:
def regex_search(text, regex):
  if len(re.findall(regex,text))>0:
    return False
  else:
    return True


## Cleaning text:
- remove newline characters
- remove paragraphs if too long or short
- remove genios styles 
- remove empty paragraphs
- remove duplicate articles

In [70]:
df_subset["text"] = [ reformat_article(art) for art in df_subset["text"]]
df_subset.shape

(63359, 8)

In [71]:
# remove "empty" articles, that wereremove in the previous step
df_subset=df_subset[df_subset['text'].notna()] 
df_subset=df_subset[df_subset['text'].apply(len)!=0]
df_subset.shape

(59658, 8)

In [72]:
df_subset=df_subset.drop_duplicates("text", keep="first")
df_subset.shape

(46288, 8)

In [73]:
df_subset_clean=df_subset

## Filter by content: 

### Filter Articles by Title

In [105]:
exclude_titles=["Beratungsstellen", "Termine","Hilfe","Hier_finden_Sie_Hilfe_2sp","was - wann - wo","IN KÜRZE","Kurz notiert :","Dienstbereit - die Woche im Überblick","Was - wann -wo",
                "Beratung + Hilfe", "Beratung", "Nachrichten","Hilfe bei häuslicher Gewalt","Termine heute","kurz & bündig","tipps der redaktion",
]

In [106]:
# only keep articles with titles not in the exclude list
df_subset_clean=df_subset_clean[df_subset_clean["titel"].apply(filter_title,args=[exclude_titles])]
df_subset_clean.shape

(44152, 8)

In [None]:
#just for testing
#df_subset_clean[~df_subset_clean["titel"].apply(filter_title,args=[exclude_titles])]

### Filter by Text

In [107]:
art_words_to_exclude=[# Notufe, Beratungen
                        "Bereitschaftsdienst", "Hotline", "Notruf", "Hilfetelefon","behindertenfahrdienst","Polizeiinspektion", 
                        "Feuerwehr","rettungsdienst", "Notdienst","Bereitschaftspraxis","Öffnungszeiten","Vergiftungen",
                        "Ärztehaus","Selbsthilfegruppe","Leitstelle","Tel","Aids","Ambulante","ACE", 
                        "Club","Interventionsstelle","Frauenberatungsstelle","Rufnummer","Rufnummern", "apotheke", "hilfsangebot","hilfsangebote", 
                        "opferhilfe","Berufsbildungszentrum","opferschutz",
                        # Kampagnen, Akitonen
                        "kampagne", "aktion", "ring","initiative",


]

In [108]:
df_subset_clean=df_subset_clean[df_subset_clean["text"].apply(first_words_filter,args=[art_words_to_exclude])]
df_subset_clean.shape

(35211, 8)

### Filter by Paragraph

In [109]:
df_subset_clean_all=df_subset_clean
df_subset_clean=df_subset_clean.iloc[:10000,:]

In [162]:
# Explode "text" column
df_exploded= df_subset_clean.explode("text")
# Create "artikel_order" column
df_exploded["artikel_order"] = df_exploded.groupby("artikel_id").cumcount() + 1
df_exploded.shape

(141800, 9)

In [163]:
# remove duplicated paragraphs
df_exploded=df_exploded.drop_duplicates("text")
df_exploded.shape

(68416, 9)

In [164]:
# remove hotlines, etc, if keywords containedin the first 3 words
df_exploded=df_exploded[df_exploded["text"].apply(first_words_filter,3,args=[art_words_to_exclude])]
df_exploded.shape

(66560, 9)

In [166]:
# remove paragraphs by keywords if container in the first 3 words
par_words_to_exclude=["statistik", "kriminalstatistik"]
df_exploded=df_exploded[df_exploded["text"].apply(first_words_filter, args=[par_words_to_exclude])]
df_exploded.shape

(66466, 9)

In [129]:
sample=df_exploded.sample(300)

In [167]:
phone_regex ='\(?\d{4,5}\)?[/\s]*\d{1,5}\s*\d{1,5}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[phone_regex])]
df_exploded.shape

(62004, 9)

In [168]:
time_regex = '\d{1,2}[:\/.]\d{1,2}[:\/.]\d{2,4}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[time_regex])]
df_exploded.shape

(60722, 9)

In [170]:
weekday_regex = '(Mo|Di|Mi|Do|Fr|Sa|So)[-\\s–\./]+((Mo|Di|Mi|Do|Fr|Sa|So)[-–./]+)?\s\d{1,2}'
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[weekday_regex])]
df_exploded.shape

(59493, 9)

In [172]:
stats_regex="\d*([\.,]\d*)?[ ](Prozent|%)"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[stats_regex])]
df_exploded.shape

(56913, 9)

In [181]:
street_regex="[A-z]+(str|weg| Str|allee|gasse| Gasse|platz)[. ]+\d+"
df_exploded=df_exploded[df_exploded["text"].apply(regex_search, args=[street_regex])]
df_exploded.shape

(54487, 9)

In [196]:
df_exploded["chars"]=df_exploded["text"].apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exploded["chars"]=df_exploded["text"].apply(len)


In [None]:
df_exploded[df_exploded["chars"]<50].sort_values("chars",ascending=False)

### Randomly select one paragraph per article

In [None]:
import nltk
nltk.download('punkt')

In [None]:
def count_sentences(text):
    return len(nltk.sent_tokenize(text))

In [None]:
df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)

In [None]:
# Define a function to randomly select one row from each group
def select_random_row(group):
    if group['num_sentences'].max() > 1:
        return group[group['num_sentences'] > 1].sample(n=1)
    else:
        return group.head(1)

In [None]:
# Apply the function to each group and combine the results
random_rows = df_subset_elinor1.groupby('artikel_id').apply(select_random_row).reset_index(drop=True)

In [None]:
final = random_rows[['text']]

## Export as csv

In [None]:
output_path = "elinor"

In [None]:
final.to_csv(output_path+"/annotation_test_05_14.csv", index=False, header = True,
                  encoding = 'utf-8')