<a href="https://colab.research.google.com/github/blue-create/langlens/blob/main/export/elinor_export.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purpose

This file shows the steps we took to sample and create the annotation dataset.

## Connect with Google drive to access data

In order to access the data, you first need to create a shortcut of the data folder to your own Gdrive. If you've been granted editing rights, you should be able to edit the content of the folder, i.e. add, move and delete data, create and rename folders, etc.

In [1]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# redirect the working directory of this script to the data folder
%cd /content/drive/MyDrive/Work/Frontline/data/
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


## Load data

In [3]:
import tqdm as tqdm
from collections import Counter
import os
import pandas as pd
import re
from ast import literal_eval
import statistics
import matplotlib.pyplot as plt
import re
import pandas as pd

folder_path = "filtered/filtered_06_15 (only kw)"

### Method 1: get csv files

In [4]:

dfs = []

# loop through files
for filename in os.listdir(folder_path):
    # if csv file, load and add to dfs
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # import csv with text as list object
        df = pd.read_csv(file_path, index_col=0, converters={"text":literal_eval})
        dfs.append(df)
# combine files in df
df_filtered = pd.concat(dfs, ignore_index=True)

#### Create a random subset of the data

In [5]:
# size of subset we want
number = 1100

In [6]:
df_sample = df_filtered.sample(number,)

### Method 2: get a csv file

In [7]:
#df_sample_file = pd.read_csv('sample.csv', encoding='utf-8', index_col=0)

### Method 3: Manually selected dataset of relevant articles
--> ensuring that the dataset only contains relevant articles, espscially for testing

In [8]:
subset_dv=df_filtered.loc[(11483,14044,62494,49199,11047,14948,10565,31059,58890,20347,55396,56389,5528,18532,59435,8035,27119,12788,59992,21477,10331,26314,45356,61023,31865,48960,44587,17992,14763,60043,20540,4563,13213,6751,43374,41018,38770,24654,21936,29297,1869,33163,60220,61232,57613,48979,33785,51576,8300,7675),:]

### Select Data
--> specify which data set of the three above methods should be used in the following analysis

In [9]:
# uncomment for full data set
df_subset=df_filtered

# uncomment for random data of 100 from full data set
#df_subset=df_sample

# uncomment for sample data set from csv file
#df_subset= df_sample_file

# uncomment for manually selected articles
# df_subset = subset_dv

## Adjust format for export

### Methods

In [10]:
def reformat_article(art, min_words=5, max_words=125):
  """ function to reformat an article: remove white spaces, empty paragraphs, genios styles, too long/ short paragraphs
  Parameters:
    - art (list of str): article as list of paragraphs
    - min_words (int): minimum number of word length a paragraph should have, default is 5
    - max_words (int):  maximum number of word length a paragraph should have, default is 125
  Returns:
    - art (list of str). artic
  """
  # remove genios styles
  art = [re.sub(r'<.*?>', '', x) for x in art]

  # remove new line characters and preceeding whitespaces
  art = [x.strip() for x in art]

  #remove empty paragraphs
  art = [x for x in art if x.strip()]

  #remove paragraphs that are too long
  art = [x for x in art if len(x.split()) < max_words]

  #remove paragraphs that are too short, ie. by default 5 or fewer words
  art = [x for x in art if len(x.split()) >= min_words]

  return art

## Cleaning text:
- remove newline characters
- remove paragraphs if too long or short
- remove genios styles
- remove empty paragraphs
- remove duplicate articles

In [11]:
#remove duplicates
df_subset.artikel_id=df_subset.artikel_id.astype(str)
df_subset=df_subset.drop_duplicates(keep="first", subset=["artikel_id","datum"])
df_subset=df_subset.drop_duplicates(keep="first", subset=["text"])
df_subset.shape

(55711, 8)

In [12]:
# clean articles
df_subset["text"] = [ reformat_article(art) for art in df_subset["text"]]
df_subset.shape

(55711, 8)

In [13]:
# remove "empty" articles, that wereremove in the previous step
df_subset=df_subset[df_subset['text'].notna()]
df_subset=df_subset[df_subset['text'].apply(len)!=0]
df_subset.shape

(52498, 8)

In [14]:
df_subset_clean=df_subset

## Filter by content:

In [15]:
from scripts import filtering

### Filter Articles by Title

In [16]:
exclude_exact_titles=["Beratungsstellen", "Termine","Hilfe","Hier_finden_Sie_Hilfe_2sp","was - wann - wo","IN KÜRZE","Kurz notiert :","Was - wann -wo",
                "Beratung + Hilfe", "Beratung", "Nachrichten","Hilfe bei häuslicher Gewalt","Termine heute","kurz & bündig","tipps der redaktion","WAS, WANN, WO","service","Hier gibt es Hilfe","kurZNotiert",
                "CORONA-NACHRICHTEN","Kalenderblatt"
]
exclude_titles=["beratungsstelle", "Frauenhaus","aktion gegen", "statistik","corona-krise","fallzahlen","dienstbereit","weißer ring","weißen ring","kampagne"]

In [17]:
# only keep articles with titles not in the exclude list:
# exact matches
df_subset_clean=filtering.filter_data(df_subset_clean,"titel",exclude_exact_titles, exact_match=True)
# keywordsearch
df_subset_clean=filtering.filter_data(df_subset_clean,"titel",exclude_titles, exact_match=False)

(50193, 8)
(48725, 8)


In [18]:
#without title: usualla events/ hotlines
df_subset_clean=df_subset_clean[~df_subset_clean["titel"].isna()]

### Filter by Ressort

In [19]:
exclude_exact_ressorts=["Termine","tv", "tipps und termine", "fernsehen","gottesdienste","Termine & Service","service","Leserbriefe","Leserservice"]

In [20]:
# only keep articles with ressorts not in the exclude list: exact matches
df_subset_clean=filtering.filter_data(df_subset_clean,"ressort",exclude_exact_ressorts, exact_match=True)

(35911, 8)


### Filter by Subtitle

In [21]:
exclude_exact_subtitles=["termine", "klatsch & tratsch"]
exclude_subtitles=["hotline","beratungsstelle","weißer ring","fernsehserie","hilfsangebote","Kriminalitätsstatistik","Kriminalstatistik", "in die Tüte", "Kampagne"]

In [22]:
# only keep articles with titles not in the exclude list: exact matches
df_subset_clean=filtering.filter_data(df_subset_clean,"untertitel",exclude_exact_subtitles, exact_match=True )
# only keep articles with titles not in the exclude list: exact matches
df_subset_clean=filtering.filter_data(df_subset_clean,"untertitel",exclude_subtitles, exact_match=False )

(35893, 8)
(35020, 8)


### Filter by Text

In [23]:
exclude_articles=[# Notufe, Beratungen
                        "Bereitschaftsdienst", "Hotline", "Notruf", "Hilfetelefon","behindertenfahrdienst","Polizeiinspektion",
                        "Feuerwehr","rettungsdienst", "Notdienst","Bereitschaftspraxis","Öffnungszeiten","Vergiftungen",
                        "Ärztehaus","Selbsthilfegruppe","Leitstelle","Tel","Aids","Ambulante","ACE",
                        "Club","Interventionsstelle","Frauenberatungsstelle","Rufnummer","Rufnummern", "apotheke", "hilfsangebot","hilfsangebote", "Klinikum"
                        "opferhilfe","Berufsbildungszentrum","opferschutz",
                        # Kampagnen, Akitonen
                        "kampagne", "aktion", "ring","initiative","Frauen helfen Frauen"
                        # Events, Services
                        "Mo Di","mi Do","do fr", "mo do","sa so", "sa mo","di mi","fr sa", "online","Ü50 Singletreff", "Uhr","Treffpunkt",
                        #corona
                        "Dieser Artikel wird laufend aktualisiert"


]

In [24]:
# Filter out if keywords occur in the first 5 words of a text
df_subset_clean=filtering.filter_data(df_subset_clean,"text",exclude_articles, exact_match=False ,first_words=5)

(28004, 8)


###Filter by Lengths

In [25]:
df_subset_clean["lengths"]=[len("\n".join(i).split()) for i in df_subset_clean.text]

In [26]:
df_subset_clean["lengths"].quantile([0,0.5,0.7,0.9,0.95,0.99,1])

0.00        5.00
0.50      350.00
0.70      510.00
0.90      786.00
0.95     1040.00
0.99     1813.94
1.00    42353.00
Name: lengths, dtype: float64

In [27]:
df_subset_clean=df_subset_clean[df_subset_clean.lengths<1000]

In [28]:
df_subset_clean.to_csv("filtered/filtered_06_16.csv", index=False)

### Filter by Paragraph

In [55]:
df_exploded=df_subset_clean

In [56]:
# Explode "text" column
df_exploded= df_exploded.explode("text")
# Create "artikel_order" column
df_exploded["artikel_order"] = df_exploded.groupby("artikel_id").cumcount() + 1
df_exploded.shape

(228753, 10)

In [57]:
# remove duplicated paragraphs
df_exploded=df_exploded.drop_duplicates("text")
df_exploded.shape

(154941, 10)

In [58]:
# remove hotlines, etc, if keywords contained in the first 5 words
df_exploded=filtering.filter_data(df_exploded,"text",exclude_articles,False,5)
# remove paragraphs by keywords if container in the first 5 words

exclude_paragraphs=["Stadtteiltreff","Plakataktion","One Billion Rising","Gewalt kommt nicht in die Tüte","opferschutzorganisation","Frauen helfen Frauen","statistik", "kriminalstatistik", "landeskriminalamt"]
df_exploded=filtering.filter_data(df_exploded,"text",exclude_paragraphs,False,5)

(144351, 10)
(143740, 10)


In [59]:
# regex filter: email, links, times, streets,email, weekdays
df_exploded=filtering.regex_filter(df_exploded,"text",)

In [60]:
# very short paragraphs usually are not part of the article
df_exploded.loc[:,"chars"]=df_exploded["text"].apply(len)
df_exploded=df_exploded[df_exploded["chars"]>60]
df_exploded.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exploded.loc[:,"chars"]=df_exploded["text"].apply(len)


(115556, 11)

### Excluding annotated Paragraphs

In [61]:
annot=pd.read_csv("annotated/230613_all_annotations.csv",index_col=0)

In [62]:
annot=annot[["artikel_id","name","annotations","artikel_order"]]

In [63]:
df_merged=pd.merge(annot,df_exploded,left_on=["artikel_id","artikel_order","name"],right_on=["artikel_id","artikel_order","name"], how="right")

In [None]:
df_merged

In [65]:
# remove paragraphs that already have annotations
df_exploded=df_merged[df_merged.annotations.isna()]

### Randomly select one paragraph per article

In [66]:
df_subset_elinor1=df_exploded

In [41]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [42]:
def count_sentences(text):
    return len(nltk.sent_tokenize(text))

In [67]:
df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset_elinor1['num_sentences'] = df_subset_elinor1['text'].apply(count_sentences)


In [68]:
# Define a function to randomly select one row from each group
def select_random_row(group):
    if group['num_sentences'].max() > 1:
        return group[group['num_sentences'] > 1].sample(n=1)
    else:
        return group.head(1)

In [69]:
# Apply the function to each group and combine the results
random_rows = df_subset_elinor1.groupby('artikel_id').apply(select_random_row).reset_index(drop=True)

In [70]:
random_rows=random_rows.loc[:,random_rows.columns!="annotations"]

## Export as csv

In [None]:
output_path = "elinor"

#### Export Files in chunks of 2000 paragraphs

In [None]:
batches=round(random_rows.shape[0]/2000+0.5)

In [71]:
random_rows.shape

(22520, 12)

In [None]:
# for i in range(1,batches+1):
#   chunk=random_rows.iloc[((i-1)*2000):(i*2000),:]
#   print(chunk.shape)
#   chunk.to_csv(output_path+f"/annotation_test_05_22_part{i}.csv", index=False, header = True,encoding = 'utf-8')

#### Export as one File

In [73]:
sample=random_rows.sample(1000)

In [75]:
sample.to_csv("elinor/annotation_test_06_16.csv", index=False, header = True,encoding = 'utf-8')