## Preparing Validation Data for DDB Tagger

- Sample from as broad a range of texts as possible, excluding DanAvis20; Botxt; Sønderjysk
- Choose random sentences that are long enough to have decent context on either side, aim for ±10 
- Ideally all words in the same sentence
- Only pick words that have more than 3 possible categories
- Only include nouns as the target (with the option of making a similar table which includes only verbs and one which includes only adjectives)

In [1]:
import os, sys
import pandas as pd
import random
from tqdm import tqdm

import spacy
nlp = spacy.load("da_core_news_sm")

sys.path.append("..")
from src.DDB_tagger import DDB_tagger
T = DDB_tagger()

In [42]:
def get_possible_sentences(file, text, context_size, target_pos): 

    possible = []
    possible_tagged = []

    # -- GET POSSIBLE TAGETS/SENTENCES --

    # split document into sentences (which do not contain a line break)
    doc_line = nlp(text)
    sentences = [str(sent) for sent in doc_line.sents]
    sentences_nobreak = [sent for sent in sentences if "\n" not in sent]

    # loop over sentences
    for sent in sentences_nobreak: 
        # split sentence into tokens
        doc_sent = nlp(sent)
        # get tokens and pos tags, if token is not punc or a space
        tokens_pos = [(token.text, token.pos_) for token in doc_sent if token.is_punct == False and token.is_space == False]

        # if there are more than twice the context +1 number of tokens:
        if len(tokens_pos) >= (context_size + context_size + 1):
            # loop through the tokens
            for token in tokens_pos:
                # if there are a noun and have at the sufficent number of tokens (context_size) before and after, add it as a possible target - and token is not just a letter
                if token[1] == target_pos and tokens_pos.index(token) > context_size and tokens_pos.index(token) < (len(tokens_pos) - context_size) and len(token[0]) > 1:
                    possible.append({"TARGET": token[0], "SENT": sent, "FILE": file})

    # -- TAG POSSIBLE TARGETS/SENTENCES AND FILTER OUT IF LESS THAN 4 TAGS --

    for p in possible: 
        # tag the possible sentences
        all_tagged = T.tag_text(p["SENT"], only_top3_results=False, only_tagged_results=True)
        # get the tagged target
        target_tagged = all_tagged[all_tagged["TOKEN"] == p["TARGET"]].reset_index()
        # if the target has 4 or more tags
        if target_tagged.at[0, "DDB4+"] != "-":
            # add it as a possible tagged target/sentence
            target_dict = dict(p, **{'DDB1': target_tagged.at[0, "DDB1"], 
                                     "DDB2": target_tagged.at[0, "DDB2"], 
                                     "DDB3": target_tagged.at[0, "DDB3"],
                                     "DDB4+": target_tagged.at[0, "DDB4+"]})
            
            # add to list (if not already in there (could happen if token occurs twice in sentence, then only the first occurance will be taken))
            if target_dict not in possible_tagged:
                possible_tagged.append(target_dict)

    return possible_tagged

### DAGW Corpus

**Legal Documents:**
- Retsinformation (`retsinformationdk`)
- Skat.dk (`skat`)
- (`retspraksis`)

**Social Media**
- Hestenettet (`hest`)
- General Discussions (``)
- Parliament Elections (``)

**Conversation**
- OpenSubtitles (`opensub`)
- Folketinget (`ft`)
- Europarl (`ep`)
- Spontaneous speech (`spont`) 
- NAAT (`naat`)

**Web**
- Common Crawl (`cc`)

**Wiki & Books**
- Wikipedia (`wiki`)
- Danish Literature (`adl`)?
- Gutenberg (`gutenberg`)
- WikiBooks (`wikibooks`)
- WikiSource (`wikisource`)
- Johannes V. Jensen (`jvj`)
- Religious texts (`relig`)

**News**
- TV2R (`tv2r`)
- DanAvis (`danavis`) - don't use

In [44]:
random.seed(1)

df = pd.DataFrame(columns=['TARGET', 'SENT', 'FILE', 'DDB1', 'DDB2', 'DDB3', 'DDB4+'])

sektioner = [["retsinformationdk", "skat", "retspraksis"],
             ["hest"],
             ["opensub", "ft", "ep", "spont", "naat"],
             ["cc"],
             ["wiki", "adl", "gutenberg", "wikibooks", "wikisource", "jvj", "relig"],
             ["tv2r"]]

for group in sektioner:

    print("SEKTION:", group)
    group_files = []
    for sektion in group:
        dir = f"../../DAGW/sektioner/{sektion}/"
        prefix = f"{sektion}_"
        # get the files of the sektion
        files = sorted([os.path.join(dir, file) for file in os.listdir(dir) if file.startswith(prefix)])
        print(len(files))
        group_files = group_files + files

    # get possibile targets/sentences for the sektion
    sek_possibilities = []
    n = len(sek_possibilities)

    # sample from files until 20 samples is reached
    while n < 20: 

        # sample a file from the sektion
        file = random.sample(group_files, 1)[0]
        group_files.remove(file)

        # read the first 1000 characters of the file and get the possibiltities
        text = open(file, "r").read()[:1000]
        file_possibilities = get_possible_sentences(file, text, 10, "NOUN")
        sek_possibilities = sek_possibilities + file_possibilities
        n = len(sek_possibilities)

    # put into data frame
    out = pd.DataFrame(sek_possibilities[:20])
    df = pd.concat([df, out], ignore_index=True, sort=False)

SEKTION: ['retsinformationdk', 'skat', 'retspraksis']
64043
14716
4442
SEKTION: ['hest']
14498
SEKTION: ['opensub', 'ft', 'ep', 'spont', 'naat']
32242
1315
4213
411
129
SEKTION: ['cc']
594
SEKTION: ['wiki', 'adl', 'gutenberg', 'wikibooks', 'wikisource', 'jvj', 'relig']
425938
498
66
1559
2668
42
66
SEKTION: ['tv2r']
49137


In [45]:
# highlight the target in the sentence
df['SENT'] = df.apply(lambda row : row["SENT"].replace(f" {row.TARGET}",  f" <{row.TARGET}> ", 1), axis = 1)
# apply empty column for rating
df["RATING"] = ""
# save
df.to_csv("example.csv")

In [46]:
df

Unnamed: 0,TARGET,SENT,FILE,DDB1,DDB2,DDB3,DDB4+,RATING
0,vægt,"Ankestyrelsen fandt, at der generelt skulle se...",../../DAGW/sektioner/retsinformationdk/retsinf...,"11|011|Måle, regne",13|007|Fysik,"15|055|Kendt, indflydelsesrig","[17|018|Kampsport, 17|012|Atletik, 22|017|Astr...",
1,grund,"Skatterådet var principielt enigt med SKAT i, ...",../../DAGW/sektioner/skat/skat_SKM2014.563.SR,"11|015|Grundsætning, princip","14|005|Tegning, maleri",03|015|Understøtte,"[04|015|Lavvandet, 12|035|Argumentere, bevise,...",
2,sammenhæng,"Heri er aftalt, at regeringen efter dialog med...",../../DAGW/sektioner/retsinformationdk/retsinf...,"05|014|Relation, sammenhæng",11|012|Logisk tænkning,"04|031|Forene, samle",[04|039|Helhed],
3,sagerne,"Hun begrundede det især med, at tre UVVU-medle...",../../DAGW/sektioner/retsinformationdk/retsinf...,"20|008|Besidde, eje",21|009|Forbrydelse,"21|023|Domstol, ret","[05|011|Omstændighed, forhold, 18|014|Offentli...",
4,lov,"I medfør af § 1, stk. 2, 4 og 5, § 1 a, stk. 1...",../../DAGW/sektioner/retsinformationdk/retsinf...,"13|002|Forskning, videnskab",21|017|Lov,21|020|Rettighed,"[22|010|Bøn, fromhed]",
...,...,...,...,...,...,...,...,...
115,klub,Vi vil gerne bevare den nuværende spillertrup ...,../../DAGW/sektioner/tv2r/tv2r_17968,17|026|Fornøjelser og fritidsaktiviteter,17|002|Sportsorganisation,18|012|Institution,"[20|031|Butik, 20|041|Virksomhed, ledelse]",
116,behandling,"En 16-årig passager blev så hårdt medtaget, at...",../../DAGW/sektioner/tv2r/tv2r_166789,"18|014|Offentlig forvaltning, tilsyn","02|040|Behandling, helbredelse","19|024|Råstoffer, materialer","[19|018|Computer, 12|024|Samtale]",
117,grænsen,Projektet har skabt strid intern i det danske ...,../../DAGW/sektioner/tv2r/tv2r_113501,"04|001|Størrelse, omfang","03|022|Grænse, rand","18|002|Stat, nation","[11|010|Skelne, 05|024|Modsætning, 19|011|Teknik]",
118,pris,Først i marts måned tabte Kære Pleje en sag ti...,../../DAGW/sektioner/tv2r/tv2r_19483,05|049|Modgang,15|019|Anerkendelse,16|011|Tobak,"[22|010|Bøn, fromhed]",
