## Aim

Create a stratified (previously by taxons and document_type) sample of sentences to annotate, some of which are likely to contain instances of the following Entity Types:

- FORM 
- LOCATION (PROPER NOUN / GPE)
- ORG (PROPER NOUN)
- PERSON (PROPER NOUN)
- POSTCODE
- EMAIL
- PHONE N
- DATE
- MONEY £ (AMOUNT)

We will include:

- ALL the extracted contact details
- 1000 titles with a ratio of 2.5:1 likely-to-contain-a-target-entity : likely-NOT-to-contain-a-target-entity  
- 3000 sentences from body text with a ratio of 2.5:1 likely-to-contain-a-target-entity : likely-NOT-to-contain-a-target-entity  

We will then create two samples to annotate, so that it is easier to share the workload.


### Requirements

A stratified random sample of page paths and their content, obtained from the MongoDB copy of the Content Store.
Please see `src/make_strata/README.md`.

This will ensure the input file `{SAMPLE_DATE}_stratified_sample_all_content.csv` exists.

In [None]:
import gc
import json
import os
import re
import sys
import time
from typing import List

import dask.dataframe as dd
import pandas as pd
import spacy
from tqdm.notebook import tqdm

from src.make_data.make_data import text_col_to_sents
from src.make_strata.sample_paths_by_strata import get_stratified_sample

# Allow up to 400 characters per cell when dataframes are displayed.
pd.set_option("max_colwidth", 400)

# Pipeline components that are switched off when the model is loaded.
_DISABLED_COMPONENTS = [
    "tok2vec",
    "tagger",
    "parser",
    "lemmatizer",
    "attribute_ruler",
]
nlp = spacy.load("en_core_web_lg", disable=_DISABLED_COMPONENTS)
nlp.add_pipe("sentencizer")

# check which pipeline components will be applied
nlp.pipeline

In [None]:
# Input lives next to the strata-making code; the output folder is read from
# the DIR_DATA_PROCESSED environment variable (None when it is not set).
DIR_INPUT = "../../src/make_strata/data"
DIR_OUTPUT = os.getenv("DIR_DATA_PROCESSED")

In [None]:
DIR_OUTPUT

## User-defined elements

In [None]:
SAMPLE_DATE = "20220627"

In [None]:
N = 1

In [None]:
OUPUT_FILE1 = f"{SAMPLE_DATE}_phase1_extra_training_examples_n{N}_p1.jsonl"
OUPUT_FILE2 = f"{SAMPLE_DATE}_phase1_extra_training_examples_n{N}_p2.jsonl"

In [None]:
OUPUT_FILEPATH1 = os.path.join(DIR_OUTPUT, OUPUT_FILE1)
OUPUT_FILEPATH2 = os.path.join(DIR_OUTPUT, OUPUT_FILE2)

In [None]:
# Display both output paths (one file per annotation batch).
OUPUT_FILEPATH1, OUPUT_FILEPATH2

## Load the data

In [None]:
os.getcwd()

In [None]:
INPUT_FILE = os.path.join(DIR_INPUT, f"{SAMPLE_DATE}_stratified_sample_all_content.csv")

In [None]:
INPUT_FILE

In [None]:
df = pd.read_csv(INPUT_FILE)

In [None]:
df.columns

In [None]:
df.shape

## Extract Contact details

We will add to the example set all the extracted contact details from `contact` and `organisation` pages.

In [None]:
contact_details_df = df.dropna(subset=["contact_details"]).copy()

In [None]:
# keep only the identifying columns plus the extracted contact details
contact_details_df = contact_details_df.loc[
    :, ["base_path", "document_type", "schema_name", "contact_details"]
]

## Extract text and process it to sentences

#### (1) From block of text to list of sentences as string

In [None]:
# Turn the "text" column into sentences using dask, and time the whole step.
print("Preprocessing sentences...")
tic = time.perf_counter()
# split the dataframe into as many partitions as os.cpu_count() reports
ddf = dd.from_pandas(df, npartitions=os.cpu_count())
# run text_col_to_sents on the "text" column of every partition
# (presumably it yields one list of sentences per row -- confirm in
# src/make_data/make_data.py)
res = ddf.map_partitions(lambda df: text_col_to_sents(df, "text"))
# materialise the lazy dask result
out = res.compute()
# attach the computed result to the original dataframe as "sentences"
df = df.assign(sentences=out)
toc = time.perf_counter()
print(f"Preprocessing of sentences - Completed in in {toc - tic:0.4f} seconds")

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df[["base_path", "document_type", "text", "sentences"]][:4]

#### (2) Explode the dataframe by sentence, so that we have one sentence per row

In [None]:
# one sentence per row
exploded_df = df[["base_path", "document_type", "text", "sentences"]].explode(
    "sentences"
)

In [None]:
exploded_df.shape

In [None]:
exploded_df[["base_path", "document_type", "text", "sentences"]].sort_values(
    by=["base_path"]
)[:20]

#### (3) Convert `sentences` to spacy NLP objects 

In [None]:
# Drop the rows that have no sentence
exploded_df = exploded_df.dropna(subset=["sentences"])
exploded_df.shape

In [None]:
# nlp.pipe processes the sentences as a batched stream, which is the spaCy
# API intended for many texts; it yields one Doc per sentence, in order.
nlp_sentences = list(nlp.pipe(exploded_df.sentences))

In [None]:
len(nlp_sentences)

#### (4) Flag whether a sentence is likely to contain at least one of the "target entities"

In [None]:
# Entity labels that mark a text as likely to contain a target entity.
TARGET_ENTITIES = {"DATE", "GPE", "LOC", "FAC", "MONEY", "ORG", "PERSON"}

In [None]:
# One boolean per parsed sentence: True when at least one of its entity
# labels is among the target entities.
contain_target_entity = [
    not TARGET_ENTITIES.isdisjoint(ent.label_ for ent in doc.ents)
    for doc in nlp_sentences
]

In [None]:
len(contain_target_entity)

In [None]:
# Number of sentences that potentially contain one of the target entities
sum(contain_target_entity)

#### (5) Merge flags back to original dataframe for sampling

In [None]:
exploded_df["contains_target_entities"] = contain_target_entity

In [None]:
exploded_df.head()

## Extract titles


In [None]:
# Take an explicit copy: a flag column is assigned to this frame further down,
# and assigning into a column slice of `df` raises SettingWithCopyWarning.
titles_df = df[["base_path", "document_type", "schema_name", "title"]].copy()

In [None]:
# contains possible Target entities?
titles_df

In [None]:
# nlp.pipe processes the titles as a batched stream, which is the spaCy API
# intended for many texts; it yields one Doc per title, in order.
nlp_titles = list(nlp.pipe(titles_df.title))

In [None]:
# One boolean per parsed title: True when at least one of its entity labels
# is among the target entities.
title_contain_target_entity = [
    not TARGET_ENTITIES.isdisjoint(ent.label_ for ent in doc.ents)
    for doc in nlp_titles
]

In [None]:
sum(title_contain_target_entity)

In [None]:
titles_df["contains_target_entities"] = title_contain_target_entity

In [None]:
titles_df.head()

### Random sample stratified by likelihood of containing a target entity

We will include:
- ALL the contact details
- 1000 titles with a ratio of 2.5:1 likely-to-contain-a-target-entity : likely-NOT-to-contain-a-target-entity  
- 3000 sentences from body text with a ratio of 2.5:1 likely-to-contain-a-target-entity : likely-NOT-to-contain-a-target-entity  

We will then create and export two samples to annotate, so that it is easier to share the workload.

In [None]:
STRATA_WEIGHTS = {True: 2.5, False: 1}

Titles

In [None]:
# Draw the title sample, stratified on the "contains_target_entities" flag
# with the STRATA_WEIGHTS ratio and a fixed seed.
# NOTE(review): the notebook text announces 1000 titles, but sample_size
# here is 600 -- confirm which figure is intended.
SEED_TITLES = 111
rand_sample_titles = get_stratified_sample(
    df=titles_df,
    strata_col="contains_target_entities",
    weights=STRATA_WEIGHTS,
    sample_size=600,
    seed=SEED_TITLES,
)
rand_sample_titles.shape

Texts

In [None]:
# Draw the body-text sentence sample, stratified on the
# "contains_target_entities" flag with the STRATA_WEIGHTS ratio and a fixed seed.
# NOTE(review): the notebook text announces 3000 sentences, but sample_size
# here is 2000 -- confirm which figure is intended.
SEED_TEXT = 222
rand_sample_texts = get_stratified_sample(
    df=exploded_df,
    strata_col="contains_target_entities",
    weights=STRATA_WEIGHTS,
    sample_size=2000,
    seed=SEED_TEXT,
)

In [None]:
rand_sample_texts.shape

Contact details

In [None]:
contact_details_df.shape

#### Merge the random samples

Ensure consistency of columns

In [None]:
# Keep the shared columns and expose the sentence under the common "text" name.
rand_sample_texts = rand_sample_texts.loc[
    :, ["base_path", "document_type", "sentences"]
]
rand_sample_texts = rand_sample_texts.rename(
    columns={"sentences": "text"}
).reset_index(drop=True)

In [None]:
# Keep the shared columns and expose the title under the common "text" name.
rand_sample_titles = rand_sample_titles.loc[
    :, ["base_path", "document_type", "title"]
]
rand_sample_titles = rand_sample_titles.rename(
    columns={"title": "text"}
).reset_index(drop=True)

In [None]:
# Keep the shared columns and expose the contact details under the common
# "text" name.
contact_details = (
    contact_details_df[["base_path", "document_type", "contact_details"]]
    .rename(columns={"contact_details": "text"})
    .reset_index(drop=True)
)

In [None]:
total_rand_sample = pd.concat(
    [rand_sample_texts, rand_sample_titles, contact_details], ignore_index=True
)

In [None]:
total_rand_sample.shape

Shuffle rows

In [None]:
# shuffle with a fixed seed, so that the row order (and therefore the split
# into two annotation files) is reproducible, like the seeded samples above
SEED_SHUFFLE = 333
total_rand_sample = total_rand_sample.sample(frac=1, random_state=SEED_SHUFFLE)
total_rand_sample.head()

### Convert to Prodigy format

In [None]:
def from_df_to_jsonl(data: pd.DataFrame):
    """Build one dictionary per row of ``data``.

    ``data`` must provide the ``base_path``, ``document_type`` and ``text``
    columns. The result is a list, in row order, of dictionaries shaped as
    ``{"text": ..., "meta": {"base_path": ..., "doc_type": ...}}``.
    """
    rows = zip(data["base_path"], data["document_type"], data["text"])
    return [
        {"text": sentence, "meta": {"base_path": path, "doc_type": kind}}
        for path, kind, sentence in rows
    ]

In [None]:
def split_list(a_list):
    """Split ``a_list`` in two at its midpoint and return both parts.

    The midpoint index (``len(a_list) // 2``) is printed. For an odd length
    the second part is the longer one.
    """
    midpoint = len(a_list) // 2
    print(midpoint)
    first_part = a_list[:midpoint]
    second_part = a_list[midpoint:]
    return first_part, second_part

In [None]:
output_jsonl = from_df_to_jsonl(total_rand_sample)

In [None]:
len(output_jsonl)

Split it into 2

In [None]:
rand_sample1, rand_sample2 = split_list(output_jsonl)

## Write to JSON lines

In [None]:
def export_to_jsonl(file: List[dict], output_filpath: str):
    """Write ``file`` to ``output_filpath`` in JSON Lines format.

    Args:
        file: The records to export; each one is serialised with
            ``json.dumps`` onto its own line.
        output_filpath: Path of the file to write. An existing file is
            overwritten.
    """
    # ensure_ascii=False keeps non-ASCII characters (e.g. "£") as-is, so the
    # encoding is pinned to UTF-8 instead of relying on the platform default.
    with open(output_filpath, "w", encoding="utf-8") as fp:
        fp.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in file
        )

In [None]:
export_to_jsonl(rand_sample1, OUPUT_FILEPATH1)

In [None]:
export_to_jsonl(rand_sample2, OUPUT_FILEPATH2)