## Aim

Create a sample of sentences containing FORM entity False Positive strings (i.e., strings that look like FORMs but are not) to train the annotation prodigy-spacy model.

These cases will be extracted from the following document types:
- 'html_publication',
- 'research',
- 'aaib_report',
- 'employment_tribunal_decision',
- 'impact_assessment',
- 'notice'

### Requirements

Please download a copy of the pre-processed content store, following [these](./src/strata/README.md) instructions.





In [None]:
import os
import gc
import re
import pandas as pd
from tqdm.notebook import tqdm

import sys
import json

from src.make_data.make_data import load_preprocessed_content_store
from src.strata.sample_paths_by_strata import get_stratified_sample

pd.set_option('max_colwidth', 400)


In [None]:
# Output directory for processed data, read from the DIR_DATA_PROCESSED
# environment variable (None when the variable is not set).
DIR_OUTPUT = os.getenv('DIR_DATA_PROCESSED')

In [None]:
DIR_OUTPUT

## User-defined elements

In [None]:
# Destination of the JSON-lines sample written at the end of the notebook.
# NOTE(review): the name is misspelled ("OUPUT"); it is left unchanged because
# the final cell refers to it by this name.
OUPUT_FILEPATH = os.path.join(DIR_OUTPUT, 'fake_forms_sentences.jsonl')

In [None]:
# Document types searched for strings that look like FORMs but are not.
# Each type is listed once ('research' used to appear twice).
TARGET_DOCUMENT_TYPES = [
    'html_publication',
    'research',
    'aaib_report',
    'employment_tribunal_decision',
    'impact_assessment',
    'notice',
]

In [None]:
# Weight of each document type (stratum) handed to get_stratified_sample;
# every type currently has the same weight. Each key appears exactly once:
# a repeated key in a dict literal is silently overwritten by the later entry.
STRATA_WEIGHTS = {
    'html_publication': 1,
    'research': 1,
    'aaib_report': 1,
    'employment_tribunal_decision': 1,
    'impact_assessment': 1,
    'notice': 1,
}

## Load the content data

In [None]:
# Load the pre-processed content store from a local .csv.gz file.
# NOTE(review): the path is hard-coded — point it at wherever your copy of the
# content store was downloaded.
df = load_preprocessed_content_store(path_to_gz='/tmp/govukmirror/preprocessed_content_store_250522.csv.gz')

### Filter for the relevant document types

These are document types that are likely to contain strings that look like FORMs but are not.

In [None]:
df.columns

In [None]:
# Keep only the rows whose document type is one of the target types.
target_content_df = df.loc[df['document_type'].isin(TARGET_DOCUMENT_TYPES)]

In [None]:
target_content_df.shape

In [None]:
# Free memory: drop the reference to the full content store (only the filtered
# frame is used from here on) and run a garbage-collection pass.
del df
gc.collect()

Filter relevant columns

In [None]:
# Narrow the frame to the columns kept for the rest of the notebook and take
# an explicit copy of the selection.
target_content_df = target_content_df.loc[
    :, ['base_path', 'content_id', 'title', 'description', 'text', 'document_type']
].copy()

In [None]:
target_content_df.head(3)

## Identify strings that look like FORMs

In [None]:
def has_numbers(inputString):
    """Return True if at least one character of *inputString* is a digit.

    "Digit" is whatever ``str.isdigit`` accepts. An empty string gives False.
    """
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

def has_alpha(inputString):
    """Return True if at least one character of *inputString* is alphabetic.

    "Alphabetic" is whatever ``str.isalpha`` accepts. An empty string gives
    False.
    """
    for ch in inputString:
        if ch.isalpha():
            return True
    return False

def has_special(inputString):
    """Return True if *inputString* has a character that is not alphanumeric.

    Anything ``str.isalnum`` rejects counts as special (punctuation,
    whitespace, symbols). An empty string gives False.
    """
    return not all(ch.isalnum() for ch in inputString)
    
def detectFormName(inputString):
    """Return the colon-stripped text if it holds a FORM look-alike token.

    Colons are removed from *inputString* and the result is split on
    whitespace. A token is a look-alike when it contains a digit, a letter and
    a non-alphanumeric character at the same time.

    Returns the colon-stripped string when at least one token qualifies,
    otherwise None. A non-string argument without ``.replace`` raises
    AttributeError.
    """
    cleaned = inputString.replace(':', '')
    for word in cleaned.split():
        if has_numbers(word) and has_alpha(word) and has_special(word):
            return cleaned
    return None

### In titles

In [None]:
# Collect (text, base_path, content_id, document_type) tuples for every title
# that contains a FORM look-alike token.
title_results = []
for doc_type, base_path, cid, title in zip(target_content_df['document_type'],
                                           target_content_df['base_path'],
                                           target_content_df['content_id'],
                                           target_content_df['title']):
    # a title is checked as a whole; it is not split into sentences
    try:
        out = detectFormName(title)
    except AttributeError:
        # title is not a string (presumably a missing value), so
        # detectFormName cannot call .replace() on it: skip this row
        continue
    if out:
        title_results.append((out, base_path, cid, doc_type))


### Extracting sentences containing fake forms

#TODO: refactor into function and improve performance of code

In [None]:
# Crude sentence-boundary heuristic, compiled once rather than on every page:
# sentence-ending punctuation, any closing quotes/brackets, and the spaces
# around them.
sentence_boundary = re.compile(r' *[\.\?!][\'"\)\]]* *')

# Collect (sentence, base_path, content_id, document_type) tuples for every
# sentence of a page body that contains a FORM look-alike token.
results = []
for doc_type, base_path, cid, text in zip(target_content_df['document_type'],
                                          target_content_df['base_path'],
                                          target_content_df['content_id'],
                                          target_content_df['text']):
    try:
        for sent in sentence_boundary.split(text):
            out = detectFormName(sent)
            if out:
                results.append((out, base_path, cid, doc_type))
    except (AttributeError, TypeError):
        # text is not a string (presumably a missing value): skip this page
        continue

In [None]:
len(results)

Join the title results and the sentence results

In [None]:
# Title matches first, followed by the matches found in page bodies.
fake_forms_results = [*title_results, *results]

In [None]:
len(fake_forms_results)

In [None]:
# One row per matching title/sentence; the column order follows the tuples
# collected above.
fake_forms_results_df = pd.DataFrame(
    data=fake_forms_results,
    columns=['text', 'base_path', 'content_id', 'doc_type'],
)

### Random sample stratified by document type

In [None]:
fake_forms_results_df.head()

In [None]:
# Draw the sample with the project helper, using the document type as the
# stratum column.
# NOTE(review): a sample_size of 501 looks arbitrary — confirm it is intended.
fake_forms_results_sample = get_stratified_sample(
    df=fake_forms_results_df,
    strata_col="doc_type",
    weights=STRATA_WEIGHTS,
    sample_size=501,
)

In [None]:
fake_forms_results_sample.shape

In [None]:
# shuffle: sample(frac=1) returns every row, in random order
# NOTE(review): no random_state is passed, so the order differs between runs
fake_forms_results_sample = fake_forms_results_sample.sample(frac=1).copy()
fake_forms_results_sample.head()

### Convert to Prodigy format

A bit of a hack

In [None]:
# One dict per sampled sentence: the sentence under 'text', with its page
# identifiers nested under 'meta'.
collection = [
    {'text': text, 'meta': {'base_path': base_path, 'content_id': cid}}
    for base_path, cid, text in zip(fake_forms_results_sample['base_path'],
                                    fake_forms_results_sample['content_id'],
                                    fake_forms_results_sample['text'])
]
    

## Write to JSON lines

In [None]:
# Write one JSON document per line. ensure_ascii=False emits non-ASCII
# characters verbatim, so the file is opened explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open(OUPUT_FILEPATH, 'w', encoding='utf-8') as fp:
    for item in collection:
        fp.write(json.dumps(item, ensure_ascii=False) + "\n")