In [None]:
import json
import os

import jsonlines
import pandas as pd
import spacy
from dotenv import load_dotenv  # pip install python-dotenv
from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks

# Make sure a .env file exists in the same directory, with a line like this:
# KG_PWD=<insert password here>
# NOTE(review): the cells below read DIR_SRC_STRATA from the environment, so
# that variable must be defined too (in the .env file or in the shell).
load_dotenv()  # load the variables from the .env file into the environment
pd.set_option('display.max_columns', None)  # never truncate columns when displaying a DataFrame
tqdm.pandas()  # register `progress_apply` on pandas objects (used further down)

In [None]:
# Directory that holds the stratified sample (the sample CSV is read from its
# `data/` sub-directory further down). Index os.environ with [] rather than
# .get() so a missing variable fails right here with a KeyError instead of
# silently becoming None -- os.listdir(None) would quietly list the current
# working directory and hide the misconfiguration.
DIR_SRC_STRATA = os.environ['DIR_SRC_STRATA']
os.listdir(DIR_SRC_STRATA)

In [None]:
# List the local mirror directory to see which files are available.
os.listdir('/tmp/govukmirror/')

In [None]:
# Read the preprocessed content store dump: gzip-compressed, tab-separated,
# with the column names on the first row.
df = pd.read_csv(
    '/tmp/govukmirror/preprocessed_content_store_250522.csv.gz',
    sep="\t",
    header=0,
    compression='gzip',
)

In [None]:
# Sanity-check the load by looking at the last few rows.
df.tail()

In [None]:
# Re-read the sample location so this cell can be run on its own. Indexing
# os.environ (instead of .get()) raises a clear KeyError when the variable is
# missing, rather than an opaque TypeError from os.path.join(None, ...).
DIR_SRC_STRATA = os.environ['DIR_SRC_STRATA']

# base_path of every page in the stratified random sample
sample_csv_path = os.path.join(DIR_SRC_STRATA, 'data/schemas_stratified_random_sample.csv')
base_paths_df = pd.read_csv(sample_csv_path)
base_paths_list = base_paths_df['base_path'].tolist()
len(base_paths_list)

In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
pp_contentstore = df.copy()

In [None]:
# Keep only the pages whose base_path is part of the sample.
in_sample = pp_contentstore['base_path'].isin(base_paths_list)
pp_contentstore_filt = pp_contentstore[in_sample]

In [None]:
# (rows, columns) left after filtering to the sample
pp_contentstore_filt.shape

In [None]:
# Display the full (unfiltered) content store.
pp_contentstore

In [None]:
# Print the path and full text of every sampled page, each followed by a row
# of asterisks as a separator.
separator = "*" * 20
for base_path, text in zip(pp_contentstore_filt['base_path'], pp_contentstore_filt['text']):
    print(base_path)
    print(text)
    print(separator)

## 3. Split into sentences

In [None]:
# Load the spaCy English pipeline once; text_to_sents() below reuses it for every document.
nlp = spacy.load("en_core_web_md")
def text_to_sents(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Returns a list with the text of each sentence, in document order.
    Raises AssertionError if the parsed document carries no sentence-start
    annotation.
    """
    parsed = nlp(text)
    assert parsed.has_annotation("SENT_START")
    return [sentence.text for sentence in parsed.sents]

In [None]:
# Keep only the columns needed downstream. The explicit .copy() makes this an
# independent frame, so the column assignments in the following cells modify
# it directly instead of raising pandas' SettingWithCopyWarning for writing
# into a slice of pp_contentstore_filt.
pp_contentstore_trim = pp_contentstore_filt[['base_path', 'content_id', 'text']].copy()

In [None]:
# Coerce every text value to str before sentence splitting.
pp_contentstore_trim['text'] = pp_contentstore_trim['text'].progress_apply(str)

In [None]:
# Sentence-split every document; the new column holds one list of sentence
# strings per row.
pp_contentstore_trim['sentences'] = pp_contentstore_trim['text'].progress_apply(text_to_sents)

In [None]:
# Check the first rows, now including the `sentences` column.
pp_contentstore_trim.head()

In [None]:
# Show the sentence lists of the first two documents.
for sentences in pp_contentstore_trim.head(2)['sentences']:
    print(sentences)
    print('*' * 20)

## 4. Get into Prodigy format

In [None]:
# Target record shape for the Prodigy input (illustrative values): one record
# per sentence, with the source page's identifiers carried under "meta".
[
  {"text": "This is a sentence", "meta": {"base_path": "/some/path", "content_id": 1234}},
  {"text": "This is another sentence", "meta": {"base_path": "/some/other/path", "content_id": 5678}},
]

In [None]:
# dict_lines = []
# for i, row in tqdm(pp_contentstore_trim.iterrows()):
#     base_path = row['base_path']
#     c_id = row['content_id']
#     for sentence in row['sentences']:
#         dict_line = {"text": sentence, "meta": {"base_path": base_path, "content_id": c_id}}
#         dict_lines.append(dict_line)
# with open('file.jsonl', 'w') as jsonlfile:
#     jsonlfile.write('\n'.join(json.dumps(i) for i in dict_lines))

def sentences_to_jsonl(dataframe, sentence_col, meta_cols, outfile):
    """Write one JSON line per sentence in the ``{"text": ..., "meta": {...}}`` shape.

    Args:
        dataframe: DataFrame with one row per document.
        sentence_col: name of the column holding an iterable of sentences
            for each row.
        meta_cols: names of the columns copied into each record's ``meta``
            dict.
        outfile: path of the JSONL file to write (opened in ``'w'`` mode, so
            an existing file is overwritten).

    Returns:
        None. The records are only written to ``outfile``.
    """
    records = []
    # iterrows() is a generator without a length, so pass the row count
    # explicitly; otherwise tqdm cannot show a percentage or an ETA.
    for _, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        for sentence in row[sentence_col]:
            # A fresh meta dict per record, keyed by column name.
            meta = {col: row[col] for col in meta_cols}
            records.append({"text": sentence, "meta": meta})
    with open(outfile, 'w') as jsonlfile:
        # Lines are newline-separated, with no trailing newline after the last one.
        jsonlfile.write('\n'.join(json.dumps(record) for record in records))

In [None]:
# Export the sentence-split sample; every record keeps its page's base_path
# and content_id as metadata.
sentences_to_jsonl(
    dataframe=pp_contentstore_trim,
    sentence_col='sentences',
    meta_cols=['base_path', 'content_id'],
    outfile='func.jsonl',
)

In [None]:
# `dict_lines` only exists inside sentences_to_jsonl(), so referencing it here
# raises NameError on a fresh kernel. Preview the first five records by
# reading back the file written by the previous cell instead.
with open('func.jsonl') as jsonlfile:
    # zip() with range(5) stops after five lines without reading the whole file.
    preview_records = [json.loads(line) for _, line in zip(range(5), jsonlfile)]
preview_records