In [None]:
import json
import os

import jsonlines
import pandas as pd
import spacy
from dotenv import load_dotenv  # pip install python-dotenv
from tqdm import tqdm
from tqdm.auto import tqdm  # for notebooks

# Load environment variables from a .env file (python-dotenv).
# Make sure a .env file exists in the same directory, with a line like this:
# KG_PWD=<insert password here>
# NOTE(review): cells in this notebook read DIR_DATA_INTERIM from os.environ,
# so that variable must be defined too (in .env or the shell). KG_PWD is not
# read by any cell visible here — confirm it is still required.
load_dotenv()
# Do not truncate columns when displaying DataFrames.
pd.set_option("display.max_columns", None)
# Register tqdm with pandas so that `.progress_apply` is available on Series/DataFrames.
tqdm.pandas()

In [None]:
from src.make_data import make_data

In [None]:
# Load the preprocessed content store from a gzipped CSV.
# NOTE(review): the path is a hard-coded absolute location under /tmp, so this
# cell only works on a machine where that file has already been placed there.
df = make_data.load_preprocessed_content_store(
    path_to_gz="/tmp/govukmirror/preprocessed_content_store_250522.csv.gz"
)

In [None]:
# Reduce the frame to the columns used in the rest of the notebook.
# NOTE(review): presumably trim_dataframe keeps only the listed columns — confirm in src.make_data.
df_trim = make_data.trim_dataframe(df, columns=["base_path", "content_id", "text"])

In [None]:
# Drop every row that has a missing value in any column.
df_trim = df_trim.dropna()

In [None]:
# Keep the rows whose text mentions any of the search terms. The terms are
# joined with "|" into a single pattern; pandas' str.contains treats it as a
# case-sensitive regular expression by default, so e.g. "Rebate" is not matched.
searchfor = ["rebate", "refund", "allowance"]
df_trim_find = df_trim[df_trim["text"].str.contains("|".join(searchfor))]

In [None]:
# Show (rows, columns) of the keyword-filtered frame.
df_trim_find.shape

In [None]:
# Inspect the text of the first keyword-filtered row.
df_trim_find.iloc[0]["text"]

In [None]:
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher

# Blank English pipeline with an entity ruler; the ruler's patterns are loaded
# from a JSONL file in the interim data directory.
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# Index os.environ directly: if DIR_DATA_INTERIM is unset this raises a
# KeyError naming the variable, instead of the opaque TypeError that
# os.path.join(None, ...) would produce with os.environ.get.
ruler.from_disk(os.path.join(os.environ["DIR_DATA_INTERIM"], "col-patterns.jsonl"))

In [None]:
# Sanity check: run the pipeline on a sample sentence and print the
# (text, label) pair of every entity it found.
doc1 = nlp("You could be due a rebate if...")
print([(ent.text, ent.label_) for ent in doc1.ents])

In [None]:
def find_pattern_match(text):
    """Return the text of every entity the `nlp` pipeline finds in `text`.

    Args:
        text: String to run through the module-level `nlp` pipeline.

    Returns:
        A list with one string per entity, in the order of `doc.ents`;
        empty when the pipeline found no entities.
    """
    matches = []
    for entity in nlp(text).ents:
        matches.append(entity.text)
    return matches

In [None]:
def contains_pattern_match(text):
    """Report whether the `nlp` pipeline finds at least one entity in `text`.

    Args:
        text: String to run through the module-level `nlp` pipeline.

    Returns:
        True if `doc.ents` is non-empty, otherwise False.
    """
    return bool(nlp(text).ents)

In [None]:
# Quick manual check of contains_pattern_match on a short sentence.
contains_pattern_match("no rebate in sentence")

In [None]:
# Flag, for a random sample of up to 10,000 rows, whether the text contains at
# least one pattern match.
# - n is capped at the frame size so the cell does not raise ValueError when
#   fewer than 10,000 rows are available;
# - random_state is fixed so re-running the notebook draws the same sample.
df_trim_samp = df_trim.sample(n=min(10000, len(df_trim)), random_state=42)
# Pass the function itself; wrapping it in a lambda adds nothing.
df_trim_samp["col_vocab"] = df_trim_samp["text"].progress_apply(contains_pattern_match)

In [None]:
# Count how many sampled rows were flagged True vs. False.
df_trim_samp["col_vocab"].value_counts()

In [None]:
# Keep only the sampled rows whose "col_vocab" flag is True.
col_texts = df_trim_samp.loc[df_trim_samp["col_vocab"].eq(True)]

In [None]:
# NOTE(review): presumably splits the "text" column into sentences and stores
# them in a new "sentences" column — confirm in src.make_data.
col_texts = make_data.text_col_to_sents(col_texts, "text", "sentences")

In [None]:
# Write the sentences, together with their base_path / content_id metadata,
# to a JSONL file in the interim data directory.
# os.environ is indexed directly so that an unset DIR_DATA_INTERIM raises a
# KeyError naming the variable, instead of the opaque TypeError that
# os.path.join(None, ...) would produce with os.environ.get.
make_data.sentences_to_jsonl(
    col_texts,
    sentence_col="sentences",
    meta_cols=["base_path", "content_id"],
    outfile=os.path.join(os.environ["DIR_DATA_INTERIM"], "col-sentences.jsonl"),
)