In [None]:
from pathlib import Path

import spacy
from tqdm import tqdm

from utils.io import load_json, save_json
from utils.text_processing import get_lemma_word

%cd /home/alessandro/coding/PycharmProjects/task-testbed

nlp = spacy.load("en_core_web_lg")
# Third argument passed to get_lemma_word for templates and pattern examples.
# NOTE(review): presumably "remove stop words" — confirm in utils.text_processing.
REM_SW = False

# Structure name -> template string. The "[X]" placeholder is stripped from the
# template before it is analysed.
structures = load_json(Path("dataset_generation/structures.json"))

test_set = load_json(Path("data/public/puneval/test.json"))

# The lemmas of a test example do not depend on the structure being checked, so
# they are computed once here instead of re-running the spaCy pipeline over the
# whole test set for every structure.
# NOTE(review): this assumes get_lemma_word returns the same result for the same
# (text, nlp, flag) arguments every time it is called.
test_lemmas_by_id = []
for test_example in test_set:
    test_lemmas, _ = get_lemma_word(test_example["text"], nlp, True)
    test_lemmas_by_id.append((test_example["id"], set(test_lemmas)))

for structure, structure_template in tqdm(structures.items()):
    structure_data = load_json(Path(f"data/public/punny_pattern/{structure}.json"))
    structure_data_ids = {e["id"] for e in structure_data}
    assert len(structure_data_ids) == len(structure_data), "Duplicated IDs"

    lemma_set, word_set = get_lemma_word(structure_template.replace("[X]", "").strip(), nlp, REM_SW)
    template_words = set(word_set)
    template_lemmas = set(lemma_set)

    # An example conforms ("good") when it contains every word of the template.
    # The "label" field is not consulted: every entry of the file is checked.
    bad_examples = []
    good_examples_n = 0
    for example in structure_data:
        text = example["text"]
        _, example_words_set = get_lemma_word(text, nlp, REM_SW)
        if template_words.issubset(set(example_words_set)):
            good_examples_n += 1
        else:
            bad_examples.append(text)
    print(f"Good examples for '{structure}': ", good_examples_n)
    print(f"Bad examples for '{structure}': ", len(bad_examples))

    # Keep the non-conforming texts for manual inspection.
    out_path = Path(f"dataset_generation/analysis/bad_{structure}.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    save_json(out_path, bad_examples)

    # Collect the test examples whose lemmas are all contained in the template lemmas.
    # NOTE(review): the subset direction here is the opposite of the word check
    # above (there: template within example; here: example within template), and
    # the template is analysed with REM_SW while test examples use True. The
    # original behaviour is kept as is — confirm that it is intended.
    test_puns_matching_template = {
        test_id
        for test_id, test_lemmas in test_lemmas_by_id
        if test_lemmas.issubset(template_lemmas)
    }

    # Report matching test examples that are absent from this structure's file.
    missing_ids = test_puns_matching_template - structure_data_ids
    if missing_ids:
        print(f"Missing TEST puns NOT in '{structure}' set: {len(missing_ids)}")
        save_json(out_path.parent / f"missing_{structure}.json", list(missing_ids))


In [None]:
# Workbench script to detect near-duplicate sentences. Results are only printed; nothing is written to file.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%cd /home/alessandro/coding/PycharmProjects/task-testbed

def find_duplicate_sentences(sentences, threshold=0.8):
    """Find pairs of near-duplicate sentences via TF-IDF cosine similarity.

    Args:
        sentences: Iterable of strings that are compared against each other.
        threshold: Similarity value a pair must strictly exceed to be reported.

    Returns:
        A set of ``(i, j)`` index pairs with ``i < j`` whose cosine similarity
        is greater than ``threshold``. The set is empty when fewer than two
        sentences are given.
    """
    sentences = list(sentences)
    # Fewer than two sentences cannot form a pair. Returning early also avoids
    # the error raised when the vectorizer is fitted on an empty corpus.
    if len(sentences) < 2:
        return set()

    # fit_transform returns the (sparse) document-term matrix, not a vectorizer.
    # cosine_similarity accepts it directly, so it is not densified first.
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Only the upper triangle (i < j) is scanned, so each pair is reported once
    # and a sentence is never compared with itself.
    n_sentences = len(sentences)
    return {
        (i, j)
        for i in range(n_sentences)
        for j in range(i + 1, n_sentences)
        if cosine_sim[i, j] > threshold
    }


def read_paragraphs_from_file(file_path):
    """Return the non-blank lines of a UTF-8 text file, in file order.

    Each line has its leading and trailing whitespace removed; lines that are
    empty after stripping are left out.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        # The generator reads from the open handle, so consume it before leaving the with-block.
        return [content for content in stripped_lines if content]


from pathlib import Path
from utils.io import load_json

# Workbench run: look for near-duplicate entries in the "tom" pattern file
file_path = Path("data/public/punny_pattern/tom.json")
paragraphs = load_json(file_path)

# Only the "text" field of each entry takes part in the comparison
all_examples = [entry["text"] for entry in paragraphs]

duplicates = find_duplicate_sentences(all_examples, threshold=0.6)

# Report every flagged pair, showing the complete entries rather than just their text
for i, j in duplicates:
    print(f"Duplicate sentences found: \n1: {paragraphs[i]} \n2: {paragraphs[j]}\n")

In [None]:
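# WARNING: THIS WILL REMOVE PUNS, so handle with care.
# This cell overwrites every data/public/punny_pattern/<structure>.json in place,
# keeping only the examples that contain all the words of the structure template.
# Examples that fail the check are dropped from the file and are not saved anywhere else.
# Back up (or commit) the data before running it.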

from utils.io import load_json, save_json
from pathlib import Path
import spacy
from utils.text_processing import get_lemma_word
from tqdm import tqdm

%cd /home/alessandro/coding/PycharmProjects/task-testbed

nlp = spacy.load("en_core_web_lg")
REM_SW = False

# Structure name -> template string
structures = load_json(Path("dataset_generation/structures.json"))

for structure, structure_template in tqdm(structures.items()):
    data_path = Path(f"data/public/punny_pattern/{structure}.json")
    structure_data = load_json(data_path)

    # Analyse the template with its "[X]" placeholder removed
    template_text = structure_template.replace("[X]", "").strip()
    lemma_set, word_set = get_lemma_word(template_text, nlp, REM_SW)
    required_words = set(word_set)

    # Partition the entries: full entries are kept when they contain every
    # template word, otherwise only their text is remembered for the count.
    good_examples, bad_examples = [], []
    for entry in structure_data:
        entry_text = entry["text"]
        _entry_lemmas, entry_words = get_lemma_word(entry_text, nlp, REM_SW)
        if required_words.issubset(set(entry_words)):
            good_examples.append(entry)
        else:
            bad_examples.append(entry_text)
    print(f"Good examples for '{structure}': ", len(good_examples))
    print(f"Bad examples for '{structure}': ", len(bad_examples))

    # Overwrite the source file with the conforming entries only
    save_json(data_path, good_examples)