In [None]:
pip install openai

In [None]:
pip install pandas

In [None]:
%pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
pip install ftfy

In [None]:
# OpenAI CALL
def ask_openai(prompt, model, api_key, timeout=120):
    """Send a single-turn chat request to the OpenAI Chat Completions endpoint.

    Args:
        prompt: Text sent as the user message.
        model: Model name placed in the request payload (e.g. "gpt-4").
        api_key: Secret key sent as a Bearer token in the Authorization header.
        timeout: Seconds to wait for the server before giving up. It is passed
            straight to ``requests.post``; without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The content of the first choice's message when the server answers
        with HTTP 200. On failure (non-200 status, connection problem or
        timeout) a string starting with ``"Error: "`` is returned instead of
        raising, so a batch loop can keep going and inspect the message.
    """
    url = 'https://api.openai.com/v1/chat/completions'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }
    payload = {
        'model': model,
        'messages': [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': prompt}
        ]
    }

    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
    except requests.RequestException as exc:
        # Same "Error: ..." convention as the HTTP-status branch below, so
        # callers only have one failure shape to check for.
        return f"Error: request failed - {exc}"

    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    return f"Error: {response.status_code} - {response.text}"

In [None]:
from pprint import pprint
# Modified main function
def extract_passage_triples(text, api_key, model="gpt-4"):
    """Ask the model for [subject, predicate, object] triples of a whole passage.

    Args:
        text: The passage to analyse; it is embedded verbatim in the prompt.
        api_key: OpenAI API key forwarded to ``ask_openai``.
        model: Chat model name forwarded to ``ask_openai``.

    Returns:
        A list with one dict on success, holding the passage under
        ``"sentence"`` and the stripped model answer under both ``"triple"``
        and ``"triples"``. An empty list is returned when ``ask_openai``
        reports a failure (its answer starts with ``"Error:"``), so callers
        can tell a failed request from real triples.
    """

    # Prompt for the entire passage
    prompt = f"""You are an expert in natural language processing and knowledge extraction.
    Your task is to extract concise and meaningful ["subject", "predicate", "object"] triples from the following passage.

    Instructions:
    - Each triple must represent a core idea using only key terms (avoid function words or excessive detail).
    - Format: [Subject, Predicate, Object].
    - Use concise keywords or noun phrases for the subject and object.
    - Use the main verb or verbal phrase as the predicate.
    - Do not repeat triples that convey the same meaning.
    - Do not add explanations or extra text.
    - Enclose locutions (titles, citations, etc.) in quotation marks (" ") since they are important concepts.

    Extract the triples from this passage: {text}"""

    answer = ask_openai(prompt, model, api_key)

    # ask_openai signals failures with an "Error: ..." string rather than an
    # exception; do not pass that message on as if it were extracted triples.
    if answer.startswith("Error:"):
        return []

    triples_text = answer.strip()
    return [{
        "sentence": text,
        # The rest of the notebook reads the "triple" key; "triples" is kept
        # so any code written against the earlier key keeps working.
        "triple": triples_text,
        "triples": triples_text,
    }]


In [None]:
import json
import os
import random

import pandas as pd
import requests
import spacy
from ftfy import fix_text

# Load spaCy's small English pipeline once; extract_meaningful_words below
# calls this module-level `nlp` object. The en_core_web_sm model must already
# be installed (see the `python -m spacy download en_core_web_sm` cell).
nlp = spacy.load("en_core_web_sm")

# ======= Extract meaningful words ======= #
def extract_meaningful_words(results, original_text):
    """Collect the content words of a passage from its entities and triples.

    Args:
        results: List of dicts as produced by ``extract_passage_triples``.
            The triple text is read from the ``"triple"`` key, falling back
            to ``"triples"``; entries with neither contribute nothing.
        original_text: The passage the triples were extracted from.

    Returns:
        A tuple ``(extracted_words, extracted_count, total_words, percentage)``:
        the sorted list of unique lower-cased words, its length, the number
        of non-punctuation / non-space tokens in ``original_text``, and
        ``extracted_count / total_words * 100`` (0 when the text has no words).
    """
    # POS tags accepted for words coming from the triples.
    content_pos = {"NOUN", "PRON", "VERB", "ADJ", "ADV", "PROPN"}

    def is_content(token):
        # Stop words, punctuation and whitespace are never "meaningful".
        return not (token.is_stop or token.is_punct or token.is_space)

    doc_original = nlp(original_text)

    # Count real words in the text (exclude punctuation, spaces)
    total_words = sum(
        1 for token in doc_original
        if not token.is_punct and not token.is_space
    )

    final_words = set()

    # Entities: walk the tokens of each entity span directly instead of
    # running the whole pipeline again on the entity's text.
    for ent in doc_original.ents:
        for token in ent:
            if is_content(token):
                final_words.add(token.text.lower())  # surface form, not lemma_

    # Words from triples (only NOUN, PRON, VERB, ADJ, ADV, PROPN)
    for r in results:
        triple_text = r.get("triple") or r.get("triples") or ""
        for token in nlp(triple_text):
            if token.pos_ in content_pos and is_content(token):
                final_words.add(token.text.lower())

    # Sorted so the output order does not depend on set/hash ordering.
    extracted_words = sorted(final_words)
    extracted_count = len(extracted_words)
    percentage = (extracted_count / total_words * 100) if total_words > 0 else 0

    return extracted_words, extracted_count, total_words, percentage


# Load JSON Lines data: one JSON object per line, blank lines skipped
with open("clapnqans.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Use API_KEY when an earlier cell defined it; otherwise read the key from the
# environment so the notebook runs on a fresh kernel without a hardcoded secret.
try:
    api_key = API_KEY
except NameError:
    api_key = os.environ["OPENAI_API_KEY"]

# Upper bound on extracted words, as a percentage of the words in the passage.
MAX_WORD_PERCENTAGE = 45
# Dedicated seeded generator so the down-sampling below is reproducible
# without touching the global `random` state.
rng = random.Random(42)

final_results = []
for item in data:
    passage_id = item["id"]
    passage_text = item["passages"][0]["text"]
    text = fix_text(passage_text)

    # One OpenAI request per passage.
    results = extract_passage_triples(text, api_key)

    words, n_words, n_text, percentage = extract_meaningful_words(results, text)

    if percentage > MAX_WORD_PERCENTAGE:
        # Too many words: keep a random subset that respects the cap.
        max_allowed_words = int(n_text * MAX_WORD_PERCENTAGE / 100)
        if len(words) > max_allowed_words:
            # Sort first so the sample depends only on the seed, not on the
            # incoming list order.
            words = rng.sample(sorted(words), max_allowed_words)
        n_words = len(words)
        percentage = (n_words / n_text * 100) if n_text > 0 else 0

    # The triple text may be stored under "triple" or "triples"; an empty
    # result list (or neither key present) is recorded as "FAILED".
    first_result = results[0] if results else {}
    triple_text = first_result.get("triple") or first_result.get("triples") or "FAILED"

    final_results.append({
        "id": passage_id,
        "text": text,
        "triple": triple_text,
        "words": words,
        "n_words": n_words,
        "n_text": n_text,
        "percentage": round(percentage, 2)
    })

# ======= Save to CSV ======= #
# Build a DataFrame from the final_results dicts (their keys become the
# columns) and write it to words_to_modify.csv without the index column.
# NOTE(review): the "words" column holds Python lists; presumably they end up
# in the CSV as their string representation — confirm this suits the consumer.
df = pd.DataFrame(final_results)
df.to_csv("words_to_modify.csv", index=False, encoding="utf-8")
