# Natural Language Processing

### Project: Detection of Negation and Uncertainty

First look at the dataset structure:

In [3]:

import json

# Read the complete training set into memory.
with open("negacio_train_v2024.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# The data set is too large to display in full, so pretty-print only the
# first 5 records (ensure_ascii=False keeps accented characters readable).
for record in data[:5]:
    pretty_record = json.dumps(record, indent=4, ensure_ascii=False)
    print(pretty_record)


{
    "data": {
        "cmbd": "null",
        "id": "19026587",
        "docid": "null",
        "page": "null",
        "paragraph": "null",
        "text": " nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se rea

In [18]:
# Run only once: install spaCy and download the Spanish and Catalan models
#!pip install spacy
#!python -m spacy download es_core_news_md
#!python -m spacy download ca_core_news_md


Collecting es-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl (42.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: es-core-news-md
Successfully installed es-core-news-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')
Collecting ca-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.8.0/ca_core_news_md-3.8.0-py3-none-any.whl (49.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ca-core-news-md
Successfully installed ca-core-news-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
Y

In [19]:
import spacy
import nltk
import re
import unicodedata
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode

# Run only once: download the NLTK 'punkt' tokenizer data
#nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rimeslaoui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Load spaCy models for Spanish and Catalan, downloading a model only when it is missing.
def _load_spacy_model(name):
    """Return the spaCy pipeline `name`, installing it first if it is not available.

    Args:
        name: spaCy model package name, e.g. "es_core_news_md".

    Returns:
        The loaded spaCy pipeline.

    Raises:
        subprocess.CalledProcessError: If the download command fails.
        OSError: If the model still cannot be loaded after the download.
    """
    try:
        return spacy.load(name)
    except OSError:
        # spacy.load signals a missing model package with OSError; catching only
        # that keeps unrelated failures (typos, KeyboardInterrupt, ...) visible.
        import importlib
        import subprocess
        import sys

        # Run the download with the kernel's own interpreter so the model is
        # installed into the active environment; check=True surfaces a failed
        # download instead of silently continuing.
        subprocess.run([sys.executable, "-m", "spacy", "download", name], check=True)
        # Make the freshly installed package visible to the running process.
        importlib.invalidate_caches()
        return spacy.load(name)


nlp_es = _load_spacy_model("es_core_news_md")  # Spanish model
nlp_ca = _load_spacy_model("ca_core_news_md")  # Catalan model

**Preprocessing text**


In [26]:
from spacy.lang.es.stop_words import STOP_WORDS as stopwords_es
from spacy.lang.ca.stop_words import STOP_WORDS as stopwords_ca

def preprocess_text(text, lang="es"):
    """
    Lower-case, strip accents, tokenize and remove stop words from `text`.

    Args:
        text: Raw text to clean.
        lang: "es" selects the Spanish pipeline and stop words; any other
            value selects the Catalan ones.

    Returns:
        List of clean tokens with negation words preserved.
    """
    text = text.lower()  # Normalize case
    text = unidecode(text)  # Remove accents

    if lang == "es":
        nlp = nlp_es
        stopwords = stopwords_es
    else:
        nlp = nlp_ca
        stopwords = stopwords_ca

    # The text no longer carries accents, so the lookup tables have to be
    # normalized the same way; otherwise their accented entries can never
    # match a token and accented stop words would slip through.
    stopwords_plain = {unidecode(word.lower()) for word in stopwords}
    negation_plain = {unidecode(word.lower()) for word in NEGATION_WORDS}

    doc = nlp(text)

    # Keep alphabetic tokens that are either content words or negation cues
    # (Spanish & Catalan). Only single-token cues can be protected here.
    tokens = [
        token.text for token in doc
        if token.is_alpha
        and (token.text not in stopwords_plain or token.text in negation_plain)
    ]

    return tokens



**Detecting negation and uncertainty, rule-based system**

In [35]:
# Negation cue words and phrases (Spanish and Catalan) used by the rule-based
# detectors. Every entry appears exactly once so a cue is never reported twice.
NEGATION_WORDS = [
    # Common negation words
    "no", "sin", "ausencia de", "descarta", "descartado", "excluye", "excluido", "niega", "negado",
    "negativa", "negación", "ningún", "ninguna", "ninguno", "imposible", "inhallable", "carece de",
    
    # Medical-specific negation in Spanish
    "sin evidencia de", "no se observa", "no presenta", "no muestra", "no evidencia", "no compatible con",
    "no concluyente", "no parece", "no se detecta", "sin signos de", "sin síntomas de", "sin indicios de",
    "sin hallazgos de", "sin pruebas de", "sin rastro de", "ausente", "no encontrado", "sin cambios",
    "no se aprecian", "no se ven", "descartando", "descartable", "no hay evidencia de", "no hay indicación de",
    "libre de", "exento de", "sin manifestaciones de", "se excluye", "queda descartado", "ninguna evidencia de",
    "ningún signo de", "sin afección", "no identificado", "negado por el paciente", "negado clínicamente",
    "sin enfermedad", "sin afectación", "no afectado", "no positivo", "resultado negativo",
    "resultado no reactivo", "resultado no positivo",
    
    # Medical-specific negation in Catalan
    # ("no presenta" is shared with Spanish and already listed above)
    "sense", "no es detecta", "no es veu", "no hi ha", "sense indicis de", "sense evidència de",
    "sense senyals de", "sense rastre de", "sense afectació", "sense afecció", "no concloent", "sense canvis",
    "sense resultats", "sense manifestacions de", "no s'observa", "no s'aprecia", "sense presència de",
    "no compatible amb", "no és visible", "sense símptomes", "no diagnosticat", "sense senyals clars",
    "diagnòstic negatiu"
]

# Uncertainty cue words and phrases (Spanish and Catalan) used by the rule-based
# detectors. Every entry appears exactly once so a cue is never reported twice.
# NOTE(review): "no concluyente" / "no concloent" are also listed in
# NEGATION_WORDS, so text containing them is reported by both detectors.
UNCERTAINTY_WORDS = [
    # Common uncertainty words
    "posible", "quizás", "podría", "sospecha de", "considera", "probable", "aparentemente", "puede", "posiblemente",
    "parece", "se considera", "indeterminado", "probabilidad de", "no concluyente", "eventual", "en estudio",
    "pendiente de evaluación", "sugestivo de", "sugiere", "indica que", "se sospecha de", "podría indicar",
    "dudoso", "no definido", "no específico", "no determinado", "valor incierto", "no claro", "no seguro",
    "compatible con", "aparenta ser", "tendría que evaluarse", "a determinar", "probabilidad baja de",
    "probabilidad alta de", "sin certeza", "hipotético", "hipotéticamente", "a confirmar", "falta de certeza",
    "en posible relación con", "estaría asociado", "aparentemente relacionado con", "se intuye", "se deduce que",
    "en consideración",
    
    # Medical uncertainty in Catalan
    # ("probable", "eventual" and "indica que" are shared with Spanish and already listed above)
    "possible", "potser", "podria", "sospita de", "es considera", "aparentment", "pot ser",
    "possiblement", "sembla", "es sospita de", "és indeterminat", "probabilitat de", "no concloent",
    "en estudi", "pendent d'avaluació", "suggerent de", "suggerix", "dubtós", "no definit",
    "no específic", "no determinat", "valor incert", "no clar", "no segur", "aparentment relacionat amb",
    "es dedueix que", "en consideració"
]


In [36]:
def _find_cues(text, cues, lang):
    """Return the entries of `cues` found as whole words/phrases in the cleaned text.

    The text is cleaned with preprocess_text and its tokens are re-joined with
    single spaces before matching. Each cue is reported at most once, in the
    order in which it appears in `cues`, and in its original spelling.

    Known limitation: matching happens after preprocess_text has removed stop
    words and non-alphabetic tokens, so a multi-word cue can only match when
    all of its words survive that cleaning.
    """
    text_clean = " ".join(preprocess_text(text, lang))

    found = []
    seen = set()
    for cue in cues:
        if cue in seen:
            continue  # A cue listed twice must not be reported twice
        seen.add(cue)
        # preprocess_text lower-cases the text and strips accents, so the cue
        # is normalized the same way; re.escape keeps it a literal pattern.
        pattern = rf"\b{re.escape(unidecode(cue.lower()))}\b"
        if re.search(pattern, text_clean):
            found.append(cue)
    return found


def detect_negation(text, lang="es"):
    """Return the NEGATION_WORDS cues present in `text`.

    Args:
        text: Raw text to analyse.
        lang: Language code forwarded to preprocess_text ("es" or Catalan otherwise).

    Returns:
        List of matching cues, without duplicates, in NEGATION_WORDS order.
    """
    return _find_cues(text, NEGATION_WORDS, lang)


def detect_uncertainty(text, lang="es"):
    """Return the UNCERTAINTY_WORDS cues present in `text`.

    Args:
        text: Raw text to analyse.
        lang: Language code forwarded to preprocess_text ("es" or Catalan otherwise).

    Returns:
        List of matching cues, without duplicates, in UNCERTAINTY_WORDS order.
    """
    return _find_cues(text, UNCERTAINTY_WORDS, lang)


**Let's test all this on a part of the dataset (because it is too large to process in full)**

In [37]:
import json

# load dataset
with open("negacio_train_v2024.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# process the dataset and store results
processed_data = []
for record in data[:5]:  # Process the first 5 records for testing (adjust the number as needed)
    text = record.get("data", {}).get("text", "")  # Extract medical text

    # Guess the language by counting stop-word hits for each language.
    # (A plain `"es" in text` substring test is true for almost any text and
    # would never select Catalan.) Ties fall back to Spanish.
    words = re.findall(r"[^\W\d_]+", text.lower())
    es_hits = sum(word in stopwords_es for word in words)
    ca_hits = sum(word in stopwords_ca for word in words)
    lang = "es" if es_hits >= ca_hits else "ca"

    # negation and uncertainty detection
    negations_detected = detect_negation(text, lang)
    uncertainties_detected = detect_uncertainty(text, lang)

    # Extract ground truth annotations (if available)
    ground_truth_negations = []
    ground_truth_uncertainties = []

    for prediction in record.get("predictions", []):
        for result in prediction.get("result", []):
            value = result.get("value", {})
            labels = value.get("labels") or []
            if not labels or "start" not in value or "end" not in value:
                continue  # Skip annotations that lack a label or a character span

            label = labels[0]  # Example: "NEG" or "UNC"
            annotated_text = text[value["start"]:value["end"]]

            if label == "NEG":
                ground_truth_negations.append(annotated_text)
            elif label == "UNC":
                ground_truth_uncertainties.append(annotated_text)

    # Store results
    processed_data.append({
        "original_text": text,  # No truncation, full text is stored
        "negations_detected": negations_detected,
        "uncertainties_detected": uncertainties_detected,
        "ground_truth_negations": ground_truth_negations,
        "ground_truth_uncertainties": ground_truth_uncertainties
    })

# Print processed results
for entry in processed_data[:10]:  # Show first 10 processed results
    print(json.dumps(entry, indent=4, ensure_ascii=False))


{
    "original_text": " nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se realiza cistoscopia que es negativa para lesiones malignas pero se objetiva estenosis de uretra . se intentan dilataciones progresivas en el

It seems that we are only detecting the negation words themselves. Let's try to find which words they are linked with, to get the full meaning:
* Use dependency parsing → To detect negation scope (not just words).
* Handle multi-word expressions → "no está descartada" should be captured fully.


In [38]:
def _find_cue_scopes(text, cues, lang):
    """Return one scope phrase per token of `text` that matches an entry of `cues`.

    The text is parsed with the Spanish pipeline when `lang` is "es" and with
    the Catalan one otherwise. For every matching token the phrase consists of
    the cue token followed by the other tokens in the subtree of its syntactic
    head, skipping punctuation ("punct") and coordinating conjunctions ("cc").
    Duplicate phrases are dropped while keeping first-seen order.

    Only single-token cues can match, because the comparison is done per token.
    """
    doc = nlp_es(text) if lang == "es" else nlp_ca(text)

    # Compare case- and accent-insensitively so that accented cues still match
    # text that was written without accents.
    cue_keys = {unidecode(cue.lower()) for cue in cues}

    scopes = []
    for token in doc:
        if unidecode(token.text.lower()) not in cue_keys:
            continue

        # Start with the cue, then add the rest of its head's subtree. The cue
        # is excluded by position rather than by text, so a capitalised cue is
        # not repeated and other occurrences of the same word are kept.
        scope = [token.text]
        scope.extend(
            other.text
            for other in token.head.subtree
            if other.i != token.i and other.dep_ not in ("punct", "cc")
        )
        scopes.append(" ".join(scope))

    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order.
    return list(dict.fromkeys(scopes))


def detect_negation_improved(text, lang="es"):
    """Improved negation detection using dependency parsing.

    Args:
        text: Raw text to analyse.
        lang: "es" for Spanish; any other value selects the Catalan pipeline.

    Returns:
        List of unique "cue + scope" phrases for the cues in NEGATION_WORDS.
    """
    return _find_cue_scopes(text, NEGATION_WORDS, lang)


def detect_uncertainty_improved(text, lang="es"):
    """Improved uncertainty detection using dependency parsing.

    Args:
        text: Raw text to analyse.
        lang: "es" for Spanish; any other value selects the Catalan pipeline.

    Returns:
        List of unique "cue + scope" phrases for the cues in UNCERTAINTY_WORDS.
    """
    return _find_cue_scopes(text, UNCERTAINTY_WORDS, lang)


In [39]:
import json

# Load dataset
with open("negacio_train_v2024.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Process the dataset and store results
processed_data_improved = []
for record in data[:5]:  # Process the first 5 records for testing (adjustable number)
    text = record.get("data", {}).get("text", "")  # Extract medical text

    # Guess the language by counting stop-word hits for each language.
    # (A plain `"es" in text` substring test is true for almost any text and
    # would never select Catalan.) Ties fall back to Spanish.
    words = re.findall(r"[^\W\d_]+", text.lower())
    es_hits = sum(word in stopwords_es for word in words)
    ca_hits = sum(word in stopwords_ca for word in words)
    lang = "es" if es_hits >= ca_hits else "ca"

    # Improved negation and uncertainty detection
    negations_detected_improved = detect_negation_improved(text, lang)
    uncertainties_detected_improved = detect_uncertainty_improved(text, lang)

    # Extract ground truth annotations (if available)
    ground_truth_negations = []
    ground_truth_uncertainties = []

    for prediction in record.get("predictions", []):
        for result in prediction.get("result", []):
            value = result.get("value", {})
            labels = value.get("labels") or []
            if not labels or "start" not in value or "end" not in value:
                continue  # Skip annotations that lack a label or a character span

            label = labels[0]  # Example: "NEG" or "UNC"
            annotated_text = text[value["start"]:value["end"]]

            if label == "NEG":
                ground_truth_negations.append(annotated_text)
            elif label == "UNC":
                ground_truth_uncertainties.append(annotated_text)

    # Store results
    processed_data_improved.append({
        "original_text": text,  # No truncation, full text is stored
        "negations_detected_improved": negations_detected_improved,
        "uncertainties_detected_improved": uncertainties_detected_improved,
        "ground_truth_negations": ground_truth_negations,
        "ground_truth_uncertainties": ground_truth_uncertainties
    })

# Print processed results
for entry in processed_data_improved[:10]:  # Show first 10 processed results
    print(json.dumps(entry, indent=4, ensure_ascii=False))


{
    "original_text": " nº historia clinica: ** *** *** nºepisodi: ******** sexe: home data de naixement: 16.05.1936 edat: 82 anys procedencia cex mateix hosp servei urologia data d'ingres 24.07.2018 data d'alta 25.07.2018 08:54:04 ates per ***************, *****; ****************, ****** informe d'alta d'hospitalitzacio motiu d'ingres paciente que ingresa de forma programada para realizacion de uretrotomia interna . antecedents alergia a penicilina y cloramfenicol . no habitos toxicos. antecedentes medicos: bloqueo auriculoventricular de primer grado hipertension arterial. diverticulosis extensa insuficiencia renal cronica colelitiasis antecedentes quirurgicos: exeresis de lesiones cutaneas con anestesia local protesis total de cadera cordectomia herniorrafia inguinal proces actual varon de 81a que a raiz de episodio de hematuria macroscopica se realiza cistoscopia que es negativa para lesiones malignas pero se objetiva estenosis de uretra . se intentan dilataciones progresivas en el

Great! We can see that negation scope handling has improved: we now capture multi-word negations, which makes the outputs easier to understand.
However, some aspects can still be improved: the negation scopes are sometimes too short or incorrectly split,
and some non-negated phrases are incorrectly classified as negations.

Future improvements:
* Use dependency parsing to properly capture negation scope.
* Refine regex rules for multi-word negation handling.
* Filter false positives by applying negation only in relevant medical contexts.