In [1]:
!pip install transformers datasets torch scikit-learn pandas tqdm nltk nlpaug

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collec

In [43]:
! pip install nlpaug unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [61]:
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import BertTokenizer, BertModel, TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords
import nlpaug.augmenter.word as naw
import unidecode

nltk.download("stopwords")
spanish_stopwords = set(stopwords.words("spanish"))

# Set device for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Spanish stopwords for TF-IDF
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
# Load data
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

def fix_label_format(label_str):
    label_list = list(map(int, label_str.strip("[]").split()))  # Convert to list of integers
    return np.array(label_list)  # Convert to NumPy array for better compatibility

# Apply function
train_df["labels"] = train_df["labels"].astype(str).apply(fix_label_format)

# Verify conversion
print(train_df["labels"].head())
print(type(train_df["labels"][0]))  # Should output <class 'numpy.ndarray'>

0    [0, 0, 0, 0]
1    [0, 0, 1, 0]
2    [0, 0, 0, 0]
3    [0, 0, 0, 0]
4    [0, 0, 0, 0]
Name: labels, dtype: object
<class 'numpy.ndarray'>


In [63]:
clinical_terms = {
    "ACV": "Accidente cerebrovascular",
    "AFG": "Alfa-fetoproteína",
    "AFP": "Alfa-fetoproteína",
    "AINE": "Antiinflamatorio no esteroideo",
    "AINES": "Antiinflamatorios no esteroides",
    "ALT": "Alanina aminotransferasa",
    "ANA": "Anticuerpos antinucleares",
    "ANCA": "Anticuerpos anticitoplasma de neutrófilos",
    "AST": "Aspartato aminotransferasa",
    "AVC": "Accidente vascular cerebral",
    "AVR": "Reemplazo valvular aórtico",
    "BAL": "Lavado broncoalveolar",
    "BID": "Dos veces al día (bis in die)",
    "BNP": "Péptido natriurético cerebral",
    "BUN": "Nitrógeno ureico en sangre",
    "CABG": "Injerto de derivación de arteria coronaria (bypass coronario)",
    "CK": "Creatina quinasa",
    "CMV": "Citomegalovirus",
    "CPAP": "Presión positiva continua en la vía aérea",
    "CRP": "Proteína C reactiva",
    "CSF": "Líquido cefalorraquídeo",
    "CT": "Tomografía computarizada",
    "CVC": "Catéter venoso central",
    "DLP": "Dislipidemia",
    "DM": "Diabetes mellitus",
    "DMI": "Diabetes mellitus tipo I",
    "DMII": "Diabetes mellitus tipo II",
    "DTC": "Disfunción tiroidea congénita",
    "EAM": "Enfermedad arterial miocárdica",
    "ECG": "Electrocardiograma",
    "ECV": "Enfermedad cerebrovascular",
    "EEG": "Electroencefalograma",
    "EPOC": "Enfermedad pulmonar obstructiva crónica",
    "ESR": "Velocidad de sedimentación globular",
    "FAME": "Fármacos antirreumáticos modificadores de la enfermedad",
    "FEVI": "Fracción de eyección del ventrículo izquierdo",
    "FLOT": "Fluorouracilo, leucovorina, oxaliplatino, docetaxel (quimioterapia)",
    "FRCV": "Factores de riesgo cardiovascular",
    "GGT": "Gamma-glutamiltransferasa",
    "HAA": "Hemorragia alveolar aguda",
    "HCG": "Gonadotropina coriónica humana",
    "HDL": "Lipoproteína de alta densidad",
    "HIV": "Virus de inmunodeficiencia humana",
    "HNF": "Hepatonefritis fulminante",
    "HTA": "Hipertensión arterial",
    "IAM": "Infarto agudo de miocardio",
    "IC": "Insuficiencia cardíaca",
    "ICU": "Unidad de cuidados intensivos",
    "IGA": "Inmunoglobulina A",
    "IGE": "Inmunoglobulina E",
    "IGG": "Inmunoglobulina G",
    "IM": "Intramuscular",
    "INR": "Índice internacional normalizado",
    "IPAP": "Presión inspiratoria positiva en la vía aérea",
    "IV": "Intravenoso",
    "IVH": "Hemorragia intraventricular",
    "LDH": "Lactato deshidrogenasa",
    "LDL": "Lipoproteína de baja densidad",
    "MARSA": "Staphylococcus aureus resistente a meticilina (MRSA)",
    "MN": "Meningitis",
    "MPO": "Mieloperoxidasa",
    "MRI": "Imagen por resonancia magnética",
    "MSI": "Inestabilidad microsatelital",
    "NSAID": "Antiinflamatorios no esteroides",
    "OIT": "Oxigenoterapia hiperbárica",
    "PA": "Presión arterial",
    "PAF": "Paroxismo auricular fibrilatorio",
    "PCR": "Proteína C reactiva",
    "PCT": "Procalcitonina",
    "PET": "Tomografía por emisión de positrones",
    "PO": "Vía oral",
    "PRN": "Según sea necesario (pro re nata)",
    "PSA": "Antígeno prostático específico",
    "PTH": "Parathormona",
    "QD": "Cada día (quaque die)",
    "QID": "Cuatro veces al día (quater in die)",
    "QT": "Quimioterapia",
    "RAST": "Prueba radioalergosorbente",
    "RB": "Retinoblastoma",
    "RCP": "Reanimación cardiopulmonar",
    "SC": "Subcutáneo",
    "SIDA": "Síndrome de inmunodeficiencia adquirida",
    "SNC": "Sistema nervioso central",
    "TAC": "Tomografía axial computarizada",
    "TARV": "Terapia antirretroviral",
    "TB": "Tuberculosis",
    "TG": "Triglicéridos",
    "TGO": "Transaminasa glutámico oxalacética",
    "TGP": "Transaminasa glutámico pirúvica",
    "TID": "Tres veces al día (ter in die)",
    "TNF": "Factor de necrosis tumoral",
    "TSH": "Hormona estimulante de la tiroides",
    "UCI": "Unidad de cuidados intensivos",
    "UEI": "Urea en orina de 24 horas",
    "VCI": "Vena cava inferior",
    "VCM": "Volumen corpuscular medio",
    "VEGF": "Factor de crecimiento endotelial vascular",
    "VHB": "Virus de la hepatitis B",
    "VHC": "Virus de la hepatitis C",
    "VRS": "Virus respiratorio sincitial",
}

def expand_medical_terms(text):
    for term, expanded in clinical_terms.items():
        text = re.sub(r"\b" + re.escape(term) + r"\b", expanded, text, flags=re.IGNORECASE)
    return text

train_df["text"] = train_df["text"].apply(expand_medical_terms)
test_df["text"] = test_df["text"].apply(expand_medical_terms)

In [64]:
def clean_text(text):
    if isinstance(text, list):  # Convert list to string if necessary
        text = " ".join(text)

    if not isinstance(text, str):  # Ensure it's a string
        return ""

    text = text.lower()
    text = re.sub(r"[^a-záéíóúüñ\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = " ".join([word for word in text.split() if word not in spanish_stopwords])

    return text

# Apply text cleaning
train_df["text"] = train_df["text"].apply(clean_text)
test_df["text"] = test_df["text"].apply(clean_text)

# Diacritic Normalization
def normalize_text(text):
    return unidecode.unidecode(text)

train_df["text"] = train_df["text"].apply(normalize_text)
test_df["text"] = test_df["text"].apply(normalize_text)

# Lemmatization
!python -m spacy download es_core_news_sm
import spacy
nlp = spacy.load("es_core_news_sm")  # Ensure you have this model installed

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

train_df["text"] = train_df["text"].apply(lemmatize_text)
test_df["text"] = test_df["text"].apply(lemmatize_text)

# Display samples
train_df.head()

Collecting es-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Unnamed: 0,id,text,labels
0,train_0,presentar caso paciente ano antecedente artrit...,"[0, 0, 0, 0]"
1,train_1,describir caso clinico escolar sexo masculino ...,"[0, 0, 1, 0]"
2,train_2,hombre ano llego servicio urgencias presunto i...,"[0, 0, 0, 0]"
3,train_3,mujer ano natural india residente espana dos a...,"[0, 0, 0, 0]"
4,train_4,presentar caso paciente mujer ano clinico fieb...,"[0, 0, 0, 0]"


In [65]:
nltk.download('averaged_perceptron_tagger_eng')
# Identify rare labels
label_counts = np.sum(np.vstack(train_df["labels"].values), axis=0)
rare_labels = [i for i, count in enumerate(label_counts) if count < 30]  # Threshold: Labels appearing <30 times

# Synonym replacement augmentation
aug = naw.SynonymAug(aug_src="wordnet")

def augment_text(text):
    return aug.augment(text)

# Apply augmentation on rare label samples
augmented_texts = []
augmented_labels = []

for _, row in train_df.iterrows():
    if any(row["labels"][i] == 1 for i in rare_labels):
        new_text = augment_text(row["text"])
        augmented_texts.append(new_text)
        augmented_labels.append(row["labels"])

# Add augmented data
aug_df = pd.DataFrame({"text": augmented_texts, "labels": augmented_labels})
train_df = pd.concat([train_df, aug_df], ignore_index=True)

print(f"Dataset size after augmentation: {len(train_df)}")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Dataset size after augmentation: 551


In [66]:
# Dynamically determine the number of labels
num_labels = len(train_df["labels"][0])  # Get number of labels from first row
label_cols = [f"label_{i}" for i in range(num_labels)]  # Create column names
print("Available columns:", train_df.columns)  # Check all column names
print("Expected label columns:", label_cols)  # Check generated label names

# Ensure labels are properly split
if "labels" in train_df.columns:
    num_labels = len(train_df["labels"][0])  # Get number of labels
    label_cols = [f"label_{i}" for i in range(num_labels)]  # Generate column names

    # Convert lists into separate columns
    train_df[label_cols] = pd.DataFrame(train_df["labels"].tolist(), index=train_df.index)

    # Drop the original labels column
    train_df.drop(columns=["labels"], inplace=True)

# Verify columns again
print("Final label columns:", train_df.columns)


Available columns: Index(['id', 'text', 'labels'], dtype='object')
Expected label columns: ['label_0', 'label_1', 'label_2', 'label_3']
Final label columns: Index(['id', 'text', 'label_0', 'label_1', 'label_2', 'label_3'], dtype='object')


In [67]:
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"

#tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Tokenize function
#def tokenize_texts(texts, max_length=512):
#    return tokenizer(
#        texts.tolist(),
#        padding="max_length",
#        truncation=True,
#        max_length=max_length,
#        return_tensors="pt"
#    )

# Load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenization function
def tokenize_text(text):
    return tokenizer(text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")


In [68]:
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels  # Already converted to float32
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        encoding = self.tokenizer(
            self.texts[index],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        if self.labels is not None:
            label_tensor = torch.tensor(np.array(self.labels[index], dtype=np.float32))  # Ensure float32 conversion
            return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": label_tensor}

        return {"input_ids": input_ids, "attention_mask": attention_mask}


In [69]:
from sklearn.model_selection import train_test_split

# Get the label columns dynamically
label_cols = [col for col in train_df.columns if col.startswith('label_')]

# Extract label values as a NumPy array
y_labels = train_df[label_cols].values

# Perform Train-Validation Split
#X_train, X_val, y_train, y_val = train_test_split(
#    train_df['text'], y_labels,  # Use extracted NumPy array
#    test_size=0.2, random_state=42, stratify=y_labels.sum(axis=1)  # Ensure proper stratification
#)
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(train_df["text"], y_labels.sum(axis=1)):  # Use sum of labels for stratification
    X_train, X_val = train_df["text"].iloc[train_idx], train_df["text"].iloc[val_idx]
    y_train, y_val = y_labels[train_idx], y_labels[val_idx]
    break  # Use only first fold, or iterate for full k-fold training

# Verify output before proceeding
print("y_train shape:", y_train.shape)  # Should be (num_samples, num_labels)
print("y_train example:", y_train[:5])  # Should contain lists of 0s and 1s

y_train shape: (440, 4)
y_train example: [[0 0 0 0]
 [0 0 1 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [70]:
train_dataset = ClinicalDataset(X_train.tolist(), y_train, tokenizer)
val_dataset = ClinicalDataset(X_val.tolist(), y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [72]:
class BertMultiLabelClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(BertMultiLabelClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        dropped = self.dropout(pooled_output)
        return self.classifier(dropped)  # No sigmoid since BCEWithLogitsLoss expects raw logits

model = BertMultiLabelClassifier(MODEL_NAME, num_labels)
model = model.cuda()

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
# Ensure labels are numeric
train_df[label_cols] = train_df[label_cols].astype(np.float32)

# Compute normalized class weights
class_weights = train_df[label_cols].sum().values
class_weights = class_weights / class_weights.sum()  # Normalize sum to 1
class_weights = torch.tensor(class_weights, dtype=torch.float32).cuda()

LABEL_SMOOTHING = 0.1  # Small smoothing factor

def smooth_labels(labels, epsilon=LABEL_SMOOTHING):
    return labels * (1 - epsilon) + (epsilon / labels.shape[1])

# Apply label smoothing before training
y_train = smooth_labels(y_train)
y_val = smooth_labels(y_val)

criterion = nn.BCEWithLogitsLoss(weight=class_weights)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

print("Class weights:", class_weights)  # Verify output


Class weights: tensor([0.2157, 0.1716, 0.3480, 0.2647], device='cuda:0')


In [74]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5):
    best_f1 = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader):
            input_ids = batch["input_ids"].cuda()
            attention_mask = batch["attention_mask"].cuda()
            labels = batch["labels"].cuda()

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader)}")

        model.eval()
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].cuda()
                attention_mask = batch["attention_mask"].cuda()
                labels = batch["labels"].cuda()

                outputs = model(input_ids, attention_mask)
                preds = torch.sigmoid(outputs).cpu().numpy()
                all_preds.append(preds)
                all_labels.append(labels.cpu().numpy())

        # Convert lists to NumPy arrays before computing F1-score
        all_preds = np.vstack(all_preds)  # Stack predictions vertically
        all_labels = np.vstack(all_labels) # Stack labels vertically

        # Compute macro F1-score
        val_f1 = f1_score(all_labels, all_preds > 0.3, average="macro") # Threshold predictions
        print(f"Epoch {epoch+1} - Macro F1: {val_f1}")

        # Save model if it's the best so far
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pth")
            print("Best model saved!")

In [81]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)

100%|██████████| 55/55 [00:42<00:00,  1.30it/s]


Epoch 1, Training Loss: 0.0002401777482274073
Epoch 1 - Macro F1: 0.8604532163742691
Best model saved!


100%|██████████| 55/55 [00:44<00:00,  1.23it/s]


Epoch 2, Training Loss: 0.0002357519999019463
Epoch 2 - Macro F1: 0.8604532163742691


100%|██████████| 55/55 [00:43<00:00,  1.26it/s]


Epoch 3, Training Loss: 0.000229266016156709
Epoch 3 - Macro F1: 0.8765350877192981
Best model saved!


100%|██████████| 55/55 [00:44<00:00,  1.24it/s]


Epoch 4, Training Loss: 0.01123519326991465
Epoch 4 - Macro F1: 0.5118898543317149


100%|██████████| 55/55 [00:43<00:00,  1.25it/s]


Epoch 5, Training Loss: 0.04577396788465028
Epoch 5 - Macro F1: 0.67804446193193


100%|██████████| 55/55 [00:44<00:00,  1.25it/s]


Epoch 6, Training Loss: 0.019355015525467357
Epoch 6 - Macro F1: 0.7494172494172494


100%|██████████| 55/55 [00:43<00:00,  1.25it/s]


Epoch 7, Training Loss: 0.008019773992286486
Epoch 7 - Macro F1: 0.8649546106067846


100%|██████████| 55/55 [00:44<00:00,  1.25it/s]


Epoch 8, Training Loss: 0.003358731254808266
Epoch 8 - Macro F1: 0.8708074534161491


100%|██████████| 55/55 [00:44<00:00,  1.25it/s]


Epoch 9, Training Loss: 0.0031401811194055797
Epoch 9 - Macro F1: 0.8976897117629383
Best model saved!


100%|██████████| 55/55 [00:44<00:00,  1.24it/s]


Epoch 10, Training Loss: 0.001024198009293865
Epoch 10 - Macro F1: 0.9137715831079676
Best model saved!


In [84]:
test_dataset = ClinicalDataset(test_df["text"].tolist(), None, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.load_state_dict(torch.load("best_model.pth"))
model.eval()
test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].cuda()
        attention_mask = batch["attention_mask"].cuda()

        outputs = model(input_ids, attention_mask)
        preds = torch.sigmoid(outputs).cpu().numpy()
        test_preds.append(preds)

test_preds = np.vstack(test_preds)
test_preds = (test_preds > 0.5).astype(int)  # Convert to binary labels

# Save Submission File
submission = pd.DataFrame(test_preds, columns=label_cols)
submission.insert(0, "id", test_df["id"])
submission.insert(1, "text", test_df["text"])
submission.to_csv("submission.csv", index=False)

  model.load_state_dict(torch.load("best_model.pth"))
100%|██████████| 31/31 [00:08<00:00,  3.82it/s]


In [86]:
# Ensure predictions are formatted as comma-separated values inside square brackets
formatted_preds = ["[" + ", ".join(map(str, row)) + "]" for row in test_preds]

# Save Submission File with the corrected format
submission = pd.DataFrame({"id": test_df["id"], "text": test_df["text"], "pred": formatted_preds})
submission.to_csv("submission.csv", index=False)

print("Submission file saved as submission.csv!")

Submission file saved as submission.csv!
