In [1]:
import os

os.environ["HF_HOME"] = "/projects/bhuang/.cache/huggingface"
# os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import itertools
import math
import shutil
from pathlib import Path

import fasttext
import numpy as np
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [None]:
# drbenchmark_quaero

data_files = {
    "train": [
        # "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-train-cls-mistral_large_instruct_2407-processed.jsonl",
        # "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-train-cls-mistral_large_instruct_2407-processed.jsonl",
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea_medline-train-upsampled65.jsonl",
    ],
    "valid": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-validation-cls-mistral_large_instruct_2407-processed.jsonl",
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-validation-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
    # "test": [
    #     "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-test-cls-mistral_large_instruct_2407-processed.jsonl",
    #     "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-test-cls-mistral_large_instruct_2407-processed.jsonl",
    # ],
    "test_quaero_medline": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-test-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
    "test_quaero_emea": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-test-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
}

In [3]:
# synthetic

data_files = {
    "train": [
        "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-train-10k.jsonl",
    ],
    "valid": [
        "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-validation.jsonl",
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-head-processed-validation.jsonl",
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-medium-processed-validation.jsonl",
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-tail-processed-validation.jsonl",
    ],
    "test_synthetic": [
        "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-test.jsonl",
    ],
    "test_synthetic_head": [
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-head-processed-test.jsonl",
    ],
    "test_synthetic_medium": [
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-medium-processed-test.jsonl",
    ],
    "test_synthetic_tail": [
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-tail-processed-test.jsonl",
    ],
}

In [4]:
dataset = load_dataset("json", data_files=data_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 10000
    })
    valid: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 4000
    })
    test_synthetic: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
    test_synthetic_head: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
    test_synthetic_medium: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
    test_synthetic_tail: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
})

## Transform data

In [None]:
#!pip install -U nltk
# import nltk
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("punkt_tab")

In [5]:
import re
import unicodedata

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# stopwords_list = stopwords.words("english") + stopwords.words("french")
stopwords = set(stopwords.words("french"))

# default word analyzer used in TfidfVectorizer
word_token_pattern = re.compile(r"(?u)\b\w\w+\b")

# init SnowballStemmer
stemmer = SnowballStemmer("french")


# adapted to optionally keep selected symbols
def remove_symbols(s: str, keep: str = ""):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    # fmt: off
    return "".join(
        c
        if c in keep
        else " "
        if unicodedata.category(c)[0] in "MSP"
        else c
        for c in unicodedata.normalize("NFKC", s)
    )
    # fmt: on


def normalize_text(s):
    s = s.lower()  # lowercase

    # normalize punkt
    """
    s = unicodedata.normalize("NFKD", s)  # normalize unicode chars
    s = re.sub(r"[´′’ʼ‘ʻ`]", "'", s)  # standardize quotes and apostrophes
    s = re.sub(r"[−‐–—]", "-", s)  # standardize hyphens and dashes
    s = re.sub(r"\s*'\s*", "' ", s)  # add space after apostrophe
    s = re.sub(r"\s*([,.:;!?])", r" \1", s)  # add space before comma/period
    s = re.sub(r"\s*([-/])\s*", r" \1 ", s)  # add spaces around slash/dash
    s = re.sub(r"\(\s*", "( ", s)  # add space after parentheses
    s = re.sub(r"\s*\)", " )", s)  # add space before parentheses
    """

    # remove punkt except "'"
    s = remove_symbols(s, keep="'")
    s = re.sub(r"\s*'\s*", "' ", s)  # add space after apostrophe

    s = re.sub(r"æ", "ae", s)  # standarize french chars
    s = re.sub(r"œ", "oe", s)  # standarize french chars
    s = re.sub(r"\s+", " ", s).strip()  # remove extra whitespace
    return s


def word_tokenize(s):
    # return nltk.word_tokenize(s)
    # more efficient
    return word_token_pattern.findall(s)


def stem_word(word):
    return stemmer.stem(word)


def preprocess_and_tokenize(s):
    # normalize the text
    s = normalize_text(s)
    # tokenize
    # return word_tokenize(s)
    # tokenize, remove stopwords
    # return [w for w in word_tokenize(s) if w not in stopwords]
    # tokenize, remove stopwords, and stem
    return " ".join(stem_word(w) for w in word_tokenize(s) if w not in stopwords)

In [6]:
# fmt: off
dataset = dataset.map(
    # lambda x: {"fasttext_line": " ".join("__label__" + l for l in x["labels"]) + " " + x["text"]},
    lambda x: {"fasttext_line": " ".join("__label__" + l for l in x["labels"]) + " " + preprocess_and_tokenize(x["text"])},
    remove_columns=dataset["train"].column_names,
    num_proc=64,
)
# fmt: on
dataset

Map (num_proc=64): 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 14903.39 examples/s]
Map (num_proc=64): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 8083.25 examples/s]
Map (num_proc=64): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2727.76 examples/s]
Map (num_proc=64): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2396.21 examples/s]
Map (num_proc=64): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 2432.23 examples/s]
Map (num_proc=64): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3130.92 examples/s]


DatasetDict({
    train: Dataset({
        features: ['fasttext_line'],
        num_rows: 10000
    })
    valid: Dataset({
        features: ['fasttext_line'],
        num_rows: 4000
    })
    test_synthetic: Dataset({
        features: ['fasttext_line'],
        num_rows: 1000
    })
    test_synthetic_head: Dataset({
        features: ['fasttext_line'],
        num_rows: 1000
    })
    test_synthetic_medium: Dataset({
        features: ['fasttext_line'],
        num_rows: 1000
    })
    test_synthetic_tail: Dataset({
        features: ['fasttext_line'],
        num_rows: 1000
    })
})

In [7]:
# save to fasttext format

# data_dir = "tmp_data/quaero"
# data_dir = "tmp_data/quaero_upsampled65"
data_dir = "tmp_data/synthetic_v1"

os.makedirs(data_dir, exist_ok=True)
for name, ds in dataset.items():
    output_file = f"{data_dir}/{name}.txt"
    # ds.to_json(output_file, orient="records", lines=True, force_ascii=False)
    with open(output_file, "w", encoding="utf-8") as f:
        for sample in ds:
            f.write(sample["fasttext_line"] + "\n")

## Custom tokenizer

In [None]:
data_dir = "tmp_data"

In [None]:
FASTTEXT_LABEL = "__label__"


def create_text_file(input_path: str, output_path: str, encoding: str = "utf-8"):
    with open(input_path, encoding=encoding) as f_in, open(
        output_path, "w", encoding=encoding
    ) as f_out:

        for line in f_in:
            try:
                tokens = []
                for token in line.split(" "):
                    if FASTTEXT_LABEL not in token:
                        tokens.append(token)

                text = " ".join(tokens)
            except ValueError as e:
                continue

            f_out.write(text)


# get pure text
create_text_file(f"{data_dir}/train.txt", f"{data_dir}/train_text.txt")

In [None]:
# train spm

import sentencepiece as spm

input_file = f"{data_dir}/train_text.txt"
model_name = f"{data_dir}/vocab/spm_unigram"

os.makedirs(os.path.dirname(model_name), exist_ok=True)

"""
!spm_train \
    --input={input_file} \
    --model_prefix={model_name} \
    --vocab_size=8000 \
    --character_coverage=0.9995 \
    --model_type=unigram
"""

spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_name,
    vocab_size=2000,
    character_coverage=0.9995,
    model_type="unigram",
    # model_type="bpe",
)

In [None]:
# load trained spm

model_file = f"{model_name}.model"
sp = spm.SentencePieceProcessor(model_file=model_file)


def tokenize_text(s):
    return " ".join(sp.encode(s, out_type=str))


# test
# s = "premi cas mondial autoréimplant membr associ trait immunosuppresseur fk 506 tacrolimus rapport préliminair 18 mois"
# sp.encode(s, out_type=str)
# tokenize_text(s)

In [None]:
def create_tokenized_file(input_path: str, output_path: str, encoding: str = "utf-8"):
    with open(input_path, encoding=encoding) as f_in, open(
        output_path, "w", encoding=encoding
    ) as f_out:

        for line in f_in:
            try:
                # the labels remains untouched during the preprocessing step as its
                # already in a format that fasttext can consume
                tokens = []
                labels = []
                for token in line.split(" "):
                    if FASTTEXT_LABEL in token:
                        labels.append(token)
                    else:
                        tokens.append(token)

                text = " ".join(tokens)
                label = " ".join(labels)
            except ValueError as e:
                continue

            tokenized_text = tokenize_text(text)
            new_line = label + " " + tokenized_text
            f_out.write(new_line)
            f_out.write("\n")


# create new data files with custom tokenization
# create_tokenized_file(f"{data_dir}/train.txt", f"{data_dir}/train_tokenized.txt")
# create_tokenized_file(f"{data_dir}/valid.txt", f"{data_dir}/valid_tokenized.txt")
# create_tokenized_file(f"{data_dir}/test.txt", f"{data_dir}/test_tokenized.txt")
# create_tokenized_file(f"{data_dir}/train_synth.txt", f"{data_dir}/train_synth_tokenized.txt")

for p in Path(data_dir).glob("*.txt"):
    if "train_text" in p.as_posix():
        continue
    create_tokenized_file(p.as_posix(), f'{p.with_suffix("")}_tokenized.txt')

## Train

In [8]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer


def binarize_label(y_true, y_pred):
    mlb = MultiLabelBinarizer()
    mlb.fit(y_true + y_pred)
    y_true_encoded = mlb.transform(y_true)
    y_pred_encoded = mlb.transform(y_pred)
    return y_true_encoded, y_pred_encoded


# fmt: off
def evaluate(y, preds, average="micro", verbose=True):
    """evaluate on all metrics"""
    precision, recall, f1, _ = precision_recall_fscore_support(y, preds, average=average, zero_division=1)
    auc_score = roc_auc_score(y, preds, average=average)

    if verbose:
        print(f"precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}, auc_score: {auc_score:.4f}")

    return {"precision": precision, "recall": recall, "f1": f1, "auc_score": auc_score}
# fmt: on


def predict_evaluate(model, test_file, k=1, threshold=0.0, verbose=True):

    def parse_fasttext_line(line):
        labels, tokens = [], []
        for token in line.strip().split(" "):
            if model.label not in token:
                tokens.append(token)
            else:
                labels.append(token)
        text = " ".join(tokens)
        return text, labels

    y_true, y_pred = [], []
    with open(test_file, "r") as f:
        for line in f:
            text, labels = parse_fasttext_line(line)
            prediction, _ = model.predict(text, k=k, threshold=threshold)
            y_pred.append(prediction)
            y_true.append(labels)

    y_true_encoded, y_pred_encoded = binarize_label(y_true, y_pred)

    return evaluate(y_true_encoded, y_pred_encoded, verbose=verbose)

In [9]:
# how this handle labels in test but not in train set?
"""
def predict_evaluate(model, test_file, k=1, threshold=0.0, verbose=True):
    _, precision, recall = model.test(valid_file, k=k, threshold=threshold)
    f1 = 2 * (precision * recall) / (precision + recall)
    if verbose:
        print(f"precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}")
    return {"precision": precision, "recall": recall, "f1": f1}
"""


def train_predict_evaluate(
    train_file, test_file, params=None, k=1, threshold=0.0, verbose=True
):
    train_params = params or {}
    m = fasttext.train_supervised(
        train_file,
        # lr=params['lr'],
        # epoch=params['epoch'],
        # wordNgrams=params['wordNgrams'],
        # dim=params['dim'],
        # minCount=params['minCount'],
        **train_params,
    )

    return m, predict_evaluate(m, test_file, k=k, threshold=threshold, verbose=verbose)

In [10]:
# data_dir = "tmp_data/quaero_upsampled65"

# fmt: off
train_file, valid_file, test_file = f"{data_dir}/train.txt", f"{data_dir}/valid.txt", f"{data_dir}/test.txt"
# train_file, valid_file, test_file = f"{data_dir}/train_tokenized.txt", f"{data_dir}/valid_tokenized.txt", f"{data_dir}/test_tokenized.txt"
# fmt: on

test_files = [p.as_posix() for p in Path(data_dir).glob("test_*.txt")]

train_synth_file = f"{data_dir}/train_synth.txt"
# train_synth_file = f"{data_dir}/train_synth_tokenized.txt"

tmp_train_file = f"{data_dir}/tmp_train.txt"
# tmp_train_file = f"{data_dir}/tmp_train_tokenized.txt"

In [None]:
# train fasttext classifier

model = fasttext.train_supervised(
    train_file,
    lr=1,
    epoch=500,
    wordNgrams=5,
    minn=3,
    maxn=6,
    bucket=200_000,
    # bucket=10_000,
    dim=1_000,
    loss="ova",
)

predict_evaluate(model, valid_file)

In [12]:
FASTTEXT_LABEL = "__label__"


def read_labels(input_path: str, encoding: str = "utf-8"):
    all_labels = []
    with open(input_path, encoding=encoding) as f_in:
        for line in f_in:
            all_labels.append(
                [token for token in line.split(" ") if FASTTEXT_LABEL in token]
            )
    return all_labels


all_valid_labels = read_labels(valid_file)
avg_num_of_labels_per_sample_valid = int(np.mean([len(l) for l in all_valid_labels]))
avg_num_of_labels_per_sample_valid

2

In [13]:
# fasttext autotune

model = fasttext.train_supervised(
    train_file,
    loss="ova",  # multi-label
    autotuneValidationFile=valid_file,
    autotuneDuration=300,  # 600,
    autotuneMetric="f1",
    autotunePredictions=avg_num_of_labels_per_sample_valid,
)

print(model.__dict__)

# rerun on validation
predict_evaluate(model, valid_file)

Progress: 100.0% Trials:   29 Best score:  0.349433 ETA:   0h 0m 0s
Training again with best arguments
Read 1M words
Number of words:  8710
Number of labels: 933
Progress: 100.0% words/sec/thread:  274824 lr:  0.000000 avg.loss:  5.071896 ETA:   0h 0m 0s


{'f': <fasttext_pybind.fasttext object at 0x7f2afba729b0>, '_words': None, '_labels': None, 'lr': 0.4165878098673522, 'dim': 69, 'ws': 5, 'epoch': 100, 'minCount': 1, 'minCountLabel': 0, 'minn': 0, 'maxn': 0, 'neg': 5, 'wordNgrams': 2, 'loss': <loss_name.ova: 4>, 'bucket': 1762785, 'thread': 127, 'lrUpdateRate': 100, 't': 0.0001, 'label': '__label__', 'verbose': 2, 'pretrainedVectors': ''}
precision: 0.5517, recall: 0.1852, f1: 0.2773, auc_score: 0.5923


{'precision': 0.55175,
 'recall': 0.18518207752978688,
 'f1': 0.27729614273149894,
 'auc_score': 0.5923386515733815}

In [14]:
# grid search binarization threshold on validation set

perf_by_thr = []
for thr in np.arange(0.01, 1, 0.01):
    r = predict_evaluate(model, valid_file, threshold=thr, verbose=False)
    perf_by_thr.append({"threshold": thr, **r})

df_perf_by_thr = pd.DataFrame(perf_by_thr)
# sort by f1
df_perf_by_thr = df_perf_by_thr.sort_values("f1", ascending=False)
df_perf_by_thr.head()

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Unnamed: 0,threshold,precision,recall,f1,auc_score
0,0.01,0.583018,0.180903,0.276127,0.590233
1,0.02,0.599039,0.177882,0.274309,0.58874
2,0.03,0.610981,0.175533,0.272715,0.587577
3,0.04,0.620066,0.17268,0.270132,0.58616
4,0.05,0.629286,0.170918,0.268822,0.585288


In [15]:
best_thr = df_perf_by_thr.iloc[0]["threshold"]
print(f"best threshold: {best_thr}")

# print("perf on test set --> ", end="")
# predict_evaluate(model, test_file, threshold=best_thr)

for test_file in test_files:
    print(f"perf on {test_file} --> ", end="")
    predict_evaluate(model, test_file, threshold=best_thr)

best threshold: 0.01
perf on tmp_data/synthetic_v1/test_synthetic.txt --> precision: 0.8066, recall: 0.2555, f1: 0.3881, auc_score: 0.6276
perf on tmp_data/synthetic_v1/test_synthetic_head.txt --> precision: 0.8147, recall: 0.2615, f1: 0.3959, auc_score: 0.6297
perf on tmp_data/synthetic_v1/test_synthetic_medium.txt --> precision: 0.4542, recall: 0.1429, f1: 0.2174, auc_score: 0.5709
perf on tmp_data/synthetic_v1/test_synthetic_tail.txt --> precision: 0.2192, recall: 0.0615, f1: 0.0960, auc_score: 0.5302


In [None]:
# grid search k (top-k) on validation set

perf_by_k = []
for k in range(1, 11):
    r = predict_evaluate(model, valid_file, k=k, verbose=False)
    perf_by_k.append({"k": k, **r})

df_perf_by_k = pd.DataFrame(perf_by_k)
# sort by f1
df_perf_by_k = df_perf_by_k.sort_values("f1", ascending=False)
df_perf_by_k.head()

In [None]:
best_k = int(df_perf_by_k.iloc[0]["k"])
print(f"best k: {best_k}")

print("perf on test set --> ", end="")
predict_evaluate(model, test_file, k=best_k)

In [None]:
def read_file(input_file):
    with open(input_file, "r") as f:
        return [line.strip() for line in f]


def append_file(file_a, file_b, n=None):
    with open(file_a, "a") as fo, open(file_b, "r") as fi:
        for i, line in enumerate(fi):
            if n is not None and i >= n:
                break
            fo.write(line)

In [None]:
# grid search ratios of original/synthetic data

perf_by_ratio = []

num_train = len(read_file(train_file))
num_train_synth = len(read_file(train_synth_file))

multipliers = [0, 0.5] + list(range(1, math.ceil(num_train_synth / num_train) + 1))
# multipliers = [0, 0.5]
for multiplier in multipliers:
    print("\n\n" + f"Multiplier: {multiplier}")
    print("=" * 50 + "\n\n")

    n = min(num_train_synth, int(multiplier * num_train))

    # copy train file
    os.makedirs(os.path.dirname(tmp_train_file), exist_ok=True)
    shutil.copy2(train_file, tmp_train_file)
    # append some synthetic train data to train
    append_file(tmp_train_file, train_synth_file, n)

    model = fasttext.train_supervised(
        tmp_train_file,
        autotuneValidationFile=valid_file,
        autotuneDuration=60,  # 300, 600
        autotuneMetric="f1",
    )

    r = predict_evaluate(model, valid_file)
    perf_by_ratio.append({"multiplier": multiplier, **r, "params": model.__dict__})

df_perf_by_ratio = pd.DataFrame(perf_by_ratio)
df_perf_by_ratio = df_perf_by_ratio.sort_values("f1", ascending=False)
df_perf_by_ratio.head()

In [None]:
os.makedirs(os.path.dirname(tmp_train_file), exist_ok=True)
shutil.copy2(train_file, tmp_train_file)
# append some synthetic train data to train
append_file(tmp_train_file, train_synth_file)

In [None]:
model = fasttext.train_supervised(
    tmp_train_file,
    autotuneValidationFile=valid_file,
    autotuneDuration=600,  # 300, 600
    autotuneMetric="f1",
)

r = predict_evaluate(model, valid_file)

In [None]:
# grid search

param_grid = {
    "lr": [0.1, 0.05, 0.01],
    "epoch": [5, 10, 20],
    "wordNgrams": [1, 2, 3],
    "dim": [100, 200, 300],
    "minCount": [1, 5, 10],
}


def grid_search(train_file, test_file, param_grid):
    best_f1 = 0
    best_params = None
    best_model = None

    param_combinations = list(itertools.product(*param_grid.values()))
    total_combinations = len(param_combinations)

    for i, combination in enumerate(param_combinations, 1):
        params = dict(zip(param_grid.keys(), combination))
        print(f"Evaluating combination {i}/{total_combinations}: {params}")

        model, metrics = train_predict_evaluate(train_file, test_file, params)

        # print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

        if metrics["f1"] > best_f1:
            best_f1 = metrics["f1"]
            best_params = params
            best_model = model

    return best_model, best_params, best_f1


best_model, best_params, best_f1 = grid_search(train_file, valid_file, param_grid)

print(f"Best parameters: {best_params}")
print(f"Best F1 score: {best_f1:.4f}")

## Save

In [17]:
saved_model_path = "/home/bhuang/icd_10/outputs/fasttext/synthetic/model_synthetic.bin"

os.makedirs(os.path.dirname(saved_model_path), exist_ok=True)
model.save_model(saved_model_path)

In [18]:
# reload
model_path = saved_model_path

model = fasttext.load_model(model_path)

In [26]:
text = """### Discharge Summary

**Patient: [Nom du patient]**
**Date de naissance: [Date de naissance]**
**Numéro de dossier: [Numéro de dossier]**
**Date d'admission: [Date d'admission]**
**Date de sortie: [Date de sortie]**

**Raison de l'admission:**
Le patient a été admis pour une évaluation et une prise en charge de symptômes récurrents d'infections respiratoires et cutanées. Le patient présente une histoire médicale complexe marquée par une susceptibilité accrue aux infections, suggérant une possible déficience immunitaire.

**Historique médical:**
Le patient a une longue histoire d'infections récurrentes, y compris des pneumonies et des infections cutanées. Les antécédents familiaux révèlent des cas similaires, ce qui renforce l'hypothèse d'une déficience immunitaire héréditaire.

**Évaluation clinique:**
Lors de l'admission, le patient présentait des symptômes de pneumonie, y compris une toux productive, une fièvre élevée et des douleurs thoraciques. L'examen physique a révélé des râles crépitants bilatéraux. Une éruption cutanée était également présente sur les bras et le dos, suggérant une infection bactérienne.

**Résultats des examens:**
Les analyses de laboratoire ont montré une diminution des taux d'immunoglobulines, en particulier les IgG, IgA et IgM. Les tests de fonction immunitaire ont également révélé une réponse anormale aux vaccins, indiquant une déficience dans la production d'anticorps.

**Traitement et gestion:**
Le patient a été traité avec des antibiotiques à large spectre pour les infections respiratoires et cutanées. Des immunoglobulines intraveineuses (IVIG) ont été initiées pour compenser la déficience en anticorps. Une consultation en immunologie a été demandée pour une évaluation plus approfondie et un plan de gestion à long terme.

**Recommandations à la sortie:**
Le patient doit continuer les traitements par IVIG selon le calendrier prescrit. Un suivi régulier avec un immunologue est nécessaire pour surveiller les taux d'immunoglobulines et ajuster le traitement si nécessaire. Le patient et sa famille ont été informés de l'importance de la vaccination et des mesures d'hygiène pour prévenir les infections.

**Plan de suivi:**
Un rendez-vous de suivi est prévu dans deux semaines avec le médecin traitant et l'immunologue. Des analyses de laboratoire seront répétées pour évaluer l'efficacité du traitement et ajuster les doses de IVIG si nécessaire.

**Signature du médecin:**
[Nom du médecin]
[Titre du médecin]
[Date]"""

text = re.sub("\n", "", text)
model.predict(text, k=3) #, threshold=)

(('__label__P55', '__label__Z31', '__label__F41'),
 array([0.64423513, 0.60019839, 0.56986266]))