In [1]:
import os

os.environ["HF_HOME"] = "/projects/bhuang/.cache/huggingface"
# os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# disable warning of test label not in train
# UserWarning: Label not 1052 is present in all training examples.
import warnings

# Removes warnings in the current job
# warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning)

# Removes warnings in the spawned jobs
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
import math

import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [None]:
# drbenchmark_quaero

data_files = {
    "train": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-train-cls-mistral_large_instruct_2407-processed.jsonl",
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-train-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
    "valid": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-validation-cls-mistral_large_instruct_2407-processed.jsonl",
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-validation-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
    # "test": [
    #     "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-test-cls-mistral_large_instruct_2407-processed.jsonl",
    #     "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-test-cls-mistral_large_instruct_2407-processed.jsonl",
    # ],
    "test_quaero_medline": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-medline-test-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
    "test_quaero_emea": [
        "/home/bhuang/icd_10/data/quaero_icd10_by_category_resplitted/drbenchmark_quaero-emea-test-cls-mistral_large_instruct_2407-processed.jsonl",
    ],
}

In [4]:
# synthetic

data_files = {
    "train": [
        "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-train-10k.jsonl",
    ],
    "valid": [
        "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-validation.jsonl",
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-head-processed-validation.jsonl",
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-medium-processed-validation.jsonl",
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-tail-processed-validation.jsonl",
    ],
    "test_synthetic": [
        "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-test.jsonl",
    ],
    "test_synthetic_head": [
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-head-processed-test.jsonl",
    ],
    "test_synthetic_medium": [
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-medium-processed-test.jsonl",
    ],
    "test_synthetic_tail": [
        "/home/bhuang/icd_10/data/synthetic_test/synthetic-tail-processed-test.jsonl",
    ],
}

In [5]:
dataset = load_dataset("json", data_files=data_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 10000
    })
    valid: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 4000
    })
    test_synthetic: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
    test_synthetic_head: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
    test_synthetic_medium: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
    test_synthetic_tail: Dataset({
        features: ['labels', 'text', 'has_diso'],
        num_rows: 1000
    })
})

In [6]:
# x_train, y_train = dataset["train"]["text"], dataset["train"]["labels"]
# x_valid, y_valid = dataset["valid"]["text"], dataset["valid"]["labels"]
# x_test, y_test = dataset["test"]["text"], dataset["test"]["labels"]

x, y = {}, {}
for name, ds in dataset.items():
    x[name] = ds["text"]
    y[name] = ds["labels"]

In [None]:
# optional: use extra synthetic data
synth_data_files = [
    # "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-train.jsonl",
    "/home/bhuang/icd_10/data/synthetic/synthetic-mistral_large_instruct_2407-240909-processed-train-10k.jsonl"
]

dataset_synth = load_dataset("json", data_files=synth_data_files, split="train")
dataset_synth

In [None]:
x_train_synth, y_train_synth = dataset_synth["text"], dataset_synth["labels"]

## Transform data

### Binarize label

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# prepare labels

# include valid and test in overall classes
# y = y_train + y_valid + y_test
y_all = sum(y.values(), [])

mlb = MultiLabelBinarizer()
mlb.fit(y_all)

classes = mlb.classes_
num_classes = len(classes)
num_classes

933

In [8]:
# transform labels
# y_train_encoded = mlb.transform(y_train)
# y_valid_encoded = mlb.transform(y_valid)
# y_test_encoded = mlb.transform(y_test)

y_encoded = {k: mlb.transform(v) for k, v in y.items()}

In [None]:
y_train_synth_encoded = mlb.transform(y_train_synth)

In [None]:
# Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported
#!pip install imbalanced-learn
# from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=9000)
# x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train_encoded)

### Normalization

In [9]:
import re
import unicodedata

# normalize text to get more dense features


# adapted to optionally keep selected symbols
def remove_symbols(s: str, keep: str = ""):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    # fmt: off
    return "".join(
        c
        if c in keep
        else " "
        if unicodedata.category(c)[0] in "MSP"
        else c
        for c in unicodedata.normalize("NFKC", s)
    )
    # fmt: on


def normalize_text(s):
    s = s.lower()  # lowercase

    # normalize punkt
    s = unicodedata.normalize("NFKD", s)  # normalize unicode chars
    s = re.sub(r"[´′’ʼ‘ʻ`]", "'", s)  # standardize quotes and apostrophes
    s = re.sub(r"[−‐–—]", "-", s)  # standardize hyphens and dashes
    s = re.sub(r"\s*'\s*", "' ", s)  # add space after apostrophe
    s = re.sub(r"\s*([,.:;!?])", r" \1", s)  # add space before comma/period
    s = re.sub(r"\s*([-/])\s*", r" \1 ", s)  # add spaces around slash/dash
    s = re.sub(r"\(\s*", "( ", s)  # add space after parentheses
    s = re.sub(r"\s*\)", " )", s)  # add space before parentheses

    # remove punkt except "'"
    """
    s = remove_symbols(s, keep="'")
    s = re.sub(r"\s*'\s*", "' ", s)  # add space after apostrophe
    """

    s = re.sub(r"æ", "ae", s)  # standarize french chars
    s = re.sub(r"œ", "oe", s)  # standarize french chars
    s = re.sub(r"\s+", " ", s).strip()  # remove extra whitespace
    return s


# x_train_norm = list(map(normalize_text, x_train))
# x_valid_norm = list(map(normalize_text, x_valid))
# x_test_norm = list(map(normalize_text, x_test))

### Tokenization

In [10]:
# default word analyzer used in TfidfVectorizer
word_token_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s):
    # return nltk.word_tokenize(s)
    # more efficient
    return word_token_pattern.findall(s)

### Stopwords

In [None]:
#!pip install -U nltk
# import nltk
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("punkt_tab")

In [11]:
from nltk.corpus import stopwords

# stopwords_list = stopwords.words("english") + stopwords.words("french")
stopwords = set(stopwords.words("french"))

### Stemming

In [12]:
from nltk.stem.snowball import SnowballStemmer

# init SnowballStemmer
stemmer = SnowballStemmer("french")


def stem_word(word):
    return stemmer.stem(word)

In [None]:
# prepare input text at once

# tf-idf Vectorization
# tfidf = TfidfVectorizer(use_idf=True, max_features=5000)
# X_train_tfidf = tfidf.fit_transform(X_train)
# X_test_tfidf = tfidf.transform(X_test)

## Train and evaluate

### Evaluate helper

In [13]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


# fmt: off
def evaluate(y, preds, average="micro", verbose=True):
    """evaluate on all metrics"""
    precision, recall, f1, _ = precision_recall_fscore_support(y, preds, average=average, zero_division=1)
    # precision, recall, f1, _ = precision_recall_fscore_support(y, preds, average=average, labels=classes, zero_division=1)
    auc_score = roc_auc_score(y, preds, average=average)

    if verbose:
        print(f"precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}, auc_score: {auc_score:.4f}")

    # print(classification_report(y, preds, zero_division=1, digits=4))

    """
    conf_mat = confusion_matrix(y, preds)
    conf_mat_df = pd.DataFrame(conf_mat, index=classes, columns=classes)
    # print(conf_mat_df)
    
    plt.figure(figsize=(15, 10))
    sns.heatmap(conf_mat_df, annot=True, vmin=0, vmax=conf_mat.max(), fmt='d', cmap="YlGnBu")
    plt.yticks(rotation=0)
    plt.xticks(rotation=45)
    """

    return {"precision": precision, "recall": recall, "f1": f1, "auc_score": auc_score}
# fmt: on

### Train helper

In [14]:
def predict_evaluate(x, y, clf, threshold=None, verbose=True):
    """predict then evaluate"""
    if threshold is None:
        # predict class labels
        preds = clf.predict(x)
        return evaluate(y, preds, verbose=verbose)
    else:
        # predict proba estimates for each label
        preds = clf.predict_proba(x)
        preds = np.where(preds >= threshold, 1, 0)
        return evaluate(y, preds, verbose=verbose)


# fmt: off
# def train_predict_evaluate(clf, x_tr, y_tr, x_val=None, y_val=None, threshold=None):
# def train_predict_evaluate(clf, x_tr=x_train, y_tr=y_train_encoded, x_val=None, y_val=None, threshold=None):
# def train_predict_evaluate(clf, x_tr=x_train, y_tr=y_train_encoded, x_val=x_valid, y_val=y_valid_encoded, threshold=None, verbose=True):
def train_predict_evaluate(clf, x_tr=x["train"], y_tr=y_encoded["train"], x_val=x["valid"], y_val=y_encoded["valid"], threshold=None, verbose=True):
    """train, then predict and evaluate"""
    # train
    clf.fit(x_tr, y_tr)

    result = {}
    print("Evaluation on training data:")
    r = predict_evaluate(x_tr, y_tr, clf, threshold=threshold, verbose=verbose)
    result.update({f"train_{k}": v for k, v in r.items()})
    if x_val is not None and y_val is not None:
        print("\nEvaluation on valid data:")
        r = predict_evaluate(x_val, y_val, clf, threshold=threshold, verbose=verbose)
        result.update({f"valid_{k}": v for k, v in r.items()})
    return result
# fmt: on

### Train

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline


def preprocess_and_tokenize(s):
    # normalize the text
    s = normalize_text(s)
    # tokenize
    # return word_tokenize(s)
    # tokenize, remove stopwords
    # return [w for w in word_tokenize(s) if w not in stopwords]
    # tokenize, remove stopwords, and stem
    return [stem_word(w) for w in word_tokenize(s) if w not in stopwords]


# lr
tfidf_lr_clf = make_pipeline(
    TfidfVectorizer(
        # strip_accents=None,
        # lowercase=True,
        # stop_words=stopwords,
        analyzer=preprocess_and_tokenize,
        use_idf=True,
    ),
    OneVsRestClassifier(
        LogisticRegression(
            class_weight="balanced",
            solver="liblinear",  # lbfgs, liblinear, saga, sag
            C=0.01,
            # n_jobs=16,
            random_state=10,
        ),
        n_jobs=-1,
    ),
)

In [None]:
from sklearn.svm import LinearSVC

# svm
tfidf_svm_clf = make_pipeline(
    TfidfVectorizer(
        # strip_accents=None,
        # lowercase=True,
        # stop_words=stopwords,
        analyzer=preprocess_and_tokenize,
        use_idf=True,
    ),
    OneVsRestClassifier(
        LinearSVC(
            class_weight="balanced",
            C=0.01,
            # n_jobs=16,
            random_state=10,
        ),
        n_jobs=-1,
    ),
)

In [20]:
# simple train

clf = tfidf_lr_clf
# clf = tfidf_svm_clf

train_predict_evaluate(clf)

Evaluation on training data:
precision: 0.2523, recall: 0.9791, f1: 0.4012, auc_score: 0.9848

Evaluation on valid data:
precision: 0.1852, recall: 0.6834, f1: 0.2914, auc_score: 0.8369


{'train_precision': 0.25232178448751114,
 'train_recall': 0.9791337546076883,
 'train_f1': 0.40124351772552247,
 'train_auc_score': 0.9848271654705815,
 'valid_precision': 0.18518097489996363,
 'valid_recall': 0.6834200369189461,
 'valid_f1': 0.29140281206396906,
 'valid_auc_score': 0.8368930547473828}

In [28]:
# grid search binarization threshold on validation set

clf = best_model

perf_by_thr = []
for thr in tqdm(np.arange(0.01, 1, 0.01)):
    r = predict_evaluate(
        x["valid"], y_encoded["valid"], clf, threshold=thr, verbose=False
    )
    perf_by_thr.append({"threshold": thr, **r})

df_perf_by_thr = pd.DataFrame(perf_by_thr)
# sort by f1
df_perf_by_thr = df_perf_by_thr.sort_values("f1", ascending=False)
df_perf_by_thr.head()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [29:57<00:00, 18.16s/it]


Unnamed: 0,threshold,precision,recall,f1,auc_score
30,0.31,0.599828,0.52534,0.560118,0.762108
38,0.39,0.63583,0.5,0.559793,0.749541
37,0.38,0.63092,0.503021,0.559757,0.751039
33,0.34,0.612736,0.515103,0.559694,0.75703
40,0.41,0.64478,0.494378,0.55965,0.746753


In [29]:
# eval on test set using determined threshold
best_threshold = df_perf_by_thr.iloc[0]["threshold"]
print(f"best thr: {best_threshold}")

# print("perf on test set --> ", end="")
# predict_evaluate(x_test, y_test_encoded, model, threshold=best_threshold)

result = {}
for split in x:
    if split.startswith("test"):
        print(f"perf on test set {split} --> ", end="")
        r = predict_evaluate(
            x[split], y_encoded[split], clf, threshold=best_threshold
        )
        result.update({f"{split}_{k}": v for k, v in r.items()})

# save result
df_result = pd.DataFrame([result])
# df_result.to_json("tmp_result/tmp.json", orient="records", lines=True, force_ascii=False)
df_result.to_csv("tmp_result/tmp.csv")

best thr: 0.31
perf on test set test_synthetic --> precision: 0.7005, recall: 0.7243, f1: 0.7122, auc_score: 0.8616
perf on test set test_synthetic_head --> precision: 0.6580, recall: 0.6958, f1: 0.6764, auc_score: 0.8473
perf on test set test_synthetic_medium --> precision: 0.5553, recall: 0.4620, f1: 0.5044, auc_score: 0.7304
perf on test set test_synthetic_tail --> precision: 0.4051, recall: 0.2337, f1: 0.2964, auc_score: 0.6163


In [None]:
# grid search ratios of original/synthetic data

# initial param
initial_params = clf.get_params()

perf_by_ratio = []

multipliers = [0, 0.5] + list(range(1, math.ceil(len(x_train_synth) / len(x["train"])) + 1))
for multiplier in tqdm(multipliers):
    print("\n\n" + f"Multiplier: {multiplier}")
    print("=" * 50 + "\n\n")

    # get new data
    n = min(len(x_train_synth), int(multiplier * len(x["train"])))
    x_train_s, y_train_s = x_train_synth[:n], y_train_synth_encoded[:n]

    # reinit model param
    tfidf_lr_clf.set_params(**initial_params)

    r = train_predict_evaluate(
        clf,
        x_tr=x["train"] + x_train_s,
        y_tr=np.concatenate((y_encoded["train"], y_train_s), axis=0),
        x_val=x["valid"],
        y_val=y_encoded["valid"],
        # x_val=x_test,  # test / valid
        # y_val=y_test_encoded,
    )
    perf_by_ratio.append({"multiplier": multiplier, **r})


df_perf_by_ratio = pd.DataFrame(perf_by_ratio)
df_perf_by_ratio = df_perf_by_ratio.sort_values("valid_f1", ascending=False)
df_perf_by_ratio.head()

In [None]:
output_file = "/home/bhuang/icd_10/outputs/tfidf_lr/quaero_result_perf_by_ratio.jsonl"
# df_perf_by_ratio.to_json(output_file, orient="records", lines=True, force_ascii=False)

### Grid search

In [21]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# grid search on val set instead of run cv within train set
# x = np.concatenate([x_train, x_valid])
# y = np.concatenate([y_train_encoded, y_valid_encoded])
x_cv = np.concatenate([x["train"], x["valid"]])
y_cv = np.concatenate([y_encoded["train"], y_encoded["valid"]])

test_fold = np.concatenate(
    [
        np.full(len(y_encoded["train"]), -1, dtype=np.int8),
        np.zeros(len(y_encoded["valid"]), dtype=np.int8),
    ]
)

# define cv part
cv = PredefinedSplit(test_fold)

In [22]:
# lr

# fmt: off
param_grid = {
    # "tfidfvectorizer__max_features": [None, 1000, 5000, 10000, 20000],
    # "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    # "tfidfvectorizer__max_df": [0.60, 0.65, 0.70, 0.75, 0.80, 0.90, 1],
    "onevsrestclassifier__estimator__solver": ["lbfgs", "liblinear", "newton-cg", "saga", "sag"],
    "onevsrestclassifier__estimator__C": [100, 10, 1.0, 0.1, 0.01],
    "onevsrestclassifier__estimator__penalty": ["l2"],
}
# fmt: on

n_jobs = -1

grid_search = GridSearchCV(
    tfidf_lr_clf,
    param_grid,
    cv=cv,
    scoring="f1_micro",  # "f1_weighted"
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=1,
)

grid_search.fit(x_cv, y_cv)

print(grid_search.best_params_)
print(grid_search.best_score_)
# print(grid_search.cv_results_)

best_model = grid_search.best_estimator_
train_predict_evaluate(best_model)

Fitting 1 folds for each of 25 candidates, totalling 25 fits
{'onevsrestclassifier__estimator__C': 100, 'onevsrestclassifier__estimator__penalty': 'l2', 'onevsrestclassifier__estimator__solver': 'liblinear'}
0.5528148000795703
Evaluation on training data:
precision: 0.9993, recall: 1.0000, f1: 0.9997, auc_score: 1.0000

Evaluation on valid data:
precision: 0.6786, recall: 0.4664, f1: 0.5528, auc_score: 0.7328


{'train_precision': 0.9993421918168662,
 'train_recall': 1.0,
 'train_f1': 0.9996709876949398,
 'train_auc_score': 0.9999989246867828,
 'valid_precision': 0.6786324786324787,
 'valid_recall': 0.4663534150025172,
 'valid_f1': 0.5528148000795703,
 'valid_auc_score': 0.7328229518582379}

In [None]:
# svm

# fmt: off
param_grid = {
    # "tfidfvectorizer__max_features": [None, 1000, 5000, 10000, 20000],
    # "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    # "tfidfvectorizer__max_df": [0.60, 0.65, 0.70, 0.75, 0.80, 0.90, 1],
    # "onevsrestclassifier__estimator__solver": ["lbfgs", "liblinear", "newton-cg", "saga", "sag"],
    "onevsrestclassifier__estimator__C": [100, 10, 1.0, 0.1, 0.01],
    "onevsrestclassifier__estimator__penalty": ["l2"],
}
# fmt: on

n_jobs = -1

grid_search = GridSearchCV(
    tfidf_lr_svm,
    param_grid,
    cv=cv,
    scoring="f1_micro",  # "f1_weighted"
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=1,
)

grid_search.fit(x_cv, y_cv)

print(grid_search.best_params_)
print(grid_search.best_score_)
# print(grid_search.cv_results_)

best_model = grid_search.best_estimator_
train_predict_evaluate(best_model)

## Save

In [31]:
import pickle

saved_model_path = "/home/bhuang/icd_10/outputs/tfidf_lr/synthetic/model_tfidf_lr.pkl"

os.makedirs(os.path.dirname(saved_model_path), exist_ok=True)
with open(saved_model_path, "wb") as fo:
    pickle.dump(clf, fo)

In [32]:
# reload
model_path = saved_model_path

with open(model_path, "rb") as f:
    loaded_model = pickle.load(f)

## Predict

In [36]:
text = """### Discharge Summary

**Patient: [Nom du patient]**
**Date de naissance: [Date de naissance]**
**Numéro de dossier: [Numéro de dossier]**
**Date d'admission: [Date d'admission]**
**Date de sortie: [Date de sortie]**

**Raison de l'admission:**
Le patient a été admis pour une évaluation et une prise en charge de symptômes récurrents d'infections respiratoires et cutanées. Le patient présente une histoire médicale complexe marquée par une susceptibilité accrue aux infections, suggérant une possible déficience immunitaire.

**Historique médical:**
Le patient a une longue histoire d'infections récurrentes, y compris des pneumonies et des infections cutanées. Les antécédents familiaux révèlent des cas similaires, ce qui renforce l'hypothèse d'une déficience immunitaire héréditaire.

**Évaluation clinique:**
Lors de l'admission, le patient présentait des symptômes de pneumonie, y compris une toux productive, une fièvre élevée et des douleurs thoraciques. L'examen physique a révélé des râles crépitants bilatéraux. Une éruption cutanée était également présente sur les bras et le dos, suggérant une infection bactérienne.

**Résultats des examens:**
Les analyses de laboratoire ont montré une diminution des taux d'immunoglobulines, en particulier les IgG, IgA et IgM. Les tests de fonction immunitaire ont également révélé une réponse anormale aux vaccins, indiquant une déficience dans la production d'anticorps.

**Traitement et gestion:**
Le patient a été traité avec des antibiotiques à large spectre pour les infections respiratoires et cutanées. Des immunoglobulines intraveineuses (IVIG) ont été initiées pour compenser la déficience en anticorps. Une consultation en immunologie a été demandée pour une évaluation plus approfondie et un plan de gestion à long terme.

**Recommandations à la sortie:**
Le patient doit continuer les traitements par IVIG selon le calendrier prescrit. Un suivi régulier avec un immunologue est nécessaire pour surveiller les taux d'immunoglobulines et ajuster le traitement si nécessaire. Le patient et sa famille ont été informés de l'importance de la vaccination et des mesures d'hygiène pour prévenir les infections.

**Plan de suivi:**
Un rendez-vous de suivi est prévu dans deux semaines avec le médecin traitant et l'immunologue. Des analyses de laboratoire seront répétées pour évaluer l'efficacité du traitement et ajuster les doses de IVIG si nécessaire.

**Signature du médecin:**
[Nom du médecin]
[Titre du médecin]
[Date]"""

loaded_model.predict_proba([text])

array([[2.76256692e-04, 7.80984995e-05, 5.17009845e-05, 2.24112679e-04,
        2.00080363e-04, 8.88702360e-05, 1.51310964e-04, 4.83799483e-06,
        2.87879608e-05, 1.64712768e-04, 1.39005550e-04, 1.02794725e-04,
        1.69949373e-03, 1.54932323e-03, 7.45279109e-04, 1.69176379e-04,
        3.24393985e-03, 1.17241979e-04, 4.66060211e-04, 6.68332741e-04,
        6.37089513e-04, 2.39238340e-04, 3.00119058e-04, 3.59374971e-04,
        4.29008353e-04, 4.06532124e-04, 2.00388149e-04, 3.11414806e-04,
        1.49106030e-03, 1.89561116e-03, 7.46730579e-06, 4.35300637e-04,
        1.39169549e-05, 9.03497926e-05, 1.15189075e-03, 7.08509261e-04,
        1.42294859e-04, 1.31733963e-05, 2.93105489e-04, 7.52742366e-05,
        3.10601473e-03, 3.28513726e-04, 1.01357977e-04, 1.09288343e-05,
        4.96619956e-05, 1.66777119e-06, 6.56802115e-06, 5.35894883e-06,
        2.33121726e-05, 6.03850076e-04, 3.90037873e-05, 5.61263242e-05,
        1.29198861e-05, 3.31508616e-05, 2.42528260e-03, 2.042004