In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import faiss
import numpy as np
from tqdm import tqdm

In [21]:
class TermPreprocessor:
    def __init__(self):
        self.stopwords = stopwords.words('french')
        self.stemmer = FrenchStemmer()

    def preprocess(self, term):
        tokens = term.lower().split()
        tokens = [token for token in tokens if token not in self.stopwords]
        tokens = [self.stemmer.stem(token) for token in tokens]
        return " ".join(tokens)

In [22]:
class SNOMEDDataLoader:
    def __init__(self, folder_path_int, folder_path_fr):
        self.folder_path_int = folder_path_int
        self.folder_path_fr = folder_path_fr

    def load_data(self) -> pd.DataFrame:
        descriptions_int = pd.read_csv(
            f"{self.folder_path_int}sct2_Description_Snapshot-en_INT_20240201.txt",
            sep="\t",
            usecols=["conceptId", "active", "term"],
        )
        descriptions_int = descriptions_int[
            (descriptions_int["active"] == 1)
        ]
        descriptions_int = descriptions_int.rename(
            columns={"conceptId": "Concept_ID", "term": "Synonym_INT"}
        )
        
        descriptions_fr = pd.read_csv(
            f"{self.folder_path_fr}sct2_Description_Snapshot_CommonFrench-Extension_20240331.txt",
            sep="\t",
            usecols=["conceptId", "active", "term"],
        )
        descriptions_fr = descriptions_fr[
            (descriptions_fr["active"] == 1)
        ]
        descriptions_fr = descriptions_fr.rename(
            columns={"conceptId": "Concept_ID", "term": "Synonym_FR"}
        )
        
        merged_data = pd.merge(
            descriptions_int, descriptions_fr, on="Concept_ID", how="left"
        )
        
        merged_data = merged_data.fillna("")
        merged_data = merged_data.groupby(["Concept_ID"], as_index=False).agg(
            {"Synonym_INT": " ".join, "Synonym_FR": " ".join}
        )
        
        return merged_data


In [17]:
class SNOMEDAligner:
    def __init__(self, data: pd.DataFrame, preprocessor: TermPreprocessor):
        self.data = data
        self.preprocessor = preprocessor
        self.vectorizer = TfidfVectorizer()
        self.index = None
        
    def build_index(self, batch_size=10000):
        corpus = []
        for i in tqdm(range(0, len(self.data), batch_size)):
            batch_data = self.data.iloc[i:i+batch_size]
            batch_corpus = batch_data['Synonym_INT'].apply(self.preprocessor.preprocess).tolist() + batch_data['Synonym_FR'].apply(self.preprocessor.preprocess).tolist()
            corpus.extend(batch_corpus)
        
        tfidf_matrix = self.vectorizer.fit_transform(corpus)
        self.index = faiss.IndexFlatIP(tfidf_matrix.shape[1])
        self.index.add(tfidf_matrix.toarray())
        
    def align_term(self, term, top_k=5):
        preprocessed_term = self.preprocessor.preprocess(term)
        term_vector = self.vectorizer.transform([preprocessed_term]).toarray()
        
        distances, indices = self.index.search(term_vector, top_k)
        top_match_concepts = [self.data.iloc[idx]['Concept_ID'] for idx in indices[0]]
        
        if not top_match_concepts:
            return "Aucune correspondance SNOMED CT satisfaisante trouvée."
        return top_match_concepts


In [9]:
globalPath = 'data/ref/'
folder_path_int = globalPath
folder_path_fr = globalPath

In [10]:
data_loader = SNOMEDDataLoader(folder_path_int, folder_path_fr)
data = data_loader.load_data()

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jessy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
preprocessor = TermPreprocessor()

In [18]:
aligner = SNOMEDAligner(data, preprocessor)

In [19]:
aligner.build_index(batch_size=10000)

MemoryError: Unable to allocate 460. GiB for an array with shape (61780130098,) and data type int64

In [None]:
aligned_concepts = aligner.align_term("Paracétamol 300 mg suppositoire")
print(f"Les 5 concepts SNOMED CT les plus proches sont : {aligned_concepts}")