# Data Preprocessing

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# load data
import numpy as np
import pandas as pd

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Initialize NLTK's SnowballStemmer and stopwords for French
stemmer = SnowballStemmer('french')
stop_words = set(stopwords.words('french'))

dictionary = {}


def tokenizeText(text, dictionary):
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text using NLTK tokenizer
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Apply stemming using SnowballStemmer
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    # Numerize tokens
    numerical_tokens = []
    for token in stemmed_tokens:
        if token not in dictionary:
            # dictionary[token] = torch.tensor(len(dictionary) + 1,dtype=torch.int32)
            dictionary[token] = len(dictionary) + 1
        numerical_tokens.append(dictionary[token])
    
    # Convert numerical tokens and dictionary values to tensors
    # numerical_tokens_tensor = torch.tensor(numerical_tokens)
    numerical_tokens_np = np.array(numerical_tokens)
    # return numerical_tokens_tensor
    return numerical_tokens_np



[nltk_data] Downloading package punkt to C:\Users\Aziz
[nltk_data]     Hlila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aziz
[nltk_data]     Hlila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df_full = pd.read_csv("data_cleanOutput.csv")
# data = data.drop_duplicates()

df_full.shape

(320, 23)

# Test with sklean

# KNN

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [6]:
dictionary={}
descriptionsTokenized = df_full['Description'].apply(lambda x: tokenizeText(x, dictionary))
descriptionsTokenized

0           [1, 2, 3, 4, 5, 6, 7, 8, 9]
1      [1, 2, 10, 11, 12, 13, 14, 8, 9]
2      [1, 2, 10, 13, 15, 16, 17, 8, 9]
3              [1, 2, 18, 19, 20, 8, 9]
4      [1, 2, 10, 21, 22, 23, 24, 8, 9]
                     ...               
315       [1, 2, 27, 13, 108, 49, 8, 9]
316             [1, 2, 18, 7, 19, 8, 9]
317       [1, 2, 27, 109, 94, 74, 8, 9]
318             [1, 2, 18, 44, 6, 8, 9]
319           [1, 2, 27, 101, 49, 8, 9]
Name: Description, Length: 320, dtype: object

In [7]:
def pad_sequences(sequence_list, maxlen=None):
    if not maxlen:
        maxlen = max(len(seq) for seq in sequence_list)
    padded_sequences = np.zeros((len(sequence_list), maxlen), dtype=int)
    for i, seq in enumerate(sequence_list):
        if len(seq) > 0:  # Check if the sequence is not empty
            padded_sequences[i, :len(seq)] = seq
    return padded_sequences

descriptionsTokenizedPadded = pad_sequences(descriptionsTokenized,300)
print(descriptionsTokenizedPadded.shape)
df_tokens = pd.DataFrame(data=descriptionsTokenizedPadded,index=df_full.index).rename(columns=str)

(320, 300)


In [8]:
X = df_full.drop(columns=['Ordonnance'])
y = df_full['Ordonnance']
X.shape

(320, 22)

In [13]:


# List of numeric and categorical features
numeric_features = ['tailleCm', 'poidsKg', 'IMC', 'nbGrossesse', 'nbEnfantsVivants',
                    'nbMacrosomies', 'nbAvortements', 'nbMortNes', 'ageMenopause',
                    'alcoolSemaine', 'nbCigaretteParJour', 'Age']
categorical_features = ['groupeSanguin', 'HTA', 'diabete', 'dyslipidemie',
                        'tabacStatus',
                        'drogue',]


# Preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Apply preprocessing to numeric and categorical features
X_preprocessed = preprocessor.fit_transform(X)
print("before ",X.shape,"after ",X_preprocessed.shape)
X_combined = np.hstack((X_preprocessed, descriptionsTokenizedPadded))
print(X_combined.shape)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=13)


before  (320, 22) after  (320, 30)
(320, 330)


In [14]:
# save preproccesser
from joblib import dump, load
pippreprocessor_file = 'preprocessor.joblib'
dump(preprocessor, pippreprocessor_file)


['preprocessor.joblib']

In [13]:
from joblib import dump, load
from sklearn.preprocessing import FunctionTransformer
pipeline_file = 'pipeline.joblib'

def concatenate_features(X):
    return np.hstack((X, descriptionsTokenizedPadded))

preprocessor=load(pippreprocessor_file)

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_concatenation', FunctionTransformer(concatenate_features, validate=False))
])

# Fit and transform your data
X_combined = pipeline.fit_transform(X)

# Later, you can load the pipeline from the file
loaded_pipeline = load(pipeline_file)
loaded_pipeline.transform(X)

array([[ 1.44038222,  0.327272  , -0.71714759, ...,  0.        ,
         0.        ,  0.        ],
       [-0.56828561, -1.04350077, -1.18375748, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.77082628, -0.12965226, -0.80198575, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.63691509,  0.96696596,  0.97961565, ...,  0.        ,
         0.        ,  0.        ],
       [-1.23784155, -1.59180988, -1.56552921, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.84211579,  1.69804477,  0.89477749, ...,  0.        ,
         0.        ,  0.        ]])

In [63]:
example = pd.read_csv("validation.csv")
example

Unnamed: 0,tailleCm,poidsKg,groupeSanguin,IMC,adresse,HTA,diabete,dyslipidemie,autresAntecedentsFamiliaux,nbGrossesse,...,ageMenopause,autresAntecedentsGynecoObstetriques,alcoolSemaine,tabacStatus,nbCigaretteParJour,drogue,autreHabitudeToxique,Age,sexe,Description
0,180,75,O+,23.1,123 Rue,non,non,oui,,0.0,...,0.0,,0,non-fumeur,0.0,non,non,42,homme,"""Le patient souffre de maux de tête fréquents..."


In [64]:
pipeline.transform(example)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1 and the array at index 1 has size 320

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier on the training data
knn.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = knn.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(X_test.shape)

Accuracy: 0.40625
(64, 352)


In [38]:
knn.predict_proba(X_test)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
distances, indices = knn.kneighbors(X_test, n_neighbors=10)
# Convert indices to actual predictions
top_k_predictions = np.array(y_train)[indices]


In [40]:
def top_n_accuracy(y_true, y_pred, n=3):
    top_n_correct = 0
    for true, pred in zip(y_true, y_pred):
        if true in pred[:n]:
            top_n_correct += 1
    return top_n_correct / len(y_true)

def mean_reciprocal_rank(y_true, y_pred):
    rr_sum = 0
    for true, pred in zip(y_true, y_pred):
        if true in pred:
            rr_sum += 1 / (pred.tolist().index(true) + 1)
    return rr_sum / len(y_true)

def precision_at_k(y_true, y_pred, k=3):
    precision_sum = 0
    for true, pred in zip(y_true, y_pred):
        if true in pred[:k]:
            precision_sum += 1
    return precision_sum / (k * len(y_true))

def coverage(y_pred, all_items, k=3):
    recommended_items = set()
    for pred in y_pred:
        recommended_items.update(pred[:k])
    return len(recommended_items) / len(all_items)

# Example: Evaluating the top 3 recommendations
top_3_acc = top_n_accuracy(y_test, top_k_predictions, n=7)
mrr = mean_reciprocal_rank(y_test, top_k_predictions)
precision_k = precision_at_k(y_test, top_k_predictions, k=7)
all_possible_items = set(y_train)  # Use the unique items from the training set as all possible items
coverage_score = coverage(top_k_predictions, all_possible_items, k=7)

# Print evaluation metrics
print(f"Top-3 Accuracy: {top_3_acc:.2f}")
print(f"Mean Reciprocal Rank: {mrr:.2f}")
print(f"Coverage at 3: {coverage_score:.2f}")


Top-3 Accuracy: 0.89
Mean Reciprocal Rank: 0.87
Coverage at 3: 0.85


## export model with dict

In [44]:
# import joblib


['knn_model.joblib']

In [16]:
import pickle
with open('tokenizer_dict.pkl', 'wb') as f:
    pickle.dump(dictionary, f)
pickle.dump(knn, 'knn_model.joblib')

### Test load

In [49]:
knn2 = pickle.load("knn_model.joblib")
with open('tokenizer_dict.pkl', 'rb') as f:
    tokenizer_dict = pickle.load(f)

In [48]:
knn2.predict(X_test)

array([' "Prendre un comprimé de lisinopril 10mg une fois par jour."',
       ' "Prendre metformine 500mg deux fois par jour et amlodipine 5mg une fois par jour."',
       ' "Prescrire un antispasmodique à prendre au besoin."',
       ' "Prescrire un bêta-bloquant et une statine à prendre quotidiennement."',
       ' "Prendre fer 80mg une fois par jour."',
       ' "Prescrire un antitussif et recommander une radiographie thoracique."',
       ' "Prescrire un bêta-bloquant et une statine à prendre quotidiennement."',
       ' "Prescrire un triptan à prendre au début des symptômes de la migraine."',
       ' "Prescrire un analgésique et recommander des séances de kinésithérapie."',
       ' "Prescrire un triptan à prendre au début des symptômes de la migraine."',
       ' "Prescrire un diurétique et recommander un régime pauvre en sel."',
       ' "Administrer 10 unités d\'insuline avant chaque repas."',
       ' "Prescrire un antitussif et recommander une radiographie thoracique."',
   

# Decision tree

In [41]:
from sklearn.tree import DecisionTreeClassifier
from collections import Counter


In [42]:
# Train the Decision Tree model on the training data
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predict the leaf node for the test data
leaf_nodes = decision_tree.apply(X_test)

# Get the samples that fall into each leaf node in the training data
train_leaf_nodes = decision_tree.apply(X_train)
leaf_node_to_samples = {leaf_node: [] for leaf_node in np.unique(train_leaf_nodes)}
for idx, leaf_node in enumerate(train_leaf_nodes):
    leaf_node_to_samples[leaf_node].append(idx)

# Get top 3 predictions based on majority class in each leaf node
top_k_predictions = []
for leaf_node in leaf_nodes:
    samples = leaf_node_to_samples[leaf_node]
    top_labels = [y_train.iloc[i] for i in samples]
    top_3_labels = [label for label, _ in Counter(top_labels).most_common(3)]
    top_k_predictions.append(top_3_labels)

In [54]:
# Define evaluation functions
def top_n_accuracy(y_true, top_n_preds, n=3):
    correct = 0
    for true, preds in zip(y_true, top_n_preds):
        if true in preds[:n]:
            correct += 1
    return correct / len(y_true)

def mean_reciprocal_rank(y_true, top_n_preds):
    ranks = []
    for true, preds in zip(y_true, top_n_preds):
        if true in preds:
            rank = preds.index(true) + 1
            ranks.append(1 / rank)
        else:
            ranks.append(0)
    return np.mean(ranks)

def coverage(top_n_preds, all_possible_items, k=3):
    recommended_items = set()
    for preds in top_n_preds:
        recommended_items.update(preds[:k])
    return len(recommended_items) / len(all_possible_items)

# Evaluate the recommendations
top_3_acc = top_n_accuracy(y_test, top_k_predictions, n=5)
mrr = mean_reciprocal_rank(y_test, top_k_predictions)
all_possible_items = set(y_train)  # Use the unique items from the training set as all possible items
coverage_score = coverage(top_k_predictions, all_possible_items, k=5)

print(f"Top-3 Accuracy: {top_3_acc*100:.1f}%")
print(f"Mean Reciprocal Rank: {mrr*100:.1f}%")
print(f"Coverage at 3: {coverage_score*100:.1f}%")

NameError: name 'top_k_predictions' is not defined