# Data Preprocessing

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
# load data
import numpy as np
import pandas as pd

# Download NLTK resources (if not already downloaded)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# French stop-word set and Snowball stemmer, both read by tokenizeText.
stop_words = set(stopwords.words('french'))
stemmer = SnowballStemmer('french')

# Stem -> integer id vocabulary; starts empty.
dictionary = dict()


def tokenizeText(text, dictionary):
    """Convert a French text into a NumPy array of integer token ids.

    The text is lower-cased, tokenized with NLTK, filtered against the
    module-level French ``stop_words`` set, stemmed with the module-level
    Snowball ``stemmer`` and finally mapped to integer ids.

    Parameters
    ----------
    text : str
        Raw text to encode. Any non-string value (for example the float NaN
        that pandas uses for a missing cell) is treated like empty text.
    dictionary : dict
        Mapping ``stem -> id``. It is MUTATED in place: each stem not yet
        present receives ``len(dictionary) + 1``, so ids start at 1 and the
        value 0 is never assigned to a token.

    Returns
    -------
    numpy.ndarray
        1-D array with one id per remaining token, in order of appearance.
        The array is empty when no token remains.
    """
    # A missing description is not a str and has no .lower(); return the same
    # empty array an empty string would produce instead of raising.
    if not isinstance(text, str):
        return np.array([])

    # Stop words and stemmer are French, so split sentences with the French
    # Punkt model as well (word_tokenize defaults to English).
    tokens = word_tokenize(text.lower(), language='french')

    # Drop stop words first, then stem what remains.
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # setdefault assigns the next free id on first sight and otherwise returns
    # the id already stored; len(dictionary) is read before the insertion.
    numerical_tokens = [
        dictionary.setdefault(stem, len(dictionary) + 1) for stem in stems
    ]
    return np.array(numerical_tokens)



KeyboardInterrupt: 

In [41]:
# Read the dataset from data_cleanOutput.csv (path is relative to the working directory).
df_full = pd.read_csv("data_cleanOutput.csv")

# Display (rows, columns).
df_full.shape

(424, 23)

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



In [43]:
# Start from an empty vocabulary; tokenizeText fills it while encoding each row.
dictionary = dict()
descriptionsTokenized = df_full['Description'].apply(tokenizeText, args=(dictionary,))
descriptionsTokenized

0           [1, 2, 3, 4, 5, 6, 7, 8, 9]
1      [1, 2, 10, 11, 12, 13, 14, 8, 9]
2      [1, 2, 10, 13, 15, 16, 17, 8, 9]
3              [1, 2, 18, 19, 20, 8, 9]
4      [1, 2, 10, 21, 22, 23, 24, 8, 9]
                     ...               
419       [1, 2, 27, 13, 108, 49, 8, 9]
420             [1, 2, 18, 7, 19, 8, 9]
421       [1, 2, 27, 109, 94, 74, 8, 9]
422             [1, 2, 18, 44, 6, 8, 9]
423           [1, 2, 27, 101, 49, 8, 9]
Name: Description, Length: 424, dtype: object

In [44]:
def pad_sequences(sequence_list, maxlen=None):
    """Stack variable-length integer sequences into one zero-padded 2-D array.

    Parameters
    ----------
    sequence_list : iterable of sequences
        Each element is a list or 1-D array of integers; empty elements are
        allowed and produce an all-zero row.
    maxlen : int, optional
        Number of columns of the result. When omitted (or falsy) the length of
        the longest sequence is used. Sequences longer than ``maxlen`` are
        truncated on the right; shorter ones are right-padded with 0.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(sequence_list), maxlen)``.
    """
    # Materialise once so generators work and the input is iterated only once.
    sequences = list(sequence_list)
    if not maxlen:
        # default=0 keeps an empty input from raising inside max().
        maxlen = max((len(seq) for seq in sequences), default=0)

    padded_sequences = np.zeros((len(sequences), maxlen), dtype=int)
    for i, seq in enumerate(sequences):
        # Never copy more than the row can hold: assigning a longer sequence
        # into a maxlen-wide slice raises a broadcasting ValueError.
        kept = min(len(seq), maxlen)
        if kept > 0:  # empty sequences stay all zeros
            padded_sequences[i, :kept] = seq[:kept]
    return padded_sequences

# Zero-pad every tokenized description to a fixed width of 300 ids.
descriptionsTokenizedPadded = pad_sequences(descriptionsTokenized, maxlen=300)
print(descriptionsTokenizedPadded.shape)

# One row per description (same index as df_full), one string-named column per position.
df_tokens = pd.DataFrame(descriptionsTokenizedPadded, index=df_full.index)
df_tokens.columns = df_tokens.columns.map(str)
df_tokens

(424, 300)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,1,2,3,4,5,6,7,8,9,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,10,11,12,13,14,8,9,0,...,0,0,0,0,0,0,0,0,0,0
2,1,2,10,13,15,16,17,8,9,0,...,0,0,0,0,0,0,0,0,0,0
3,1,2,18,19,20,8,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,10,21,22,23,24,8,9,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419,1,2,27,13,108,49,8,9,0,0,...,0,0,0,0,0,0,0,0,0,0
420,1,2,18,7,19,8,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
421,1,2,27,109,94,74,8,9,0,0,...,0,0,0,0,0,0,0,0,0,0
422,1,2,18,44,6,8,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# 'Ordonnance' is the prediction target; every other column stays in X.
y = df_full['Ordonnance']
X = df_full.drop('Ordonnance', axis=1)
X.shape

(424, 22)

In [46]:
# Frequency of each tailleCm value; dropna=False also counts missing entries.
X["tailleCm"].value_counts(dropna=False)

tailleCm
160    51
165    47
170    43
175    37
162    35
180    33
168    29
178    24
172    21
158    17
176    16
174    13
182    12
155     9
164     8
173     4
167     4
177     4
185     4
163     3
183     3
169     2
159     2
157     1
161     1
166     1
Name: count, dtype: int64

In [47]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# List of numeric and categorical features
# Columns of X that are not named in either list are not used by this cell.
numeric_features = ['tailleCm', 'poidsKg', 'IMC', 'nbGrossesse', 'nbEnfantsVivants',
                    'nbMacrosomies', 'nbAvortements', 'nbMortNes', 'ageMenopause',
                    'alcoolSemaine', 'nbCigaretteParJour', 'Age']
categorical_features = ['groupeSanguin', 'HTA', 'diabete', 'dyslipidemie', 'tabacStatus',
                        'drogue']


# Preprocessing pipelines for numeric and categorical data
# Numeric columns: fill missing values with the column mean, then standardize.
# NOTE(review): both pipelines below are fitted on every row of X, i.e. before
# the train/test split done in a later cell, so the imputation and scaling
# statistics also include the rows later used for evaluation — confirm this
# leakage is acceptable or fit on the training rows only.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
numeric_transformed = numeric_transformer.fit_transform(X[numeric_features].to_numpy())


# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes categories not seen during
# fit encode as all zeros at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_transformed  = categorical_transformer.fit_transform(X[categorical_features].to_numpy())

# Densify the one-hot block (.toarray()) so it can be stacked column-wise with
# the numeric block.
preprocessed_data = np.hstack((numeric_transformed, categorical_transformed.toarray()))


In [48]:


# Append the padded token ids to the tabular features, column-wise.
X_combined = np.concatenate([preprocessed_data, descriptionsTokenizedPadded], axis=1)
print(X_combined.shape)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=13
)


(424, 330)


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Fit a 5-nearest-neighbour classifier on the training split
# (fit returns the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and compare with their true labels.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(X_test.shape)

Accuracy: 0.6352941176470588
(85, 330)


In [50]:
## train all 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Final model: fitted on every available row (training and test rows together).
knn_product = KNeighborsClassifier(n_neighbors=5)
knn_product.fit(X_combined, y)

# X_test is a subset of X_combined, so knn_product has already seen every row
# scored below, each one together with its true label. The resulting number is
# therefore a score on training data, NOT an estimate of how the model
# generalises; the held-out accuracy of `knn` is the figure to report.
# Separate variable names keep that held-out y_pred / accuracy from being
# overwritten by this optimistic score.
y_pred_seen = knn_product.predict(X_test)
accuracy_seen = accuracy_score(y_test, y_pred_seen)
print("Accuracy on rows seen during training (not a generalisation estimate):", accuracy_seen)
print(X_test.shape)



Accuracy: 0.8941176470588236
(85, 330)


In [51]:
# For every test row, fetch the 10 nearest rows of the training split.
distances, indices = knn.kneighbors(X_test, n_neighbors=10)
# Map neighbour row positions to the labels of those training rows; a row may
# repeat a label when several neighbours share it.
top_k_predictions = np.asarray(y_train)[indices]


In [52]:
def top_n_accuracy(y_true, y_pred, n=3):
    """Share of samples whose true label is among their first ``n`` predictions.

    Parameters
    ----------
    y_true : sized iterable
        True label of each sample.
    y_pred : iterable of sequences
        Ranked predictions per sample, aligned with ``y_true``.
    n : int, default 3
        How many leading predictions of each row are inspected.

    Returns
    -------
    float
        Number of hits divided by ``len(y_true)``.
    """
    # Each membership test yields a bool; summing them counts the hits.
    hits = sum(label in ranked[:n] for label, ranked in zip(y_true, y_pred))
    return hits / len(y_true)

def mean_reciprocal_rank(y_true, y_pred):
    """Mean of ``1 / rank`` of the true label within each prediction row.

    Parameters
    ----------
    y_true : sized iterable
        True label of each sample.
    y_pred : iterable of sequences
        Ranked predictions per sample, aligned with ``y_true``. Rows may be
        NumPy arrays, lists or tuples.

    Returns
    -------
    float
        Sum of reciprocal ranks divided by ``len(y_true)``. The rank is the
        1-based position of the first occurrence of the true label; a row
        that does not contain the true label contributes 0.
    """
    rr_sum = 0
    for true, pred in zip(y_true, y_pred):
        # NumPy rows expose .tolist(); plain lists and tuples do not, so fall
        # back to list() instead of raising AttributeError on them.
        ranked = pred.tolist() if hasattr(pred, "tolist") else list(pred)
        if true in ranked:
            rr_sum += 1 / (ranked.index(true) + 1)
    return rr_sum / len(y_true)

def precision_at_k(y_true, y_pred, k=3):
    """Precision of the first ``k`` predictions, averaged over all samples.

    Each sample has exactly one true label, so a row scores at most one hit
    among its first ``k`` predictions.

    Parameters
    ----------
    y_true : sized iterable
        True label of each sample.
    y_pred : iterable of sequences
        Ranked predictions per sample, aligned with ``y_true``.
    k : int, default 3
        How many leading predictions of each row are inspected.

    Returns
    -------
    float
        Number of rows with a hit divided by ``k * len(y_true)``.
    """
    precision_sum = 0
    for true, pred in zip(y_true, y_pred):
        if true in pred[:k]:
            precision_sum += 1
    # The original line read `r+eturn ...`, a SyntaxError that prevented the
    # whole cell from being defined.
    return precision_sum / (k * len(y_true))

def coverage(y_pred, all_items, k=3):
    """Ratio of distinct recommended items to the size of the item catalogue.

    Parameters
    ----------
    y_pred : iterable of sequences
        Ranked predictions per sample.
    all_items : sized collection
        Every item that could be recommended.
    k : int, default 3
        How many leading predictions of each row are counted as recommended.

    Returns
    -------
    float
        Count of distinct items found in the first ``k`` predictions of any
        row, divided by ``len(all_items)``.
    """
    # Union of all top-k slices; with no rows this is simply the empty set.
    seen = set().union(*(ranked[:k] for ranked in y_pred))
    return len(seen) / len(all_items)

# Evaluate the neighbour-based recommendations at one cut-off.
# The metrics were computed with 5 while the printed labels said 3; a single
# constant now drives both the arguments and the labels so they cannot diverge.
TOP_K = 5

top_k_acc = top_n_accuracy(y_test, top_k_predictions, n=TOP_K)
top_3_acc = top_k_acc  # legacy name, kept so any later cell reading it still works
# MRR looks at the full prediction rows (all retrieved neighbours), not only the first TOP_K.
mrr = mean_reciprocal_rank(y_test, top_k_predictions)
precision_k = precision_at_k(y_test, top_k_predictions, k=TOP_K)
all_possible_items = set(y_train)  # Use the unique items from the training set as all possible items
coverage_score = coverage(top_k_predictions, all_possible_items, k=TOP_K)

# Print evaluation metrics (precision was computed before but never reported)
print(f"Top-{TOP_K} Accuracy: {top_k_acc*100:.1f}%")
print(f"Mean Reciprocal Rank: {mrr*100:.1f}%")
print(f"Precision at {TOP_K}: {precision_k*100:.1f}%")
print(f"Coverage at {TOP_K}: {coverage_score*100:.1f}%")


Top-3 Accuracy: 92.9%
Mean Reciprocal Rank: 93.1%
Coverage at 3: 77.2%


# Save the model

In [55]:
import os
import pickle

import joblib

# Create the output folder first: joblib.dump and open(..., 'wb') both raise
# FileNotFoundError when the parent directory does not exist.
os.makedirs('modeldata', exist_ok=True)

# NOTE(review): this persists `knn` (fitted on the training split only), not
# `knn_product` (fitted on all rows) — confirm which model is meant to ship.
joblib.dump(knn, 'modeldata/knn_model.joblib')

# The token -> id vocabulary is needed to encode new descriptions the same way.
with open('modeldata/tokenizer_dict.pkl', 'wb') as f:
    pickle.dump(dictionary, f)

In [54]:
# Destination paths for the two fitted preprocessing pipelines.
# This cell uses `joblib`, which is imported in the preceding cell.
numeric_transformer_file = 'modeldata/numeric_transformer.joblib'
categorical_transformer_file = 'modeldata/categorical_transformer.joblib'

# Persist both pipelines. joblib.dump returns the list of files it wrote, so
# the cell displays the path of the last dump.
joblib.dump(numeric_transformer,numeric_transformer_file)
joblib.dump(categorical_transformer,categorical_transformer_file)

['modeldata/categorical_transformer.joblib']