In [None]:
!pip install dill



In [None]:
!pip install bnlearn



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the paths used below resolve.
# NOTE: the name `drive` is rebound to a PyDrive client in the authentication cell further down.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive` (until now the google.colab.drive module used for
# drive.mount) to the PyDrive client; the download cells below call
# drive.CreateFile on it, so this cell must run before them.
drive = GoogleDrive(gauth)



In [None]:
# Install the dexire wheel stored on the mounted Drive (the spaces in the folder name are backslash-escaped).
!pip install /content/drive/MyDrive/Train\ expectation\ model\ /Installers/dexire-0.0.1-py3-none-any.whl

Processing ./drive/MyDrive/Train expectation model /Installers/dexire-0.0.1-py3-none-any.whl
dexire is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [None]:
from dexire.dexire import DEXiRE
import dexire.dexire as dex

In [None]:
import json
import os
import numpy as np
import pandas as pd
from scipy import stats
from glob import glob
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt
import seaborn as sns
import dill
dill.settings['recurse'] = True

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
from collections import Counter

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from typing import Tuple
from yellowbrick.cluster import KElbowVisualizer
from pathlib import Path
import pickle
from enum import Enum

class DimensionalityReduction(str, Enum):
    """Names of the 2-D projection methods accepted by ``ClusterAnalysis.plot_clusters``."""
    # The ``str`` mix-in lets a member compare equal to its plain string value,
    # so callers may pass either the member or e.g. "PCA".
    PCA = "PCA"
    TSNE = "TSNE"
    UMAP = "UMAP"
    KernelPCA = "KernelPCA"


class ClusterAnalysis:
    """K-means clustering helper with elbow-based selection of the cluster count.

    Attributes set by the instance:

    * ``n_clusters`` -- number of clusters configured or selected so far
    * ``cluster_model`` -- the fitted ``KMeans`` estimator (``None`` until fitted/loaded)
    * ``wcss`` -- inertia per candidate k from the most recent elbow search
    * ``scaler`` -- ``StandardScaler`` fitted only by
      :meth:`automatically_choose_cluster_numbers` when ``normalize`` is True
    """

    def __init__(self,
                 n_cluster:int=None,
                 normalize: bool = True) -> None:
        """Store the configuration; nothing is fitted here.

        :param n_cluster: cluster count used by :meth:`fit` when the call does
            not pass ``n_clusters`` explicitly
        :type n_cluster: int
        :param normalize: standardize the data before the elbow search
        :type normalize: bool
        """
        self.n_clusters = n_cluster
        self.cluster_model = None
        self.wcss = []
        self.normalize = normalize
        self.scaler = StandardScaler()

    def fit(self, X: np.array, n_clusters: int = None) -> None:
        """Fit a ``KMeans`` model on ``X`` as given (no scaling is applied here).

        :param X: samples to cluster
        :type X: np.array
        :param n_clusters: cluster count; falls back to the value given to the
            constructor (or selected earlier) when omitted
        :type n_clusters: int
        :raises ValueError: if no cluster count greater than 1 is available
        """
        # Honour the constructor argument instead of silently doing nothing.
        if n_clusters is None:
            n_clusters = self.n_clusters
        if n_clusters is None or n_clusters <= 1:
            raise ValueError(
                "The cluster number have not been set. Model cannot be fitted.")
        self.n_clusters = int(n_clusters)
        self.cluster_model = KMeans(n_clusters=self.n_clusters)
        self.cluster_model.fit(X)

    def predict(self, X: np.array) -> np.array:
        """Assign each row of ``X`` to a cluster of the fitted model.

        :param X: samples to assign
        :type X: np.array
        :raises NotFittedError: if no model has been fitted or loaded yet
        :return: cluster index per sample
        :rtype: np.array
        """
        # NOTE(review): a model produced by automatically_choose_cluster_numbers
        # with normalize=True was trained on standardized data, while X is used
        # here exactly as passed -- confirm that callers scale X themselves.
        if self.cluster_model is None:
            raise NotFittedError(
                "The model has not been fitted and cannot be used for prediction.")
        try:
            check_is_fitted(self.cluster_model)
        except NotFittedError as exc:
            raise NotFittedError(
                "The model has not been fitted and cannot be used for prediction.") from exc
        return self.cluster_model.predict(X)

    def automatically_choose_cluster_numbers(self,
                                             X: np.array,
                                             max_clusters: int = 11) -> bool:
        """Run an elbow search over k = 2 .. max_clusters - 1 and fit the chosen model.

        :param X: samples to cluster (standardized first when ``normalize`` is True)
        :type X: np.array
        :param max_clusters: exclusive upper bound of the candidate cluster counts
        :type max_clusters: int
        :raises ValueError: if fewer than three candidates are available or the
            inertia curve has no point of positive curvature
        :return: ``True`` once the model has been fitted
        :rtype: bool
        """
        if self.normalize:
            X = self.scaler.fit_transform(X)
        possible_n_clusters = np.arange(2, max_clusters)
        if len(possible_n_clusters) < 3:
            raise ValueError(
                "max_clusters must be at least 5 so that a second difference can be computed.")
        # Collect into a fresh list: appending to self.wcss across calls would
        # shift the elbow index on any second invocation.
        wcss = []
        for i in possible_n_clusters:
            kmeans = KMeans(n_clusters=i, init='k-means++',
                            max_iter=300,
                            n_init=10,
                            random_state=0)
            kmeans.fit(X)
            wcss.append(kmeans.inertia_)
        self.wcss = wcss
        self.plot_elbow(X, max_cluster=max_clusters)
        # First and second discrete differences of the inertia curve.
        delta_wcss = np.diff(self.wcss)
        delta2_wcss = np.diff(delta_wcss)
        convex_positions = np.flatnonzero(delta2_wcss > 0)
        if convex_positions.size == 0:
            raise ValueError("No elbow point found in the inertia curve.")
        # NOTE(review): the +2 offset is kept from the original heuristic; with
        # candidates starting at k=2 the selected k is (first convex position + 4).
        elbow_index = convex_positions[0] + 2
        best_cluster = possible_n_clusters[elbow_index]
        self.n_clusters = int(best_cluster)
        # fit the model
        self.fit(X, self.n_clusters)
        return True

    def plot_elbow(self,
                   X: np.array,
                   out_file: Path = None,
                   max_cluster: int = 11) -> None:
        """Draw the yellowbrick elbow plot for k in ``(2, max_cluster)``.

        :param X: samples to cluster
        :type X: np.array
        :param out_file: when given, passed to the visualizer as ``outpath``
        :type out_file: Path
        :param max_cluster: upper end of the k range handed to the visualizer
        :type max_cluster: int
        """
        kmeans = KMeans()
        visualizer = KElbowVisualizer(kmeans, k=(2, max_cluster))
        visualizer.fit(X)  # Fit the data to the visualizer
        if out_file is not None:
            visualizer.show(outpath=out_file)
        else:
            visualizer.show()

    def plot_clusters(self,
                      X: np.array,
                      y: np.array,
                      dimensionality_reduction: str = DimensionalityReduction.PCA):
        """Project ``X`` to two dimensions and scatter-plot it coloured by ``y``.

        :param X: samples to project
        :type X: np.array
        :param y: one colour value (e.g. cluster label) per sample
        :type y: np.array
        :param dimensionality_reduction: a ``DimensionalityReduction`` member or
            its string value
        :type dimensionality_reduction: str
        :raises ValueError: if ``X`` and ``y`` differ in length or the method is unknown
        :raises ImportError: if UMAP is requested but ``umap-learn`` is not installed
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y have different shapes")
        if dimensionality_reduction == DimensionalityReduction.PCA:
            pca = PCA(n_components=2)
            X_new = pca.fit_transform(X)
        elif dimensionality_reduction == DimensionalityReduction.TSNE:
            tsne = TSNE(n_components=2, random_state=0)
            X_new = tsne.fit_transform(X)
        elif dimensionality_reduction == DimensionalityReduction.UMAP:
            # `umap` was referenced without ever being imported; import it
            # lazily so the other reducers keep working without the package.
            try:
                import umap
            except ImportError as exc:
                raise ImportError(
                    "UMAP projection requires the 'umap-learn' package.") from exc
            reducer = umap.UMAP(n_components=2, random_state=0)
            X_new = reducer.fit_transform(X)
        elif dimensionality_reduction == DimensionalityReduction.KernelPCA:
            kpca = KernelPCA(n_components=2, kernel='rbf')
            X_new = kpca.fit_transform(X)
        else:
            raise ValueError(f"Dimensionality reduction {dimensionality_reduction} is not supported.")
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(1, 1, 1)
        scatter = ax.scatter(
            X_new[:, 0],
            X_new[:, 1],
            c=y,
            cmap='viridis',
            edgecolor='k',
            s=50
        )
        ax.set_title('Clusters')
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        plt.colorbar(scatter)
        plt.show()

    def save_cluster_model(self, file_path:str):
        """Pickle ``cluster_model`` to ``file_path`` (the scaler is not saved).

        :param file_path: destination file
        :type file_path: str
        """
        with open(file_path, 'wb') as f:
            pickle.dump(self.cluster_model, f)

    def load_cluster_model(self, path: str) -> None:
        """Load a pickled cluster model from ``path`` into ``cluster_model``.

        :param path: file written by :meth:`save_cluster_model`
        :type path: str
        """
        # SECURITY: unpickling can execute arbitrary code; only load files from trusted sources.
        with open(path, 'rb') as f:
            self.cluster_model = pickle.load(f)
        # Keep the recorded cluster count in step with the loaded estimator.
        self.n_clusters = getattr(self.cluster_model, "n_clusters", self.n_clusters)

In [None]:
# Root results folder on the mounted Drive (note the folder name "Train expectation model " ends with a space).
base_path = "/content/drive/MyDrive/Train expectation model /results"

## Load recipes

In [None]:
# Drive file ID of the recipe table; the trailing comment keeps previously used IDs.
file_recipes_id = "1EN3b4zt69ItW4G0mg9DQ9OLACS6jO9nY" #"14XbCR2iwjuiHN6XvAKZvbYDkLaWDb0kS" #"11cOxIBXEgHKzDeuWK2RLZaj6UB6b5EE9" "1Fa0pSt3Q2yGhl3X_7dlQaY53Df85dEyW"
# Fetch the file through the PyDrive client and store it locally as df_recipes.csv.
downloaded = drive.CreateFile({'id':file_recipes_id})
downloaded.GetContentFile('df_recipes.csv')

In [None]:
# The recipe file is pipe-separated; its first column becomes the index.
df_recipes = pd.read_csv('df_recipes.csv', sep="|", index_col=0)
df_recipes.head(4)

Unnamed: 0,name,raw_text,cultural_restriction,calories,allergens,recipeId,ingredients,instructions,carbohydrates,fat,fiber,protein,taste,cooking_style,meal_type,prep_time,cuisine,price,ingredients_list
0,Fruit Salad,"1. Fruit Salad: 70 calories per portion, 4 por...",vegan,70.0,NotAllergens,food_0,ingredients:\n- 1 apple\n- 1 banana\n- 1 orang...,\n1. wash and cut all the fruits into bite-siz...,223.0,2.0,0.0,15.0,sweet,mixed,fruit-based,15.0,International,2.0,"apple, banana, orange, grape, strawberry, pine..."
1,Vegan Omelette,"5. Vegan Omelette: 300 calories per portion, 1...",vegan,300.0,legumes,food_4,ingredients:\n- 1 cup chickpea flour\n- 1 cup ...,"\n1. in a mixing bowl, whisk together the chic...",100.0,20.5,6.0,15.0,salty,sauteed,veggie,20.0,", Vegan",2.0,"chickpea, flour, water, yeast, turmeric, garli..."
2,Vegan French Toast,7. Vegan French Toast: 400 calories per portio...,vegan,400.0,NotAllergens,food_6,ingredients:\n- 4 slices of vegan bread\n- 1 c...,"\n1. in a shallow dish, whisk together the alm...",115.0,38.5,11.8,17.0,sweet,sauteed,vegan,25.0,", French, Vegan",1.0,"vegan, bread, unsweetened, almond, milk, groun..."
3,Granola with Soy Milk,10. Granola with Soy Milk: 550 calories per po...,vegan,550.0,tree nuts,food_9,ingredients:\n- 3 cups rolled oats\n- 1 cup nu...,\n1. preheat your oven to 325°f (165°c).\n2. i...,13.1,26.0,1.5,5.5,sweet,baked,grain-based,25.0,International,2.0,"rolled, oat, nut, almond, walnut, pecan, dried..."


In [None]:
# Sanity check: (rows, columns) of the recipe table.
df_recipes.shape

(7017, 19)

In [None]:
# Count missing values per column
def check_nans(df_test):
  """Return the number of missing values in every column of ``df_test``.

  :param df_test: frame to inspect (it is not modified)
  :type df_test: pd.DataFrame
  :return: mapping of column name to its NaN/None count, as plain ``int``
  :rtype: dict
  """
  # One vectorised pass instead of a Python-level sum() over every Series.
  missing_per_column = df_test.isna().sum()
  return {col: int(n_missing) for col, n_missing in missing_per_column.items()}

In [None]:
# Count missing values per column of the recipe table.
check_nans(df_recipes)

{'name': 0,
 'raw_text': 0,
 'cultural_restriction': 0,
 'calories': 0,
 'allergens': 0,
 'recipeId': 0,
 'ingredients': 0,
 'instructions': 0,
 'carbohydrates': 0,
 'fat': 0,
 'fiber': 0,
 'protein': 0,
 'taste': 0,
 'cooking_style': 0,
 'meal_type': 0,
 'prep_time': 0,
 'cuisine': 0,
 'price': 0,
 'ingredients_list': 0}

## Load Datasets

In [None]:
# Replace the file ID with your own
# Each line holds the active Drive file ID; the trailing comment keeps an older ID and the share link.
file_id_train = "1NgfU1fHmA85jNOSC8OshNq2SxFJx_j9r" #'1RAN4QQNJm5g1GEsqZW3KbTdeYOxPkyJ2' https://drive.google.com/file/d/1NgfU1fHmA85jNOSC8OshNq2SxFJx_j9r/view?usp=drive_link
file_id_val = "1HelnJeFj0GiAGI3Fi7pFscULDY1SMOtC" #'1kES6RWac8K0HKbCaP_9FFF2Bca7a7pht' https://drive.google.com/file/d/1HelnJeFj0GiAGI3Fi7pFscULDY1SMOtC/view?usp=drive_link
file_id_test = "1GPpJNn4OdNs_Bkg3d1nnUooAoPoCqsdO" #'1LzS9puUVI2xQIIyVI-FK0Rpa8C2MmHdH' https://drive.google.com/file/d/1GPpJNn4OdNs_Bkg3d1nnUooAoPoCqsdO/view?usp=drive_link
# NOTE(review): `link` is a URL template that the download cell below does not use -- confirm it is still needed.
link = 'https://drive.google.com/uc?id={file_id}'

In [None]:
# Download the train / validation / test files from Drive, in that order,
# saving each one under its local CSV name.
for split_file_id, local_csv in (
    (file_id_train, 'train.csv'),
    (file_id_val, 'val.csv'),
    (file_id_test, 'test.csv'),
):
  downloaded = drive.CreateFile({'id': split_file_id})
  downloaded.GetContentFile(local_csv)

In [None]:
def _read_split(csv_name):
  """Read one pipe-separated split (first column as index) and print its shape."""
  split_frame = pd.read_csv(csv_name, sep="|", index_col=0)
  print(split_frame.shape)
  return split_frame


train = _read_split('train.csv')
val = _read_split("val.csv")
test = _read_split('test.csv')

(1024002, 47)
(204800, 47)
(51201, 47)


In [None]:
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# a selected column: that chained in-place form raises a FutureWarning on
# pandas 2.x and no longer updates the DataFrame under Copy-on-Write.
for split_df in (train, val, test):
  split_df['cultural_factor'] = split_df['cultural_factor'].fillna("NotRestriction")

In [None]:
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# a selected column: that chained in-place form raises a FutureWarning on
# pandas 2.x and no longer updates the DataFrame under Copy-on-Write.
for split_df in (train, val, test):
  split_df['allergy'] = split_df['allergy'].fillna("NotAllergy")

In [None]:
# Missing values per column in the training split after the fills above.
check_nans(train)

{'day_number': 0,
 'meal_type_x': 0,
 'userId': 0,
 'foodId': 0,
 'time_of_meal_consumption': 0,
 'place_of_meal_consumption': 0,
 'social_situation_of_meal_consumption': 0,
 'appreciation_feedback': 0,
 'nutrition_goal': 0,
 'clinical_gender': 0,
 'age_range': 0,
 'life_style': 0,
 'weight': 0,
 'height': 0,
 'projected_daily_calories': 0,
 'current_daily_calories': 0,
 'country_of_origin': 0,
 'living_country': 0,
 'current_location': 0,
 'cultural_factor': 0,
 'probabilities': 811974,
 'allergy': 0,
 'Multi-allergy': 921530,
 'current_working_status': 0,
 'marital_status': 0,
 'ethnicity': 0,
 'BMI': 0,
 'next_BMI': 0,
 'name': 0,
 'raw_text': 0,
 'cultural_restriction': 0,
 'calories': 0,
 'allergens': 0,
 'recipeId': 0,
 'ingredients': 0,
 'instructions': 0,
 'carbohydrates': 0,
 'fat': 0,
 'fiber': 0,
 'protein': 0,
 'taste': 0,
 'cooking_style': 0,
 'meal_type_y': 0,
 'prep_time': 0,
 'cuisine': 0,
 'price': 0,
 'ingredients_list': 0}

In [None]:
# Missing values per column in the validation split after the fills above.
check_nans(val)

{'day_number': 0,
 'meal_type_x': 0,
 'userId': 0,
 'foodId': 0,
 'time_of_meal_consumption': 0,
 'place_of_meal_consumption': 0,
 'social_situation_of_meal_consumption': 0,
 'appreciation_feedback': 0,
 'nutrition_goal': 0,
 'clinical_gender': 0,
 'age_range': 0,
 'life_style': 0,
 'weight': 0,
 'height': 0,
 'projected_daily_calories': 0,
 'current_daily_calories': 0,
 'country_of_origin': 0,
 'living_country': 0,
 'current_location': 0,
 'cultural_factor': 0,
 'probabilities': 162083,
 'allergy': 0,
 'Multi-allergy': 184731,
 'current_working_status': 0,
 'marital_status': 0,
 'ethnicity': 0,
 'BMI': 0,
 'next_BMI': 0,
 'name': 0,
 'raw_text': 0,
 'cultural_restriction': 0,
 'calories': 0,
 'allergens': 0,
 'recipeId': 0,
 'ingredients': 0,
 'instructions': 0,
 'carbohydrates': 0,
 'fat': 0,
 'fiber': 0,
 'protein': 0,
 'taste': 0,
 'cooking_style': 0,
 'meal_type_y': 0,
 'prep_time': 0,
 'cuisine': 0,
 'price': 0,
 'ingredients_list': 0}

In [None]:
# Missing values per column in the test split after the fills above.
check_nans(test)

{'day_number': 0,
 'meal_type_x': 0,
 'userId': 0,
 'foodId': 0,
 'time_of_meal_consumption': 0,
 'place_of_meal_consumption': 0,
 'social_situation_of_meal_consumption': 0,
 'appreciation_feedback': 0,
 'nutrition_goal': 0,
 'clinical_gender': 0,
 'age_range': 0,
 'life_style': 0,
 'weight': 0,
 'height': 0,
 'projected_daily_calories': 0,
 'current_daily_calories': 0,
 'country_of_origin': 0,
 'living_country': 0,
 'current_location': 0,
 'cultural_factor': 0,
 'probabilities': 40676,
 'allergy': 0,
 'Multi-allergy': 46185,
 'current_working_status': 0,
 'marital_status': 0,
 'ethnicity': 0,
 'BMI': 0,
 'next_BMI': 0,
 'name': 0,
 'raw_text': 0,
 'cultural_restriction': 0,
 'calories': 0,
 'allergens': 0,
 'recipeId': 0,
 'ingredients': 0,
 'instructions': 0,
 'carbohydrates': 0,
 'fat': 0,
 'fiber': 0,
 'protein': 0,
 'taste': 0,
 'cooking_style': 0,
 'meal_type_y': 0,
 'prep_time': 0,
 'cuisine': 0,
 'price': 0,
 'ingredients_list': 0}

## Load word embeddings

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Dropdown-based chooser for the embedding archive to load
class ChooseFromDict:
  """Dropdown selector over a ``{label: path}`` mapping of ``.npz`` archives.

  ``selected_value`` holds the chosen label and ``value`` the matching path.
  ``loaded_value`` stays ``None`` until the selection changes (or
  :meth:`load_embedding` is called), after which it is a dict of the arrays
  stored in the selected archive.
  """

  def __init__(self, options_dict, description):
    """Build the dropdown and register the change handler.

    :param options_dict: mapping of option label to archive path
    :param description: text shown next to the dropdown
    """
    self.options_dict = options_dict
    # create selector
    self.selector = widgets.Dropdown(
        options=list(self.options_dict),
        description=description,
    )
    # currently selected label and the path it maps to
    self.selected_value = self.selector.value
    self.value = self.options_dict[self.selected_value]
    self.loaded_value = None
    self.out = widgets.Output()
    self.selector.observe(self.on_change, names='value')

  def on_change(self, change):
    """Widget callback: record the new selection and load its archive."""
    with self.out:
      print(f"Selected: {change['new']}")
    new_val = change['new']
    self.selected_value = new_val
    self.value = self.options_dict[new_val]
    with self.out:
      print("loading...")
    self.load_embedding()

  def load_embedding(self):
    """Read every array of the archive at ``self.value`` into ``loaded_value``."""
    # The context manager closes the archive once its arrays are copied into
    # the dict; the bare np.load call left the file handle open.
    with np.load(self.value) as archive:
      self.loaded_value = dict(archive)
    with self.out:
      print(f"Embedding size: {len(self.loaded_value)}")

  def display(self):
    """Render the dropdown inside the output widget and return that widget."""
    self.out.clear_output()
    with self.out:
      display(self.selector)
    return self.out


# Embedding label -> archive path, in the order shown in the dropdown.
selection_dict = {
    'cbow': "/content/drive/MyDrive/Train expectation model /results/cbow_recipe_embedding_v2_17_may.npz",
    'skip': "/content/drive/MyDrive/Train expectation model /results/skip_recipe_embedding_v2_17_may.npz",
    'doc2vec': "/content/drive/MyDrive/Train expectation model /results/doc2vec_embedding_v2_17_may.npz",
    'use': "/content/drive/MyDrive/Train expectation model /results/full_recipes_sentence_embedding_USE_v3.npz",
    'bert': "/content/drive/MyDrive/Train expectation model /results/full_recipe_embedding_BERT_v2_17_may_recipeId.npz",
}
# Keep the two parallel lists available under their original names.
embedding_list = list(selection_dict.keys())
paths = list(selection_dict.values())

chosser = ChooseFromDict(selection_dict, "Choose the embedding:")

# selector_embedding = widgets.Dropdown(
#     options=embedding_list,
#     description="Choose the embedding:"
# )
# selected_key = ""
# dict_embedding_cbow = None

# def on_change(change):
#   key = change['new']
#   path = selection_dict[key]
#   dict_embeddings_cbow = dict(np.load(path))
#   selected_key = key
#   print(f"Embedding size: {len(dict_embeddings_cbow)}")

# selector_embedding.observe(on_change, names='value')
# display(selector_embedding)

In [None]:
# Render the dropdown; changing the selection triggers loading of the chosen archive.
# NOTE: later cells read chosser.selected_value, so results depend on this interactive choice.
chosser.display()

Output()

## Dataset preprocessing and preparation

In [None]:
def generate_supervision(vector_row: np.array):
  """Map a score vector to a binary label from the position of its maximum.

  :param vector_row: vector whose arg-max decides the label
  :return: ``0.0`` when the maximum sits at position 0 or 1, otherwise ``1.0``
  """
  winning_position = np.argmax(vector_row)
  return 0.0 if winning_position < 2 else 1.0

In [None]:
def generate_label(value: float):
  """Threshold a score at 0.5.

  :param value: score to binarise
  :return: ``1.0`` when ``value`` is at least 0.5, otherwise ``0.0``
  """
  return 1.0 if value >= 0.5 else 0.0

In [None]:
# load full models
# One sub-directory per embedding under rules/new_experiments_2.
# NOTE(review): glob does not guarantee an ordering, yet a later cell takes
# dir_list[-1]; `recursive` only affects patterns containing "**", which this one does not.
dir_list = glob(os.path.join(base_path, "rules","new_experiments_2", "*/"), recursive = True)

In [None]:
# Show the discovered rule directories.
dir_list

['/content/drive/MyDrive/Train expectation model /results/rules/new_experiments_2/cbow/',
 '/content/drive/MyDrive/Train expectation model /results/rules/new_experiments_2/skip/',
 '/content/drive/MyDrive/Train expectation model /results/rules/new_experiments_2/doc2vec/',
 '/content/drive/MyDrive/Train expectation model /results/rules/new_experiments_2/use/',
 '/content/drive/MyDrive/Train expectation model /results/rules/new_experiments_2/bert/']

In [None]:
# Last discovered directory (the bert folder in the recorded run).
dir_list[-1]

'/content/drive/MyDrive/Train expectation model /results/rules/new_experiments_2/bert/'

In [None]:
# glob returns files in filesystem order, which is not guaranteed; sort so
# that rule_list[0] (loaded in the next cell) is the same file on every run.
# NOTE(review): the directory is picked positionally (dir_list[-1]) while the
# data cell below uses chosser.selected_value -- confirm both refer to the same embedding.
rule_list = sorted(glob(os.path.join(dir_list[-1], '*.pkl')))

In [None]:
# Deserialize the first rule-set file with dill.
# CAUTION: unpickling runs code stored in the file; only load trusted files.
rule_set = None
with open(rule_list[0], 'rb') as rule_file:
  rule_set = dill.load(rule_file)

In [None]:
# load data
# NOTE: the embedding comes from the dropdown widget, so this cell depends on
# the interactive selection made above.
chosen_embedding = chosser.selected_value
idx = 0
model_type = "Full_model"
path_to_data = os.path.join(base_path, "experimental_data", f"experimental_data_{chosen_embedding}_{idx}_{model_type}.npz")
# The context manager closes the archive even when one of the keys is missing;
# the explicit close() was skipped whenever a lookup raised.
with np.load(path_to_data) as data:
  x_te_xai = data["x_te_xai"]
  feature_name = data["feat_names"]
  data_indices = data["test_index"]
  y_te_rules = data["y_te_rules"]

In [None]:
# Column groups used throughout the analysis below.
# Continuous columns.
numeric_features = [
    'calories',
    'carbohydrates',
    'current_daily_calories',
    'fat',
    'fiber',
    'height',
    'price',
    'projected_daily_calories',
    'protein',
    'weight',
    'day_number',
    'time_of_meal_consumption',
]
# Categorical columns.
categorical_features = [
    'BMI',
    'age_range',
    'allergens',
    'allergy',
    'clinical_gender',
    'cultural_factor',
    'cultural_restriction',
    'current_working_status',
    'ethnicity',
    'life_style',
    'marital_status',
    'next_BMI',
    'nutrition_goal',
    'meal_type_y',
    'taste',
    'place_of_meal_consumption',
    'social_situation_of_meal_consumption',
]
# Single column carrying the embedding cluster.
embedding_features = ['cluster']

In [None]:
# Rows of the test split whose index labels are listed in data_indices.
selected_df = test.loc[data_indices]

In [None]:
# Sanity check: (rows, columns) of the selected subset.
selected_df.shape

(5108, 47)

In [None]:
# Attach the rule-set predictions as a new column.
# NOTE(review): assumes y_te_rules is in the same row order as data_indices -- confirm.
selected_df["y_pred"] = y_te_rules

In [None]:
# Boolean masks over the rule predictions: 0 is reported as "dislike", 1 as "like".
dislike_mask = selected_df["y_pred"] == 0
like_mask = selected_df["y_pred"] == 1
print(f"dislike: {int(dislike_mask.sum())}")
print(f"like: {int(like_mask.sum())}")

dislike: 1529
like: 3579


In [None]:
# Distinct user IDs present in the selected subset.
users = selected_df["userId"].unique()

In [None]:
# Take the first listed user as the example user.
test_user = users[0]

In [None]:
# All selected rows belonging to the example user.
df_user = selected_df.loc[selected_df["userId"]==test_user, :]

In [None]:
# Positions within y_te_rules of the samples predicted 1 ("like") and 0 ("dislike").
like_np_index = np.nonzero(y_te_rules == 1)[0]
dislike_np_index = np.nonzero(y_te_rules == 0)[0]

In [None]:
# Encoded feature rows for every "dislike" sample ...
test_xai_dislike = x_te_xai[dislike_np_index, :]
# ... but only for the first three "like" samples.
test_xai_like = x_te_xai[like_np_index[:3], :]

In [None]:
# Run the rule set on both subsets, asking it to also return the decision path;
# the cells below read element [1] of each result as the per-sample rules.
rule_path_dislike = rule_set.predict_numpy_rules(test_xai_dislike, return_decision_path=True)
rule_path_like = rule_set.predict_numpy_rules(test_xai_like, return_decision_path=True)

In [None]:
# Map each of the first three "like" positions to its entry of rule_path_like[1]
# (the [:3] must stay in step with the slice used to build test_xai_like).
like_dict_rules = dict(zip(like_np_index[:3], rule_path_like[1]))

In [None]:
# Map every "dislike" position to its entry of rule_path_dislike[1].
dislike_dict_rules = dict(zip(dislike_np_index, rule_path_dislike[1]))

In [None]:
# Show the rules recorded for the three "like" samples.
like_dict_rules

{2: [IF ((onehot__cultural_restriction_vegan > 0.5) AND (onehot__meal_type_y_NotInformation <= 0.5) AND (onehot__taste_sour <= 0.5) AND (scaler__fiber <= 2.193) AND (scaler__calories <= 0.647) AND (onehot__allergens_gluten <= 0.5) AND (onehot__taste_umami <= 0.5) AND (scaler__carbohydrates > -0.773) AND (onehot__allergens_wheat <= 0.5) AND (scaler__fat > -0.791) AND (scaler__protein <= 1.324) AND (scaler__calories > -0.018) AND (scaler__fat > -0.464) AND (scaler__fiber <= 1.225) AND (onehot__allergens_legumes <= 0.5) AND (onehot__meal_type_y_breakfast <= 0.5) AND (onehot__allergens_sesame <= 0.5) AND (scaler__fat <= 3.082) AND (cluster > 1.5) AND (scaler__carbohydrates <= 2.143) AND (onehot__allergens_peanuts <= 0.5) AND (scaler__fat <= 0.284) AND (scaler__protein > -0.615) AND (scaler__fiber > -0.919) AND (scaler__carbohydrates > -0.747) AND (scaler__fiber <= 0.346) AND (scaler__fat <= 0.13) AND (scaler__calories <= 0.382) AND (onehot__meal_type_y_vegan <= 0.5)) THEN 1],
 3: [IF ((one

In [None]:
# Raw (un-encoded) row for position 4 of data_indices.
# NOTE(review): presumably one of the "like" samples inspected above -- confirm 4 is in like_np_index[:3].
selected_df.loc[data_indices[4], :]

day_number                                                                            384
meal_type_x                                                                        dinner
userId                                                                          casey9b1c
foodId                                                                            food_24
time_of_meal_consumption                                                         17.78223
place_of_meal_consumption                                                            home
social_situation_of_meal_consumption                                                alone
appreciation_feedback                                                            0.650034
nutrition_goal                                                               maintain_fit
clinical_gender                                                                         F
age_range                                                                           50-59
life_style

In [None]:
# Show the rules for every "dislike" sample (this prints the whole dict, which is long).
dislike_dict_rules

{0: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 1: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 18: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 19: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 21: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 25: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 34: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 36: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 38: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 39: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 43: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 48: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 53: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 55: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 57: [IF ((onehot__cultural_restriction_vegan <= 0.5)) THEN 0],
 58: [IF ((onehot__cultural_restriction_ve

In [None]:
# Rules recorded for position 464, one of the samples predicted 0 ("dislike").
dislike_dict_rules[464]

[IF ((onehot__cultural_restriction_vegan > 0.5) AND (onehot__meal_type_y_NotInformation <= 0.5) AND (onehot__taste_sour <= 0.5) AND (scaler__fiber <= 2.193) AND (scaler__calories <= 0.647) AND (onehot__allergens_gluten <= 0.5) AND (onehot__taste_umami > 0.5)) THEN 0]

In [None]:
# Raw (un-encoded) row for position 464 of data_indices.
# NOTE(review): assumes data_indices and x_te_xai share the same row order -- confirm.
selected_df.loc[data_indices[464], :]

day_number                                                                            134
meal_type_x                                                                morning snacks
userId                                                                            mr.a40e
foodId                                                                          food_2144
time_of_meal_consumption                                                          8.63876
place_of_meal_consumption                                                            home
social_situation_of_meal_consumption                                              friends
appreciation_feedback                                                            0.492075
nutrition_goal                                                               maintain_fit
clinical_gender                                                                         M
age_range                                                                           40-49
life_style

In [None]:
from typing import List

In [None]:
from typing import List, Dict
import pandas as pd
import numpy as np
import bnlearn as bn

def create_hierarchical_edges(parents_list: List[str], child_node: str):
  """Build one ``(parent, child_node)`` edge for every entry of ``parents_list``.

  :param parents_list: names of the parent nodes
  :param child_node: node every parent points to
  :return: list of ``(parent, child_node)`` tuples, in input order
  """
  return [(parent_name, child_node) for parent_name in parents_list]


def create_bayesian_model_from_data(data_set: pd.DataFrame, wlist=None, edges=None):
    """Learn a network structure from ``data_set`` and then fit its parameters.

    :param data_set: frame handed to both bnlearn structure and parameter learning
    :param wlist: forwarded as ``white_list`` (with ``bw_list_method='nodes'``)
    :param edges: optional extra edges added to the learned ``model['model']``
        before parameter learning
    :return: whatever ``bn.parameter_learning.fit`` returns

    NOTE(review): the recorded log shows bnlearn "Converting adjmat to
    BayesianNetwork" during parameter learning; if the network is rebuilt from
    the adjacency matrix, edges added here may be dropped -- verify before
    relying on ``edges``.
    """
    learned_structure = bn.structure_learning.fit(
        data_set,
        white_list=wlist,
        bw_list_method='nodes',
    )
    extra_edges_given = edges is not None
    if extra_edges_given:
        learned_structure['model'].add_edges_from(edges)
    return bn.parameter_learning.fit(learned_structure, data_set)

In [None]:
def discretize_data(data: pd.DataFrame,
                    continuos_variables: List[str],
                    nbins: int = 10,
                    invert_order=True) -> Tuple[pd.DataFrame, Dict, Dict]:
  """Replace continuous columns by integer bin codes on a copy of ``data``.

  For every listed column, ``nbins`` equally spaced edges are laid between the
  column minimum and maximum and each value is coded with
  ``np.digitize(..., right=True)``.

  :param data: input frame (left unmodified)
  :param continuos_variables: names of the columns to discretize
  :param nbins: number of bin edges per column
  :param invert_order: accepted for compatibility; not used by the body
  :return: ``(discretized copy, {column: bin edges}, {column: sorted distinct codes})``
  """
  binned_frame = data.copy()
  edges_by_column = {}
  codes_by_column = {}
  for column in continuos_variables:
    raw_values = data[column].to_numpy()
    # Equally spaced edges spanning the observed range of the column.
    edges = np.linspace(np.amin(raw_values), np.amax(raw_values), num=nbins)
    binned_frame[column] = np.digitize(raw_values, edges, right=True)
    edges_by_column[column] = edges
    # Distinct codes that actually occur in the column.
    codes_by_column[column] = sorted(np.unique(binned_frame[column]))
  return binned_frame, edges_by_column, codes_by_column

In [None]:
# Discretize the numeric columns of the selected rows with the default number of bins;
# discretize_dict holds the bin edges and possible_vals the codes seen per column.
discretized_df, discretize_dict, possible_vals = discretize_data(data=selected_df,
                    continuos_variables=numeric_features)

In [None]:
# Take the last column of the encoded matrix as the 'cluster' feature.
# NOTE(review): assumes the cluster id is the last column of x_te_xai and that
# its rows are in the same order as selected_df -- confirm against feature_name.
discretized_df['cluster'] = x_te_xai[:, -1]

In [None]:
# Keep only the modelling columns: categorical, discretized numeric, cluster and the prediction.
factor_df = discretized_df.loc[:, categorical_features+numeric_features+['cluster', 'y_pred']]

In [None]:
# Columns available to the Bayesian network.
factor_df.columns

Index(['BMI', 'age_range', 'allergens', 'allergy', 'clinical_gender',
       'cultural_factor', 'cultural_restriction', 'current_working_status',
       'ethnicity', 'life_style', 'marital_status', 'next_BMI',
       'nutrition_goal', 'meal_type_y', 'taste', 'place_of_meal_consumption',
       'social_situation_of_meal_consumption', 'calories', 'carbohydrates',
       'current_daily_calories', 'fat', 'fiber', 'height', 'price',
       'projected_daily_calories', 'protein', 'weight', 'day_number',
       'time_of_meal_consumption', 'cluster', 'y_pred'],
      dtype='object')

In [None]:
def generate_edges(causal_factors, effect):
  """Return one ``(cause, effect)`` edge per entry of ``causal_factors``.

  :param causal_factors: iterable of cause node names
  :param effect: node every cause points to
  :return: list of ``(cause, effect)`` tuples, in input order
  """
  return [(factor, effect) for factor in causal_factors]

def from_effect_to_cause(causal_factors, effect):
  """Return one reversed ``(effect, cause)`` edge per entry of ``causal_factors``.

  :param causal_factors: iterable of cause node names
  :param effect: node placed first in every edge
  :return: list of ``(effect, cause)`` tuples, in input order
  """
  return [(effect, factor) for factor in causal_factors]


In [None]:
# list of edges
# Candidate cause columns; compared with factor_df this list leaves out
# current_working_status, ethnicity and marital_status.
causes = ['BMI', 'age_range', 'allergens', 'allergy', 'clinical_gender',
       'cultural_factor', 'cultural_restriction','life_style', 'next_BMI',
       'nutrition_goal', 'meal_type_y', 'taste', 'time_of_meal_consumption',
       'place_of_meal_consumption', 'social_situation_of_meal_consumption',
       'calories', 'carbohydrates', 'current_daily_calories', 'fat', 'fiber',
       'height', 'price', 'projected_daily_calories', 'protein', 'weight',
       'day_number', 'cluster',]
# Target node of every edge.
effect='y_pred'
# One (cause, 'y_pred') edge per cause.
edges_list = generate_edges(causes, effect)

In [None]:
# Reversed edges ('y_pred' -> context variable) for the three meal-context columns.
context_edges = from_effect_to_cause(['time_of_meal_consumption',
       'place_of_meal_consumption', 'social_situation_of_meal_consumption'], effect)

In [None]:
# load model

In [None]:
# Structure learning with the context edges passed as an edge white list.
# NOTE(review): the next cell rebinds bn_control_model via bn.make_DAG, so this result is discarded -- confirm it is still needed.
bn_control_model = bn.structure_learning.fit(factor_df, white_list=context_edges, bw_list_method='edges')

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Filter edges based on black_list/white_list
[bnlearn] >Compute structure scores for model comparison (higher is better).


In [None]:
# Build the control DAG directly from the fixed context edges (replaces the learned structure above).
bn_control_model = bn.make_DAG(context_edges)

[bnlearn] >bayes DAG created.


In [None]:
# Learn the CPDs of the control DAG from a copy of factor_df
# (per the recorded log, bnlearn drops the columns that are not in the DAG).
bn_control_model = bn.parameter_learning.fit(bn_control_model, factor_df.copy())

[bnlearn] >Removing columns from dataframe to make consistent with DAG [['BMI' 'age_range' 'allergens' 'allergy' 'clinical_gender'
 'cultural_factor' 'cultural_restriction' 'current_working_status'
 'ethnicity' 'life_style' 'marital_status' 'next_BMI' 'nutrition_goal'
 'meal_type_y' 'taste' 'calories' 'carbohydrates' 'current_daily_calories'
 'fat' 'fiber' 'height' 'price' 'projected_daily_calories' 'protein'
 'weight' 'day_number' 'cluster']]
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >CPD of y_pred:
+-----------+----------+
| y_pred(0) | 0.332187 |
+-----------+----------+
| y_pred(1) | 0.667813 |
+-----------+----------+
[bnlearn] >CPD of time_of_meal_consumption:
+-----------------------------+----------------------+---------------------+
| y_pred                      | y_pred(0)            | y_pred(1)           |
+-----------------------------+----------------------+---------------------+
| time_of_meal_consumption(0) | 0.02464268112370626  | 0.012

In [None]:
# Full model: structure and parameters both learned from factor_df (no white list, no extra edges).
bn_model = create_bayesian_model_from_data(factor_df)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.
[bnlearn] >CPD of BMI:
+------------------+--------------------+-----+--------------------+---------------------+
| height           | height(0)          | ... | height(9)          | height(9)           |
+------------------+--------------------+-----+--------------------+---------------------+
| weight           | weight(0)          | ... | weight(8)          | weight(9)           |
+------------------+--------------------+-----+--------------------+---------------------+
| BMI(healthy)     | 0.4094488188976378 | ... | 0.1453488372093023 | 0.09225092250922508 |
+------------------+--------------------+-----+--------------------+--------------

In [None]:
# Evidence dictionaries built from the two example rows inspected above.
# NOTE: each dict holds every factor_df column, including 'y_pred' itself.
evidence_like = factor_df.loc[data_indices[4], :].to_dict()
evidence_dislike = factor_df.loc[data_indices[464], :].to_dict()

In [None]:
def print_CPDS(causes, evidence_dict, model):
  """Print, for every evidence variable, its distribution given all the others.

  Causes that are missing from ``model['model']`` are first removed from
  ``evidence_dict``. This pruning is done in place, so the caller's dict is
  modified (kept from the original behaviour). Each remaining key is then
  queried with ``bn.inference.fit`` using the other keys as evidence.

  :param causes: names checked against the model before querying
  :param evidence_dict: mapping of variable name to observed value (pruned in place)
  :param model: bnlearn model dict; ``model['model']`` must support ``in``
  """
  nodes_to_exclude = []
  for cause in causes:
    if cause not in model['model'] and cause in evidence_dict:
      del evidence_dict[cause]
      # Record only the causes that were actually dropped; the append used to
      # sit outside the `if`, so every cause was reported as excluded.
      nodes_to_exclude.append(cause)
  print(f"Excluded nodes: {nodes_to_exclude}")
  for cause in list(evidence_dict):
    print(f"Cause: {cause}:")
    # Query `cause` given every other piece of evidence.
    tem_evident_dict = {name: value
                        for name, value in evidence_dict.items()
                        if name != cause}
    cpd = bn.inference.fit(model, variables=[cause], evidence=tem_evident_dict)
    print(cpd)
    print("------------------------------------------")

In [None]:
# Per-variable conditional distributions for the "like" example.
# NOTE: print_CPDS deletes from evidence_like any cause that is missing from the model.
print_CPDS(causes, evidence_like, bn_model)

Excluded nodes: ['BMI', 'age_range', 'allergens', 'allergy', 'clinical_gender', 'cultural_factor', 'cultural_restriction', 'life_style', 'next_BMI', 'nutrition_goal', 'meal_type_y', 'taste', 'time_of_meal_consumption', 'place_of_meal_consumption', 'social_situation_of_meal_consumption', 'calories', 'carbohydrates', 'current_daily_calories', 'fat', 'fiber', 'height', 'price', 'projected_daily_calories', 'protein', 'weight', 'day_number', 'cluster']
Cause: BMI:
[bnlearn] >Variable Elimination.
[bnlearn] >Data is stored in [query.df]
+----+-------------+-------------+
|    | BMI         |           p |
|  0 | healthy     | 0.999268    |
+----+-------------+-------------+
|  1 | obesity     | 0.000479291 |
+----+-------------+-------------+
|  2 | overweight  | 0.000142167 |
+----+-------------+-------------+
|  3 | underweight | 0.000110284 |
+----+-------------+-------------+
+------------------+------------+
| BMI              |   phi(BMI) |
| BMI(healthy)     |     0.9993 |
+----------

In [None]:
#predict
bn.inference.fit(bn_model, variables=['y_pred'], evidence=evidence_like)

[bnlearn] >Variable Elimination.
[bnlearn] >Data is stored in [query.df]
+----+----------+------------+
|    |   y_pred |          p |
|  0 |        0 | 0.00194786 |
+----+----------+------------+
|  1 |        1 | 0.998052   |
+----+----------+------------+


<DiscreteFactor representing phi(y_pred:2) at 0x7e8a227ba380>

In [None]:
#predict
ev = evidence_dislike.copy()
del ev['y_pred']
bn.inference.fit(bn_model, variables=['y_pred'], evidence=ev)

[bnlearn] >Variable Elimination.
[bnlearn] >Data is stored in [query.df]
+----+----------+-----------+
|    |   y_pred |         p |
|  0 |        0 | 0.0386463 |
+----+----------+-----------+
|  1 |        1 | 0.961354  |
+----+----------+-----------+


<DiscreteFactor representing phi(y_pred:2) at 0x7ce808aead70>

In [None]:
def inverse_standarization(value, mean, std):
  """Map a standardized value back to the original scale (value * std + mean)."""
  rescaled = value * std
  return rescaled + mean

In [None]:
# Mean and standard deviation of each numeric training column, keyed by
# feature name; these are the inputs inverse_standarization expects.
denorm_values = {
  num: {'mean': np.mean(train[num]), 'std': np.std(train[num])}
  for num in numeric_features
}

In [None]:
denorm_values

{'calories': {'mean': 571.4349842480779, 'std': 714.3788483376305},
 'carbohydrates': {'mean': 107.38713835129225, 'std': 129.98057203455144},
 'current_daily_calories': {'mean': 2263.42573452005,
  'std': 573.5062952549755},
 'fat': {'mean': 41.901670375741425, 'std': 50.501299734256946},
 'fiber': {'mean': 10.857418776525828, 'std': 10.766843412056323},
 'height': {'mean': 165.30459510821268, 'std': 11.611385395816972},
 'price': {'mean': 1.9086393385950418, 'std': 0.2974992833361969},
 'projected_daily_calories': {'mean': 2014.0391952359469,
  'std': 536.7625561351381},
 'protein': {'mean': 24.83870909724786, 'std': 40.13231411680017},
 'weight': {'mean': 70.81114269308068, 'std': 16.605729347679265},
 'day_number': {'mean': 364.46053718645084, 'std': 210.77493117767492},
 'time_of_meal_consumption': {'mean': 12.9913759746112,
  'std': 4.549567423495502}}

In [None]:
inverse_standarization(2.193, denorm_values['fiber']['mean'], denorm_values['fiber']['std'])

34.469106379165346

In [None]:
feat='calories'
inverse_standarization(0.647, denorm_values[feat]['mean'], denorm_values[feat]['std'])

1033.6380991225249

In [None]:
# save model
path_model = os.path.join(base_path, "bayesian_model", f"bn_model_new_experiment_model_bert.pkl")
bn.save(bn_model, path_model, overwrite=True)

[pypickle] Pickle file saved: [/content/drive/MyDrive/Train expectation model /results/bayesian_model/bn_model_new_experiment_model_bert.pkl]


True

In [None]:
factor_df.loc[data_indices[464], :]

BMI                                                  healthy
age_range                                              40-49
allergens                                       NotAllergens
allergy                                             Multiple
clinical_gender                                            M
cultural_factor                         vegetarian_observant
cultural_restriction                                   vegan
current_working_status                      Half-time-worker
ethnicity                                             Latino
life_style                                       Very active
marital_status                                       Married
next_BMI                                             healthy
nutrition_goal                                  maintain_fit
meal_type_y                                           veggie
taste                                                  umami
place_of_meal_consumption                               home
social_situation_of_meal

In [None]:
factor_df.loc[data_indices[4], :]

BMI                                              healthy
age_range                                          50-59
allergens                                      tree nuts
allergy                                            wheat
clinical_gender                                        F
cultural_factor                          flexi_observant
cultural_restriction                               vegan
current_working_status                  Full-time-worker
ethnicity                                          White
life_style                                   Very active
marital_status                                   Married
next_BMI                                      overweight
nutrition_goal                              maintain_fit
meal_type_y                                   vegetarian
taste                                              sweet
place_of_meal_consumption                           home
social_situation_of_meal_consumption               alone
calories                       

In [None]:
base_path

'/content/drive/MyDrive/Train expectation model /results'

In [None]:
# NOTE(review): `bn_rule_model` is not assigned in any visible cell and the
# recorded output of this cell is a NameError. Presumably a fitted network
# object was intended -- confirm and define it before this cell runs.
print(bn_rule_model.get_cpds())

NameError: name 'bn_rule_model' is not defined

In [None]:
# NOTE(review): `bn_rule_model` is not assigned in any visible cell, so this
# fails on a fresh kernel. Writes 'bayesian_explanation.bif' relative to the
# current working directory -- confirm the intended model object and target path.
bn_rule_model.save('bayesian_explanation.bif', filetype='bif')

In [None]:
def process_names(name:str, numeric_features, categorical_features):
  """Map a transformed column name back to the original feature it refers to.

  Numeric features are checked before categorical ones. Within a group the
  LONGEST feature name contained in ``name`` wins, so a feature whose name is
  a substring of another one (e.g. 'calories' vs 'current_daily_calories',
  'BMI' vs 'next_BMI') is no longer picked by mistake. Ties keep the order of
  the input list.

  Args:
    name: transformed column name, e.g. 'scaler__calories' or
      'onehot__taste_sweet'.
    numeric_features: iterable of numeric feature names.
    categorical_features: iterable of categorical feature names.

  Returns:
    The matching original feature name, or ``name`` with the 'scaler__' and
    'onehot__' prefixes removed when no known feature is contained in it.
  """
  for candidates in (numeric_features, categorical_features):
    matches = [feat for feat in candidates if feat in name]
    if matches:
      # max() returns the first of equally long names, preserving list order.
      return max(matches, key=len)
  return name.replace('scaler__', '').replace('onehot__', '')

## Preprocess data

In [None]:
import string
import re
def replace_bad_characters(text: str):
  """Normalize a label: lowercase it, drop dots and turn separators into '_'.

  A run of consecutive spaces becomes a single underscore; each ';', ',',
  '/' and '-' becomes its own underscore.
  """
  cleaned = re.sub(' +', '_', text.lower().replace('.', ''))
  for separator in (';', ',', ' ', '/', '-'):
    cleaned = cleaned.replace(separator, '_')
  return cleaned

In [None]:
preprocessor = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(), categorical_features),
        ('scaler', StandardScaler(), numeric_features)
    ],
    remainder='drop'
)

In [None]:
# Draw a random sample of 10000 training rows; random_state is fixed so the
# same rows are selected on every run.
train_sample = train.sample(10000, random_state=41)

In [None]:
# load embeddings
# NOTE(review): `chosser` and `ClusterAnalysis` are defined outside this
# section; `selected_value` presumably holds the name of the embedding that
# was picked interactively -- confirm, since the result then depends on it.
chosen_embedding = chosser.selected_value
model_type = "Baseline"
# base_path is assigned here although earlier cells of this notebook already
# read it, so those cells only work after this one has been executed.
base_path = '/content/drive/MyDrive/Train expectation model /results'
ca = ClusterAnalysis()
# File name encodes both the chosen embedding and the model type.
cluster_path = os.path.join(base_path,"cluster_analysis", f"new_experiment_{chosen_embedding}_cluster_{model_type}.pkl")
ca.load_cluster_model(cluster_path)

In [None]:
emb = chosser.loaded_value

In [None]:
# Attach the embedding of each recipe; ids missing from `emb` get NaN so the
# rows can be filtered out below. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
train_sample['embedding'] = train_sample['recipeId'].apply(lambda x: emb.get(x, np.nan))
mask = train_sample['embedding'].isna()
print(f"number empty: {mask.sum()}")
# Take an explicit copy: later cells assign columns on filtered_sample, which
# must not write into (or warn about) a slice of train_sample.
filtered_sample = train_sample.loc[~mask, :].copy()

number empty: 32


In [None]:
X = np.vstack(filtered_sample['embedding'].tolist())

In [None]:
X.shape

(9968, 768)

In [None]:
filtered_sample['meal_type_y'] = filtered_sample['meal_type_y'].apply(lambda x: replace_bad_characters(x))

In [None]:
def replace_not_information(x: str):
  """Restore the canonical 'NotInformation' spelling; return any other value unchanged."""
  return "NotInformation" if x == "notinformation" else x

In [None]:
filtered_sample['meal_type_y'] = filtered_sample['meal_type_y'].apply(lambda x: replace_not_information(x))

In [None]:
cluster_predictions = ca.predict(X)

In [None]:
np.unique(cluster_predictions, return_counts=True)

(array([0, 1, 3], dtype=int32), array([   8, 9607,  353]))

In [None]:
in_features = preprocessor.fit_transform(filtered_sample)

In [None]:
dense_feat = in_features.toarray()

In [None]:
cluster_predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int32)

In [None]:
final_feature = np.column_stack((dense_feat, cluster_predictions))

In [None]:
final_feature.shape

(9968, 221)

In [None]:
pre_features_names = preprocessor.get_feature_names_out()

In [None]:
rules = rule_set.get_rules()

In [None]:
features_names = [r.get_feature_name() for r in rules]

In [None]:
features_names = {}
for r in rules:
  features_names.update(dict(zip(r.get_feature_name(), r.get_feature_idx())))

In [None]:
set(features_names)-set(pre_features_names)

{'cluster'}

In [None]:
# Build the name -> column position lookup once instead of converting the
# array to a list and scanning it for every key.
name_positions = {}
for position, feat_name in enumerate(pre_features_names.tolist()):
  # setdefault keeps the first occurrence, matching list.index semantics.
  name_positions.setdefault(feat_name, position)

feature_name_to_idx = {}
for key in features_names:
  if key == 'cluster':
    # 'cluster' is not produced by the preprocessor; it gets the position
    # right after the preprocessor's columns.
    feature_name_to_idx[key] = len(pre_features_names)
  else:
    # An unknown name raises KeyError here (previously ValueError from list.index).
    feature_name_to_idx[key] = name_positions[key]

In [None]:
def update_rules_index(rule_list, dict_feature_idx):
  """Re-point every clause of every rule to the column index of its feature.

  Args:
    rule_list: iterable of rules exposing ``rule.premise.clauses``; each clause
      must provide ``get_feature_name()`` and a writable ``feature_idx``.
    dict_feature_idx: mapping of feature name -> new column index.

  Returns:
    A list with the same rule objects, whose clauses were updated in place.

  Raises:
    KeyError: if a clause refers to a feature that has no entry in
      ``dict_feature_idx``. Previously the index was silently set to None.
  """
  new_rule_list = []
  for rule in rule_list:
    for expr in rule.premise.clauses:
      feature_name = expr.get_feature_name()
      if feature_name not in dict_feature_idx:
        raise KeyError(f"No column index available for feature {feature_name!r}")
      expr.feature_idx = dict_feature_idx[feature_name]
    new_rule_list.append(rule)
  return new_rule_list


In [None]:
# Display the feature index of the first clause of the first rule.
rule_set.rules[0].premise.clauses[0].feature_idx

127