Loading and Cleaning Data

In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

file_path = "/content/drive/My Drive/cleaned_data_combined_modified.csv"
df = pd.read_csv(file_path)

df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q3: In what setting would you expect this food to be served? Please check all that apply,Q4: How much would you expect to pay for one serving of this food item?,Q5: What movie do you think of when thinking of this food item?,Q6: What drink would you pair with this food item?,"Q7: When you think about this food item, who does it remind you of?",Q8: How much hot sauce would you add to this food item?,Label
0,716549,3,6,"Week day lunch,At a party,Late night snack",5,Cloudy with a Chance of Meatballs,Coke,Friends,A little (mild),Pizza
1,715742,4,"bread, meet","Week day lunch,At a party,Late night snack",5$ for a large piece,All sort of american young boy movies,Coke,"Friends,Teachers,Strangers",,Pizza
2,727333,3,5,"Week day lunch,Week day dinner,Weekend lunch,W...",10dollar,action movie,cola,Friends,A moderate amount (medium),Pizza
3,606874,4,7-Jun,"Week day lunch,Week day dinner,Weekend lunch,W...",$3,Mamma Mia,Soda,"Siblings,Friends,Teachers",I will have some of this food item with my hot...,Pizza
4,505318,2,3 or more,"Week day lunch,Week day dinner,Weekend lunch,W...",$5,Cloudy with a chance of meatballs,Soda,"Siblings,Friends",A little (mild),Pizza


In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("/content/drive/My Drive/cleaned_data_combined_modified.csv")

def extract_numeric(value):
    if pd.isnull(value):
        return None
    value = str(value).strip().lower()
    value = re.sub(r'[^\d\.\-]', ' ', value)
    value = re.sub(r'\s+', ' ', value).strip()

    if '-' in value:
        numbers = [float(num) for num in value.split('-') if num.strip().isdigit()]
        if numbers:
            return sum(numbers) / len(numbers)

    match = re.search(r'\d+(\.\d+)?', value)
    return float(match.group()) if match else None

numerical_columns = [
    "Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",
    "Q2: How many ingredients would you expect this food item to contain?",
    "Q4: How much would you expect to pay for one serving of this food item?"
]

for col in numerical_columns:
    df[col] = df[col].apply(extract_numeric)

df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

text_cols = ["Q3: In what setting would you expect this food to be served? Please check all that apply",
             "Q5: What movie do you think of when thinking of this food item?",
             "Q6: What drink would you pair with this food item?",
             "Q7: When you think about this food item, who does it remind you of?"]

df[text_cols] = df[text_cols].fillna("none").astype(str).apply(lambda x: x.str.lower().str.strip())

vectorizer_q3 = CountVectorizer(binary=True)
vectorizer_q5 = CountVectorizer(binary=True)
vectorizer_q6 = CountVectorizer(binary=True)
vectorizer_q7 = CountVectorizer(binary=True)

Q3_bow = vectorizer_q3.fit_transform(df["Q3: In what setting would you expect this food to be served? Please check all that apply"])
Q5_bow = vectorizer_q5.fit_transform(df["Q5: What movie do you think of when thinking of this food item?"])
Q6_bow = vectorizer_q6.fit_transform(df["Q6: What drink would you pair with this food item?"])
Q7_bow = vectorizer_q7.fit_transform(df["Q7: When you think about this food item, who does it remind you of?"])

df_q3_bow = pd.DataFrame(Q3_bow.toarray(), columns=[f"Q3_{word}" for word in vectorizer_q3.get_feature_names_out()])
df_q5_bow = pd.DataFrame(Q5_bow.toarray(), columns=[f"Q5_{word}" for word in vectorizer_q5.get_feature_names_out()])
df_q6_bow = pd.DataFrame(Q6_bow.toarray(), columns=[f"Q6_{word}" for word in vectorizer_q6.get_feature_names_out()])
df_q7_bow = pd.DataFrame(Q7_bow.toarray(), columns=[f"Q7_{word}" for word in vectorizer_q7.get_feature_names_out()])

df = pd.concat([df, df_q3_bow, df_q5_bow, df_q6_bow, df_q7_bow], axis=1)

df.drop(columns=text_cols, inplace=True)

hot_sauce_map = {
    "A little (mild)": "Mild",
    "A moderate amount (medium)": "Medium",
    "A lot (hot)": "Hot",
    "I will have some of this food item with my hot sauce": "Medium"
}

df["Q8_cleaned"] = df["Q8: How much hot sauce would you add to this food item?"].map(hot_sauce_map)
df["Q8_cleaned"].fillna("None", inplace=True)
df = pd.get_dummies(df, columns=["Q8_cleaned"])
df.drop(columns=["Q8: How much hot sauce would you add to this food item?"], inplace=True)

df.to_csv("/content/drive/My Drive/cleaned_data_bow.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Q8_cleaned"].fillna("None", inplace=True)


# Feature Selection Functions

First, we implement a chi2 function to determine best features.

This includes calculation of p values.

In [None]:
import numpy as np
import random

def gamma(x):
    """Gamma function using Stirling's approximation for large x."""
    if x < 0:
        raise ValueError("x must be positive")
    elif x == 1 or x == 2:
        return 1
    else:
        # Stirling's approximation for large x
        return np.sqrt(2 * np.pi * x) * (x / np.e) ** x

def chi_squared_cdf(x, df, num_steps=1000):
    """
    Approximate the CDF of the chi-squared distribution using numerical integration.
    This is based on the Gamma distribution CDF approximation.

    x: chi-squared statistic
    df: degrees of freedom
    num_steps: number of steps in the numerical integration
    """
    # Gamma function approximation using the incomplete gamma function (via numerical integration)
    step_size = x / num_steps
    integral = 0.0
    for i in range(num_steps):
        t = i * step_size
        # Gamma probability density function (PDF) for chi-squared distribution
        gamma_pdf = (t ** (df / 2 - 1) * np.exp(-t / 2)) / (2 ** (df / 2) * gamma(df / 2))
        integral += gamma_pdf * step_size
    return integral

def chi_squared_p_value(chi2_stats, df):
    """
    Calculate the p-value for a vector of chi-squared statistics based on the chi-squared CDF.
    chi2_stats: a vector of chi-squared statistics
    df: degrees of freedom
    """
    p_values = []
    for chi2_stat in chi2_stats:
        cdf_value = chi_squared_cdf(chi2_stat, df)
        p_value = 1 - cdf_value
        p_values.append(p_value)
    return np.array(p_values)

def chi2_statistic(X, y):
    """
    Compute the chi-squared statistic for each feature in X with respect to the target vector y.

    Parameters:
    X: numpy array of shape (n_samples, n_features), feature matrix
    y: numpy array of shape (n_samples,), target vector (class labels)

    Returns:
    chi2_stats: numpy array of shape (n_features,), chi-squared statistics for each feature
    """
    X = np.array(X)
    y = np.array(y)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))  # Number of unique classes in the target vector

    chi2_stats = np.zeros(n_features)  # Array to store chi-squared statistics for each feature

    # Loop over each feature to calculate its chi-squared statistic
    for i in range(n_features):
        observed = np.zeros((n_classes, 2))  # Observed counts for each class (rows) and feature (columns)
        for j in range(n_samples):
            match y[j]:
              case "Pizza":
                  y_index = 0
              case "Sushi":
                  y_index = 1
              case "Shawarma":
                  y_index = 2
            observed[y_index, int(X[j, i])] += 1  # Increment the count for the corresponding class and feature value

        # Calculate row sums and column sums
        row_sums = observed.sum(axis=1)  # Sum of each class (target)
        col_sums = observed.sum(axis=0)  # Sum of each feature value
        total_sum = observed.sum()  # Total sum of all counts

        # Calculate expected frequencies
        expected = []
        for k in range(3):
          expected.append([0.00000001 if x == 0 else x for x in (np.outer(row_sums, col_sums) / total_sum)[k]])
        expected = np.array(expected)
        # Calculate chi-squared statistic for the feature
        chi2_stat = np.sum((observed - expected) ** 2 / expected)
        chi2_stats[i] = chi2_stat

    return np.array(chi2_stats), chi_squared_p_value(chi2_stats, len(observed) - 1)

Select the K best features; ie the features with the highest chi2 values.

In [None]:
def select_k_best(X, y, k, classification="chi2"):
  chi2_vals, p_vals = chi2_statistic(X, y)
  if classification == "chi2":
    k_best = np.argpartition(chi2_vals, -1*k)[-1*k:]
  if classification == "p":
    k_best = np.argpartition(p_vals, k)[:k]

  columns = X.shape[1]
  for i in range(columns):
    index = columns - i - 1
    if index not in k_best:
      X = np.delete(X, index, 1)
  return X

# Model Implementation
Naive Bayes class implementation.

In [None]:
class NaiveBayes():
  def __init__(self, *, a=2, b=2, split=90, N=100) -> None:
     self.a = a
     self.b = b
     self.split = split
     self.N = int(N)

  def _map_pi_theta(self, X, y):
    a = self.a
    b = self.b
    N, vocab_size = X.shape[0], X.shape[1]
    pi = 0
    theta = np.zeros([vocab_size, 3])

    X_pizza = X[y == "Pizza"]
    X_sushi = X[y == "Sushi"]
    X_shawarma = X[y == "Shawarma"]

    N_pizza = X_pizza.shape[0]
    N_sushi = X_sushi.shape[0]
    N_shawarma = X_shawarma.shape[0]

    theta[:, 0] = (np.matmul(np.transpose(X_pizza), np.ones(N_pizza)) + a - 1) / (N_pizza + a + b - 2)
    theta[:, 1] = (np.matmul(np.transpose(X_sushi), np.ones(N_sushi)) + a - 1) / (N_sushi + a + b - 2)
    theta[:, 2] = (np.matmul(np.transpose(X_shawarma), np.ones(N_shawarma)) + a - 1) / (N_shawarma + a + b - 2)

    pi = [N_pizza/N, N_sushi/N, N_shawarma/N]

    return pi, theta

  def _training_subset(self, X, y):
    percent_split = self.split
    X_random = np.array(X.copy())
    y_random = np.array(y.copy())

    p = np.random.permutation(len(y_random))
    X_random, y_random = X_random[p], y_random[p]

    slice1 = int(np.floor(percent_split * len(y_random) / 100))
    return X_random[:slice1], y_random[:slice1]

  def _single_prediction(self, X, pi, theta, random):
    results = []

    log_shawarma = np.matmul(X, np.log(theta[:, 2])) + np.matmul(1-X, np.log(1-theta[:,2]))
    results.append(pi[2] * np.exp(log_shawarma))
    log_pizza = np.matmul(X, np.log(theta[:, 0])) + np.matmul(1-X, np.log(1-theta[:,0]))
    results.append(pi[0] * np.exp(log_pizza))
    log_sushi = np.matmul(X, np.log(theta[:, 1])) + np.matmul(1-X, np.log(1-theta[:,1]))
    results.append(pi[1] * np.exp(log_sushi))

    if random == True:
      y = self.argmax_with_tie_breaking(results)
    else:
      y = np.argmax(results, axis = 0)
    return y

  def argmax_with_tie_breaking(self, array):
    result = []
    for item in np.array(array).T:
      max_value = np.max(item)
      max_indices = np.where(item == max_value)[0]
      if len(max_indices) > 1:
        print(f"Tie detected for values: {item[max_indices]} at indices: {max_indices}")
      result.append(np.random.choice(max_indices))
    return np.array(result)

  def fit(self, X, y, sample_weight=None):
    #random.seed(35)
    pi_map = []
    theta_map = []
    N = self.N
    for i in range(N):
        X_batch, y_batch = self._training_subset(X, y)
        pi_map_temp, theta_map_temp = self._map_pi_theta(X_batch, y_batch)
        pi_map.append(pi_map_temp)
        theta_map.append(theta_map_temp)

    self.pi_map = np.mean(pi_map, axis=0)
    self.theta_map = np.mean(theta_map, axis=0)
    return self

  def predict(self, X, random=True):
    N = self.N
    pi = self.pi_map
    theta = self.theta_map
    y_temp = self._single_prediction(X, pi, theta, random)

    y_map = ["Shawarma" if x==0 else "Pizza" if x==1 else "Sushi" for x in y_temp]
    return np.array(y_map)

  def get_params(self, deep=True):
    return {"a": self.a, "b": self.b, "N": self.N, "split": self.split}

  def set_params(self, **parameters):
    for parameter, value in parameters.items():
      setattr(self, parameter, value)
    return self

Voting Classifier takes in a list of models and returns the majority vote when
making predictions.

In [None]:
class VotingClassifier2():
  def __init__(self, estimators, voting) -> None:
      self.models = {}
      self.num_models = 0
      for item in estimators:
        self.models[item[0]] = item[1]
        self.num_models += 1

  def fit(self, X, y):
    for item in self.models:
      self.models[item].fit(X, y)
    return self

  def argmax_with_tie_breaking(self, array):
    result = []
    for item in np.array(array).T:
      max_value = np.max(item)
      max_indices = np.where(item == max_value)[0]
      result.append(np.random.choice(max_indices))
    return np.array(result)

  def predict(self, X, random=True):
    predictions = []
    y_map_count = np.zeros([3, X.shape[0]])
    for key in self.models:
        model = self.models[key]
        y_temp = model.predict(X)
        for i in range(y_temp.shape[0]):
          value = y_temp[i]
          if value == "Pizza":
            y_map_count[0][i] += 1
          if value == "Sushi":
            y_map_count[1][i] += 1
          if value == "Shawarma":
            y_map_count[2][i] += 1
    if random:
      y_map = self.argmax_with_tie_breaking(y_map_count)
    else:
      y_map = np.argmax(y_map_count, axis = 0)
    y_map = ["Pizza" if x==0 else "Sushi" if x==1 else "Shawarma" for x in y_map]
    return np.array(y_map)

Return the accuracy of predictions.

In [None]:
def accuracy(y, t):
    return np.mean(y == t)

# Putting it all together

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2

# 1 Load Data
df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

# 2 Separate Features and Labels
X = df.drop(columns=["Label"])
y = df["Label"]

# 3 Identify Numeric and Text Columns
numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

# 4 Apply TF-IDF Transformation
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

# 5 Min-Max Scaling for Numeric Features
if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_temp = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_temp = df_tfidf

In [None]:
k_best = SelectKBest(chi2, k=500)
X_final = k_best.fit_transform(X_temp, y)
#X_final = select_k_best(X_temp, y, 750)
#X_final = np.array(X_temp)

# 7 Train-Test Split (Stratified Split)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# 8 Hyperparameter Tuning for Naive Bayes
'''
param_grid_nb = {'alpha': np.linspace(0.1, 1.0, 10)}  # Testing alpha values between 0.1 and 1.0
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=5, scoring="accuracy")
grid_search_nb.fit(X_train, y_train)

best_alpha = grid_search_nb.best_params_['alpha']
print(f"Best Alpha Found for Naive Bayes: {best_alpha}")

split=90, n_iter=100'''
param_grid_nb = {'split': np.linspace(10, 90, 9), 'N': [25, 50, 75, 100, 200, 250, 500]}
grid_search_nb = GridSearchCV(NaiveBayes(), param_grid_nb, cv=5, scoring="accuracy")
grid_search_nb.fit(X_train, y_train)

best_nb = grid_search_nb.best_params_
print(f"Best Params Found for Naive Bayes: {best_nb}")

Best Params Found for Naive Bayes: {'N': 75, 'split': np.float64(90.0)}


In [None]:
# 9 Hyperparameter Tuning for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring="accuracy")
grid_search_rf.fit(X_train, y_train)

best_rf_params = grid_search_rf.best_params_
print(f"Best Random Forest Parameters: {best_rf_params}")

# 10 Hyperparameter Tuning for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring="accuracy")
grid_search_lr.fit(X_train, y_train)

best_lr_params = grid_search_lr.best_params_
print(f"Best Logistic Regression Parameters: {best_lr_params}")

Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}




Best Logistic Regression Parameters: {'C': 10, 'solver': 'liblinear'}




In [None]:
# 11 Train Models with Best Hyperparameters
# nb_model = MultinomialNB(alpha=best_alpha)
nb_model = NaiveBayes(split=90, N=50)

In [None]:
# 12 Create Voting Classifier with Naive Bayes, Random Forest, and Logistic Regression
nb_model.fit(X_train, y_train)

# 13 Make Predictions
y_train_pred = nb_model.predict(X_train, random=True)
y_test_pred = nb_model.predict(X_test, random=True)

# 14 Evaluate Performance
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_test_pred)
print(cm)


# 13 Make Predictions
y_train_pred = nb_model.predict(X_train, random=False)
y_test_pred = nb_model.predict(X_test, random=False)

# 14 Evaluate Performance
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

cm = confusion_matrix(y_test, y_test_pred)
print(cm)



Training Accuracy: 0.9035
Testing Accuracy: 0.8947

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.85      0.95      0.90       164
    Shawarma       0.90      0.90      0.90       165
       Sushi       0.94      0.84      0.89       165

    accuracy                           0.89       494
   macro avg       0.90      0.89      0.89       494
weighted avg       0.90      0.89      0.89       494

[[155   6   3]
 [ 11 148   6]
 [ 16  10 139]]

Training Accuracy: 0.9035
Testing Accuracy: 0.8947

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.85      0.95      0.90       164
    Shawarma       0.90      0.90      0.90       165
       Sushi       0.94      0.84      0.89       165

    accuracy                           0.89       494
   macro avg       0.90      0.89      0.89       494
weighted avg       0.90      0.89      0.89       494

[[155   6   3]
 [ 

In [None]:
rf_model = RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
                                  max_depth=best_rf_params['max_depth'],
                                  min_samples_split=best_rf_params['min_samples_split'],
                                  random_state=42)
lr_model = LogisticRegression(C=best_lr_params['C'], solver=best_lr_params['solver'])

In [None]:
# 12 Create Voting Classifier with Naive Bayes, Random Forest, and Logistic Regression
voting_model = VotingClassifier2(estimators=[('nb', nb_model), ('rf', rf_model), ('lr', lr_model)], voting='hard')
voting_model.fit(X_train, y_train)

# 13 Make Predictions
y_train_pred = voting_model.predict(X_train)
y_test_pred = voting_model.predict(X_test)

# 14 Evaluate Performance
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print(y_test)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

213        Pizza
745        Pizza
758        Pizza
216        Pizza
1127    Shawarma
          ...   
1091    Shawarma
1080    Shawarma
121        Pizza
532        Sushi
1432       Sushi
Name: Label, Length: 494, dtype: object

Training Accuracy: 0.9678
Testing Accuracy: 0.9089

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.88      0.95      0.91       164
    Shawarma       0.92      0.90      0.91       165
       Sushi       0.94      0.88      0.91       165

    accuracy                           0.91       494
   macro avg       0.91      0.91      0.91       494
weighted avg       0.91      0.91      0.91       494



In [None]:
from NaiveBayes import NaiveBayesClassifier

nb_model = NaiveBayesClassifier(split=90, N=50)

nb_model.fit(X_train, y_train)

# 13 Make Predictions
y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

# 14 Evaluate Performance
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9035
Testing Accuracy: 0.8927

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.86      0.94      0.90       164
    Shawarma       0.90      0.90      0.90       165
       Sushi       0.93      0.84      0.89       165

    accuracy                           0.89       494
   macro avg       0.90      0.89      0.89       494
weighted avg       0.90      0.89      0.89       494



In [None]:
nb_model.save("nb_pretrained.pkl")

Success! Naive Bayes exported to nb_pretrained.pkl.


In [None]:
new_nb = NaiveBayesClassifier()
new_nb.load_pretrained("nb_pretrained.pkl")

# 13 Make Predictions
y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

# 14 Evaluate Performance
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

Success! Pre-trained Naive Bayes loaded from nb_pretrained.pkl.

Training Accuracy: 0.9035
Testing Accuracy: 0.8927

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.86      0.94      0.90       164
    Shawarma       0.90      0.90      0.90       165
       Sushi       0.93      0.84      0.89       165

    accuracy                           0.89       494
   macro avg       0.90      0.89      0.89       494
weighted avg       0.90      0.89      0.89       494



In [None]:
from Voting import VotingClassifier as Voting

# 12 Create Voting Classifier with Naive Bayes, Random Forest, and Logistic Regression
voting_model = Voting(estimators=[('nb', new_nb), ('rf', rf_model), ('lr', lr_model)], voting='hard')
voting_model.fit(X_train, y_train)

# 13 Make Predictions
y_train_pred = voting_model.predict(X_train)
y_test_pred = voting_model.predict(X_test)

# 14 Evaluate Performance
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9687
Testing Accuracy: 0.9109

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.87      0.96      0.92       164
    Shawarma       0.92      0.89      0.91       165
       Sushi       0.94      0.88      0.91       165

    accuracy                           0.91       494
   macro avg       0.91      0.91      0.91       494
weighted avg       0.91      0.91      0.91       494

