Load the dataset into a Pandas DataFrame

In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

file_path = "/content/drive/My Drive/cleaned_data_combined_modified.csv"
df = pd.read_csv(file_path)

df.head()


Mounted at /content/drive


Unnamed: 0,id,"Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",Q2: How many ingredients would you expect this food item to contain?,Q3: In what setting would you expect this food to be served? Please check all that apply,Q4: How much would you expect to pay for one serving of this food item?,Q5: What movie do you think of when thinking of this food item?,Q6: What drink would you pair with this food item?,"Q7: When you think about this food item, who does it remind you of?",Q8: How much hot sauce would you add to this food item?,Label
0,716549,3,6,"Week day lunch,At a party,Late night snack",5,Cloudy with a Chance of Meatballs,Coke,Friends,A little (mild),Pizza
1,715742,4,"bread, meet","Week day lunch,At a party,Late night snack",5$ for a large piece,All sort of american young boy movies,Coke,"Friends,Teachers,Strangers",,Pizza
2,727333,3,5,"Week day lunch,Week day dinner,Weekend lunch,W...",10dollar,action movie,cola,Friends,A moderate amount (medium),Pizza
3,606874,4,6-7,"Week day lunch,Week day dinner,Weekend lunch,W...",$3,Mamma Mia,Soda,"Siblings,Friends,Teachers",I will have some of this food item with my hot...,Pizza
4,505318,2,3 or more,"Week day lunch,Week day dinner,Weekend lunch,W...",$5,Cloudy with a chance of meatballs,Soda,"Siblings,Friends",A little (mild),Pizza


Clean Data with **Numercial Values** (Q1, Q2, Q4)

Clean Data for **Categorical Columns** (Q3, Q5, Q6, Q7, Q8)

BoW

In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("/content/drive/My Drive/cleaned_data_combined_modified.csv")

def extract_numeric(value):
    if pd.isnull(value):
        return None
    value = str(value).strip().lower()
    value = re.sub(r'[^\d\.\-]', ' ', value)
    value = re.sub(r'\s+', ' ', value).strip()

    if '-' in value:
        numbers = [float(num) for num in value.split('-') if num.strip().isdigit()]
        if numbers:
            return sum(numbers) / len(numbers)

    match = re.search(r'\d+(\.\d+)?', value)
    return float(match.group()) if match else None

numerical_columns = [
    "Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",
    "Q2: How many ingredients would you expect this food item to contain?",
    "Q4: How much would you expect to pay for one serving of this food item?"
]

for col in numerical_columns:
    df[col] = df[col].apply(extract_numeric)

df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

text_cols = ["Q3: In what setting would you expect this food to be served? Please check all that apply",
             "Q5: What movie do you think of when thinking of this food item?",
             "Q6: What drink would you pair with this food item?",
             "Q7: When you think about this food item, who does it remind you of?"]

df[text_cols] = df[text_cols].fillna("none").astype(str).apply(lambda x: x.str.lower().str.strip())

vectorizer_q3 = CountVectorizer(binary=True)
vectorizer_q5 = CountVectorizer(binary=True)
vectorizer_q6 = CountVectorizer(binary=True)
vectorizer_q7 = CountVectorizer(binary=True)

Q3_bow = vectorizer_q3.fit_transform(df["Q3: In what setting would you expect this food to be served? Please check all that apply"])
Q5_bow = vectorizer_q5.fit_transform(df["Q5: What movie do you think of when thinking of this food item?"])
Q6_bow = vectorizer_q6.fit_transform(df["Q6: What drink would you pair with this food item?"])
Q7_bow = vectorizer_q7.fit_transform(df["Q7: When you think about this food item, who does it remind you of?"])

df_q3_bow = pd.DataFrame(Q3_bow.toarray(), columns=[f"Q3_{word}" for word in vectorizer_q3.get_feature_names_out()])
df_q5_bow = pd.DataFrame(Q5_bow.toarray(), columns=[f"Q5_{word}" for word in vectorizer_q5.get_feature_names_out()])
df_q6_bow = pd.DataFrame(Q6_bow.toarray(), columns=[f"Q6_{word}" for word in vectorizer_q6.get_feature_names_out()])
df_q7_bow = pd.DataFrame(Q7_bow.toarray(), columns=[f"Q7_{word}" for word in vectorizer_q7.get_feature_names_out()])

df = pd.concat([df, df_q3_bow, df_q5_bow, df_q6_bow, df_q7_bow], axis=1)

df.drop(columns=text_cols, inplace=True)

hot_sauce_map = {
    "A little (mild)": "Mild",
    "A moderate amount (medium)": "Medium",
    "A lot (hot)": "Hot",
    "I will have some of this food item with my hot sauce": "Medium"
}

df["Q8_cleaned"] = df["Q8: How much hot sauce would you add to this food item?"].map(hot_sauce_map)
df["Q8_cleaned"].fillna("None", inplace=True)
df = pd.get_dummies(df, columns=["Q8_cleaned"])
df.drop(columns=["Q8: How much hot sauce would you add to this food item?"], inplace=True)

df.to_csv("/content/drive/My Drive/cleaned_data_bow.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Q8_cleaned"].fillna("None", inplace=True)


**Softmax Regression!! **

Split the data into training and testing (70-30)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/drive/My Drive/cleaned_data_final.csv")

X = df.drop(columns=["Label"])
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training Set: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing Set: X_test={X_test.shape}, y_test={y_test.shape}")

softmax_model = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=1000)
softmax_model.fit(X_train_scaled, y_train)

y_train_pred = softmax_model.predict(X_train_scaled)
y_test_pred = softmax_model.predict(X_test_scaled)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Set: X_train=(1150, 1342), y_train=(1150,)
Testing Set: X_test=(494, 1342), y_test=(494,)




Training Accuracy: 0.9965
Testing Accuracy: 0.8623
Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.82      0.88      0.85       164
    Shawarma       0.87      0.87      0.87       165
       Sushi       0.90      0.83      0.86       165

    accuracy                           0.86       494
   macro avg       0.86      0.86      0.86       494
weighted avg       0.86      0.86      0.86       494



Naive Bayes

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")

df = df.dropna(subset=["Label"])

X = df.drop(columns=["Label"])
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

nb_model = MultinomialNB(alpha=0.4)
nb_model.fit(X_train, y_train)

y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9235
Testing Accuracy: 0.8887
Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.86      0.90      0.88       164
    Shawarma       0.86      0.90      0.88       165
       Sushi       0.95      0.87      0.91       165

    accuracy                           0.89       494
   macro avg       0.89      0.89      0.89       494
weighted avg       0.89      0.89      0.89       494



TF-IDF
- Converted BoW features into TF-IDF scores using TfidfTransformer()
- Scaled numerical features using MinMaxScaler()
- Merged TF-IDF and scaled numerical data into one dataset

Think of TF-IDF (Term Frequency - Inverse Document Frequency) as a way to measure how important a word is in a dataset. Instead of just counting words (like BoW), it weights words based on how unique they are across different data points

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

X = df.drop(columns=["Label"])
y = df["Label"]

numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_final = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_final = df_tfidf

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

nb_model = MultinomialNB(alpha=0.4)
nb_model.fit(X_train, y_train)

y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9374
Testing Accuracy: 0.8927

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.86      0.92      0.89       164
    Shawarma       0.90      0.92      0.91       165
       Sushi       0.93      0.84      0.88       165

    accuracy                           0.89       494
   macro avg       0.89      0.89      0.89       494
weighted avg       0.89      0.89      0.89       494



MultinomialNB: Works well when each class has similar numbers of examples.

**ComplementNB**: Works well when classes are imbalanced (gives better weight to smaller classes).

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

X = df.drop(columns=["Label"])
y = df["Label"]

numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_final = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_final = df_tfidf

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

nb_model = ComplementNB(alpha=0.4)
nb_model.fit(X_train, y_train)

y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9348
Testing Accuracy: 0.8785

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.84      0.88      0.86       164
    Shawarma       0.87      0.87      0.87       165
       Sushi       0.92      0.88      0.90       165

    accuracy                           0.88       494
   macro avg       0.88      0.88      0.88       494
weighted avg       0.88      0.88      0.88       494



**Feature Selection**

Chi-Square Feature Selection (SelectKBest)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

X = df.drop(columns=["Label"])
y = df["Label"]

numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_final = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_final = df_tfidf

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

k_best = SelectKBest(chi2, k=300)
X_train_selected = k_best.fit_transform(X_train, y_train)
X_test_selected = k_best.transform(X_test)

nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train_selected, y_train)

y_train_pred = nb_model.predict(X_train_selected)
y_test_pred = nb_model.predict(X_test_selected)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9217
Testing Accuracy: 0.8927

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.86      0.93      0.89       164
    Shawarma       0.90      0.88      0.89       165
       Sushi       0.92      0.87      0.89       165

    accuracy                           0.89       494
   macro avg       0.89      0.89      0.89       494
weighted avg       0.89      0.89      0.89       494



Bigrams

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

X = df.drop(columns=["Label"])
y = df["Label"]

numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

X_text = X[bow_columns].apply(lambda row: " ".join(row.astype(str)), axis=1)

tfidf = TfidfVectorizer(ngram_range=(1, 4))
X_tfidf = tfidf.fit_transform(X_text)
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out(), index=X.index)

if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_final = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_final = df_tfidf

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

nb_model = MultinomialNB(alpha=0.3)
nb_model.fit(X_train, y_train)

y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Training Accuracy: 0.9409
Testing Accuracy: 0.8907

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.87      0.91      0.89       164
    Shawarma       0.88      0.92      0.90       165
       Sushi       0.93      0.85      0.89       165

    accuracy                           0.89       494
   macro avg       0.89      0.89      0.89       494
weighted avg       0.89      0.89      0.89       494



In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2

# Load Data
df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

# Separate Features and Labels
X = df.drop(columns=["Label"])
y = df["Label"]

# Identify Numeric and Text Columns
numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

# Apply TF-IDF Transformation
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

# Min-Max Scaling for Numeric Features
if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_final = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_final = df_tfidf

#Feature Selection: Select the top 700 features
k_best = SelectKBest(chi2, k=700)
X_final = k_best.fit_transform(X_final, y)

# Train-Test Split (Stratified Split)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

# Hyperparameter Tuning for Alpha (Fine-tuning with a wider range)
param_grid = {'alpha': np.linspace(0.1, 1.0, 10)}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['alpha']
print(f"Best Alpha Found: {best_alpha}")

# Train Naïve Bayes Model with Best Alpha
nb_model = MultinomialNB(alpha=best_alpha)
nb_model.fit(X_train, y_train)

# Make Predictions
y_train_pred = nb_model.predict(X_train)
y_test_pred = nb_model.predict(X_test)

# Evaluate Performance
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Best Alpha Found: 0.1

Training Accuracy: 0.9357
Testing Accuracy: 0.9028

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.90      0.90      0.90       164
    Shawarma       0.89      0.91      0.90       165
       Sushi       0.92      0.90      0.91       165

    accuracy                           0.90       494
   macro avg       0.90      0.90      0.90       494
weighted avg       0.90      0.90      0.90       494



**Multiple Models**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2

# 1 Load Data
df = pd.read_csv("/content/drive/My Drive/cleaned_data_bow.csv")
df = df.dropna(subset=["Label"])

# 2 Separate Features and Labels
X = df.drop(columns=["Label"])
y = df["Label"]

# 3 Identify Numeric and Text Columns
numeric_columns = [col for col in X.columns if X[col].dtype in ["int64", "float64"]]
bow_columns = [col for col in X.columns if col not in numeric_columns]

# 4 Apply TF-IDF Transformation
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X[bow_columns])
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=bow_columns, index=X.index)

# 5 Min-Max Scaling for Numeric Features
if numeric_columns:
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(X[numeric_columns]), columns=numeric_columns, index=X.index)
    X_final = pd.concat([df_tfidf, df_scaled], axis=1)
else:
    X_final = df_tfidf

# 6 Feature Selection: Select the top 500 features
k_best = SelectKBest(chi2, k=500)
X_final = k_best.fit_transform(X_final, y)

# 7 Train-Test Split (Stratified Split)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.3, random_state=42, stratify=y)

# 8 Hyperparameter Tuning for Naive Bayes (Fine-tuning alpha)
param_grid_nb = {'alpha': np.linspace(0.1, 1.0, 10)}  # Testing alpha values between 0.1 and 1.0
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=5, scoring="accuracy")
grid_search_nb.fit(X_train, y_train)

best_alpha = grid_search_nb.best_params_['alpha']
print(f"Best Alpha Found for Naive Bayes: {best_alpha}")

# 9 Hyperparameter Tuning for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring="accuracy")
grid_search_rf.fit(X_train, y_train)

best_rf_params = grid_search_rf.best_params_
print(f"Best Random Forest Parameters: {best_rf_params}")

# 10 Hyperparameter Tuning for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring="accuracy")
grid_search_lr.fit(X_train, y_train)

best_lr_params = grid_search_lr.best_params_
print(f"Best Logistic Regression Parameters: {best_lr_params}")

# 11 Train Models with Best Hyperparameters
nb_model = MultinomialNB(alpha=best_alpha)
rf_model = RandomForestClassifier(n_estimators=best_rf_params['n_estimators'],
                                  max_depth=best_rf_params['max_depth'],
                                  min_samples_split=best_rf_params['min_samples_split'],
                                  random_state=42)
lr_model = LogisticRegression(C=best_lr_params['C'], solver=best_lr_params['solver'])

# 12 Create Voting Classifier with Naive Bayes, Random Forest, and Logistic Regression
voting_model = VotingClassifier(estimators=[('nb', nb_model), ('rf', rf_model), ('lr', lr_model)], voting='hard')
voting_model.fit(X_train, y_train)

# 13 Make Predictions
y_train_pred = voting_model.predict(X_train)
y_test_pred = voting_model.predict(X_test)

# 14 Evaluate Performance
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Best Alpha Found for Naive Bayes: 0.1
Best Random Forest Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}




Best Logistic Regression Parameters: {'C': 10, 'solver': 'liblinear'}

Training Accuracy: 0.9626
Testing Accuracy: 0.9211

Classification Report (Test Set):
              precision    recall  f1-score   support

       Pizza       0.89      0.98      0.93       164
    Shawarma       0.93      0.90      0.91       165
       Sushi       0.95      0.89      0.92       165

    accuracy                           0.92       494
   macro avg       0.92      0.92      0.92       494
weighted avg       0.92      0.92      0.92       494

