In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tabulate import tabulate


In [None]:
# Read the raw CSV into `df` and print its shape plus a three-row preview.
print("Loading dataset...")
dataset_path = "/Dataset .csv"
df = pd.read_csv(dataset_path)
print(f"Dataset loaded. Shape: {df.shape}")
preview_rows = df.head(3)
print(preview_rows)


Loading dataset...
Dataset loaded. Shape: (9551, 21)
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   

                                    Locality Verbose   Longitude   Latitude  \
0  Century City Mall, Poblacion, Makati City, Mak...  121.027535  14.565443   
1  Little Tokyo, Legaspi Village, Makati City, Ma...  121.0141

In [None]:
print("\nPreprocessing data...")

# Smallest number of rows a primary cuisine must have to be kept. The
# stratified train/test split later in this notebook needs at least two
# samples per class, so singletons are removed here.
MIN_SAMPLES_PER_CUISINE = 2

# Drop rows where 'Cuisines' is missing
df = df.dropna(subset=['Cuisines'])

# Extract primary cuisine (first entry of the comma-separated list).
# The vectorized .str accessor replaces a per-row Python lambda, and
# .assign() builds a new frame rather than setting a column on the frame
# returned by dropna (which older pandas versions may flag with
# SettingWithCopyWarning).
df = df.assign(
    Primary_Cuisine=df['Cuisines'].str.split(',').str[0].str.strip()
)

# Remove cuisines with very few samples
cuisine_counts = df['Primary_Cuisine'].value_counts()
frequent_cuisines = cuisine_counts[cuisine_counts >= MIN_SAMPLES_PER_CUISINE].index
df = df[df['Primary_Cuisine'].isin(frequent_cuisines)]



Preprocessing data...


In [None]:
# Columns used as model inputs; the target is the primary cuisine label.
features = [
    'Country Code',
    'City',
    'Has Table booking',
    'Has Online delivery',
    'Price range',
    'Aggregate rating',
    'Votes',
]

# Copy the feature columns so later edits to X do not touch df.
X = df.loc[:, features].copy()
y = df['Primary_Cuisine']


In [None]:
# Impute gaps in the int64/float64 feature columns with each column's median.
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
for name in numeric_columns:
    column_median = X[name].median()
    X.loc[:, name] = X[name].fillna(column_median)

# Expand the categorical features into indicator columns; drop_first=True
# drops the first level of each encoded column.
categorical_columns = ['City', 'Has Table booking', 'Has Online delivery']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Map cuisine names to integer class ids. `le` is kept so the ids can be
# turned back into names when reporting results.
le = LabelEncoder()
y_encoded = le.fit(y).transform(y)


In [None]:
# Hold out 20% of the rows for testing, stratified on the encoded cuisine
# label, with a fixed seed so the split is repeatable.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_encoded,
}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)


In [None]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
print("\nTraining model...")
forest_options = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_options)
model.fit(X_train, y_train)



Training model...


In [None]:
# Predict on the held-out split and report the share of exact matches as a percentage.
y_pred = model.predict(X_test)
is_correct = y_pred == y_test
acc = is_correct.mean() * 100
print(f"Accuracy: {acc:.2f} %")


Accuracy: 25.46 %


In [None]:
print("\nClassification Report:")

# Restrict the report to class ids that occur in the true labels or the
# predictions, so `labels` and `target_names` line up one-to-one.
present_labels = sorted(set(y_test).union(y_pred))
present_names = le.inverse_transform(present_labels)

report_text = classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=present_names,
    zero_division=0,
)
print(report_text)



Classification Report:
                   precision    recall  f1-score   support

          Afghani       0.00      0.00      0.00         1
         American       0.18      0.16      0.17        56
           Andhra       1.00      1.00      1.00         1
          Arabian       0.00      0.00      0.00         1
            Asian       0.00      0.00      0.00        15
         Assamese       0.00      0.00      0.00         1
           Awadhi       0.00      0.00      0.00         1
              BBQ       0.00      0.00      0.00         4
           Bakery       0.11      0.09      0.10       124
         Bar Food       0.00      0.00      0.00         2
          Bengali       0.00      0.00      0.00         4
        Beverages       0.00      0.00      0.00        16
           Bihari       0.00      0.00      0.00         1
          Biryani       0.09      0.05      0.06        22
        Brazilian       0.33      0.50      0.40         4
        Breakfast       0.00   

In [None]:
# Confusion matrix over the labels present in the test split / predictions;
# rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=present_labels)

# Collect every off-diagonal cell with at least one sample: each is an
# (actual, predicted) pair the model confused.
label_count = len(present_labels)
similarities = [
    {
        'Actual Cuisine': present_names[row],
        'Predicted Cuisine': present_names[col],
        'Misclassified Count': cm[row, col],
    }
    for row in range(label_count)
    for col in range(label_count)
    if row != col and cm[row, col] > 0
]

# Order by misclassification count, largest first (stable for ties),
# and keep the ten most frequent confusions.
similarities = sorted(
    similarities,
    key=lambda pair: pair['Misclassified Count'],
    reverse=True,
)
top_similarities = similarities[:10]

table_headers = ["Actual Cuisine", "Predicted Cuisine", "Misclassified Count"]
table_rows = [[pair[header] for header in table_headers] for pair in top_similarities]

print("\nMost Similar Cuisine Pairs:")
print(tabulate(table_rows, headers=table_headers, tablefmt="grid"))



Most Similar Cuisine Pairs:
+------------------+---------------------+-----------------------+
| Actual Cuisine   | Predicted Cuisine   |   Misclassified Count |
| Chinese          | North Indian        |                    94 |
+------------------+---------------------+-----------------------+
| Fast Food        | North Indian        |                    67 |
+------------------+---------------------+-----------------------+
| Bakery           | North Indian        |                    65 |
+------------------+---------------------+-----------------------+
| Cafe             | North Indian        |                    50 |
+------------------+---------------------+-----------------------+
| Mughlai          | North Indian        |                    33 |
+------------------+---------------------+-----------------------+
| North Indian     | Chinese             |                    33 |
+------------------+---------------------+-----------------------+
| Mithai           | North Indian