In [42]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
# Location of the dataset CSV. The original machine-specific path is kept as
# the default so existing runs behave the same; set the DATASET_CSV environment
# variable to point the notebook at the file on another machine.
DATA_PATH = os.environ.get('DATASET_CSV', r'C:\Users\VSS\Desktop\ML projects\Dataset.csv')
df = pd.read_csv(DATA_PATH)

# checking the data (first rows are shown as the cell output)
df.head()

In [None]:
# Descriptive statistics of the DataFrame's columns (shown as the cell output)
df.describe()

In [None]:
# Analyze missing values: per-column null count and its share of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Emit both tables in one call; sep="\n" reproduces the line-by-line layout
print(
    "Missing Values Count:",
    missing_values,
    "\nMissing Values Percentage:",
    missing_percentage,
    sep="\n",
)

In [None]:
# Fill missing cuisines with 'Unknown'
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form operates on an intermediate object, which emits
# a FutureWarning in pandas 2.x and does not update `df` under Copy-on-Write.
df['Cuisines'] = df['Cuisines'].fillna('Unknown')

In [47]:
# Encode categorical features

# Encoding Cuisines as the target variable.
# `label_encoder` is fitted on Cuisines only, so `label_encoder.classes_`
# keeps holding the cuisine names; re-fitting the same encoder on other
# columns would overwrite them with that column's categories.
label_encoder = LabelEncoder()
df['Cuisines_encoded'] = label_encoder.fit_transform(df['Cuisines'])

# Encoding other categorical features, each with its own dedicated encoder
# (kept in `feature_encoders` so any column can be decoded later).
categorical_features = ['City', 'Currency', 'Has Table booking', 'Has Online delivery']
feature_encoders = {}
for column in categorical_features:
    feature_encoders[column] = LabelEncoder()
    df[f'{column}_encoded'] = feature_encoders[column].fit_transform(df[column])


In [48]:
# Feature matrix: the encoded categorical columns plus the numeric columns
feature_columns = [
    'City_encoded',
    'Currency_encoded',
    'Has Table booking_encoded',
    'Has Online delivery_encoded',
    'Price range',
    'Aggregate rating',
    'Votes',
]
X = df[feature_columns]

# Target: the integer-encoded cuisine label
y = df['Cuisines_encoded']

In [None]:
# Check for class imbalance: row count per cuisine value, categories on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=df['Cuisines'], ax=ax)
ax.set_title("Distribution of Cuisines")
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [50]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Tune a Random Forest classifier with an exhaustive grid search
rf_classifier = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values explored by the search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# 3-fold cross-validation, using all available cores, with per-fit progress logging
grid_search_rf = GridSearchCV(
    rf_classifier,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)


In [16]:
# Make predictions on the test set with the fitted grid search
# (GridSearchCV.predict uses the best estimator found, refitted on the training data)
y_pred_rf = grid_search_rf.predict(X_test)


In [None]:
# Evaluate the Random Forest model's performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report only the classes that actually occur in the test labels or the
# predictions, and look their names up from the data itself. Passing every
# encoder class as `target_names` raises a ValueError whenever the count
# differs from the classes present in this split.
labels_rf = sorted(set(y_test) | set(y_pred_rf))
cuisine_lookup = (
    df.drop_duplicates(subset='Cuisines_encoded')
    .set_index('Cuisines_encoded')['Cuisines']
)
classification_rep_rf = classification_report(
    y_test,
    y_pred_rf,
    labels=labels_rf,
    target_names=cuisine_lookup.loc[labels_rf].tolist(),
    zero_division=0,  # classes never predicted score 0 without a warning per class
)

print(f"Random Forest Accuracy: {accuracy_rf}")
print("Random Forest Classification Report:")
print(classification_rep_rf)

In [None]:
# Initialize and train the XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train)


In [None]:
# Make predictions on the test set using XGBoost
y_pred_xgb = xgb_classifier.predict(X_test)

In [None]:
# Evaluate the XGBoost model's performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Report only the classes that actually occur in the test labels or the
# predictions, and look their names up from the data itself. Passing every
# encoder class as `target_names` raises a ValueError whenever the count
# differs from the classes present in this split.
labels_xgb = sorted(set(y_test) | set(y_pred_xgb))
cuisine_lookup = (
    df.drop_duplicates(subset='Cuisines_encoded')
    .set_index('Cuisines_encoded')['Cuisines']
)
classification_rep_xgb = classification_report(
    y_test,
    y_pred_xgb,
    labels=labels_xgb,
    target_names=cuisine_lookup.loc[labels_xgb].tolist(),
    zero_division=0,  # classes never predicted score 0 without a warning per class
)

print(f"XGBoost Accuracy: {accuracy_xgb}")
print("XGBoost Classification Report:")
print(classification_rep_xgb)