In [1]:
#Task 3: This is the third task of the Machine Learning internship project provided by the company Cognifyz. 
**Github: ** https://github.com/alinakhan92/Restaurant-Rating-Prediction-Recommendation---cuisine-classification

**June-july 2024**

Objective: Develop a machine learning model to classify restaurants based on their cuisines.

Steps:

Preprocess the dataset by handling missing values and encoding categorical variables.
Split the data into training and testing sets.
Select a classification algorithm (e.g., logistic regression, random forest) and train it on the training data.
Evaluate the model's performance using appropriate classification metrics (e.g., accuracy, precision, recall) on the testing data.
Analyze the model's performance across different cuisines and identify any challenges or biases.

In [2]:
###Import Libraries and Load Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# mount google drive
# load dataframe
file_path = 'Dataset .csv'
df = pd.read_csv('Dataset .csv')
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [4]:
###Data Preprocessing and Splitting

In [5]:
# removing features that will inhibit model training
df.drop('Restaurant ID', axis=1, inplace=True)
df.drop('Country Code', axis=1, inplace=True)
df.drop('City', axis=1, inplace=True)
df.drop('Address', axis=1, inplace=True)
df.drop('Locality', axis=1, inplace=True)
df.drop('Locality Verbose', axis=1, inplace=True)
df.drop('Longitude', axis=1, inplace=True)
df.drop('Latitude', axis=1, inplace=True)
df.drop('Currency', axis=1, inplace=True)
df.drop('Has Table booking', axis=1, inplace=True)
df.drop('Has Online delivery', axis=1, inplace=True)
df.drop('Is delivering now', axis=1, inplace=True)
df.drop('Switch to order menu', axis=1, inplace=True)
df.drop('Price range', axis=1, inplace=True)
df.drop('Aggregate rating', axis=1, inplace=True)
df.drop('Rating color', axis=1, inplace=True)
df.drop('Rating text', axis=1, inplace=True)
df.drop('Votes', axis=1, inplace=True)

In [6]:
#handle missing values
df.isna().sum()

Restaurant Name         0
Cuisines                9
Average Cost for two    0
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
df.shape

(9542, 3)

In [9]:
df.describe(include="all")

Unnamed: 0,Restaurant Name,Cuisines,Average Cost for two
count,9542,9542,9542.0
unique,7437,1825,
top,Cafe Coffee Day,North Indian,
freq,83,936,
mean,,,1200.326137
std,,,16128.743876
min,,,0.0
25%,,,250.0
50%,,,400.0
75%,,,700.0


In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Restaurant Name'] = label_encoder.fit_transform(df['Restaurant Name'])
df['Cuisines'] = label_encoder.fit_transform(df['Cuisines'])
df

Unnamed: 0,Restaurant Name,Cuisines,Average Cost for two
0,3742,920,1100
1,3167,1111,1200
2,2892,1671,4000
3,4700,1126,1500
4,5515,1122,1500
...,...,...,...
9546,4436,1813,80
9547,1310,1824,105
9548,3063,1110,170
9549,512,1657,120


In [11]:
x = df.drop('Cuisines',axis=1)
y = df['Cuisines']

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x=scaler.fit_transform(x)

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=15)

In [14]:
###Random Forest Model

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
model_rfc = RandomForestClassifier(n_estimators=100, random_state=42)

model_rfc.fit(x_train, y_train)

rfc_pred = model_rfc.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [18]:
accuracy = accuracy_score(y_test, rfc_pred)
print(f"Accuracy: {accuracy:.2f}")

# Precision, recall, F1-score
precision = precision_score(y_test, rfc_pred, average='micro')
recall = recall_score(y_test, rfc_pred, average='micro')
f1 = f1_score(y_test, rfc_pred, average='micro')
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Accuracy: 0.23
Precision: 0.23
Recall: 0.23
F1-score: 0.23


In [19]:
#cfm = confusion_matrix(y_test, rfc_pred)
#print(cfm)

In [20]:
###Logistic Regression Model

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
classifier_logreg = LogisticRegression(multi_class="multinomial")
classifier_logreg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
logreg_pred = classifier_logreg.predict(x_test)
print(logreg_pred)

[1306 1306 1306 ... 1306 1306 1306]


In [24]:
accuracy = accuracy_score(y_test, logreg_pred)
print(f"Accuracy: {accuracy:.2f}")

# Precision, recall, F1-score
precision = precision_score(y_test, logreg_pred, average='micro')
recall = recall_score(y_test, logreg_pred, average='micro')
f1 = f1_score(y_test, logreg_pred, average='micro')
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Accuracy: 0.10
Precision: 0.10
Recall: 0.10
F1-score: 0.10


In [25]:
##Conclusion:

In [26]:
#* On comparison, we can conclude that Random Forest performs better on our model than logistic regression.
#* Despite repeatedly trying my best on preprocessing and model selection, model performance could not be elevated beyond the current accuracy score.
#* This might be because of some underlying biases either in the model training or the dataset itself.
