# IMPORTING NECESSARY LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn import set_config
set_config(display='diagram')
from sklearn.preprocessing import StandardScaler


# PRE-PROCESSING ON DATASETS

In [2]:
df1 = pd.read_csv('Big Basket Food Delivery.csv', encoding='ISO-8859-1')
df2 = pd.read_excel('Country-Code.xlsx')

df = pd.merge(df1, df2, on='Country Code')

df.to_csv('merged_file.csv')

df = df.drop(["Restaurant Name", "City", "Locality", "Locality Verbose", "Switch to order menu", "Rating color", "Rating text"], axis=1)

df.isnull().sum()

# Create a pipeline
pipeline1 = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan))])
# Apply the pipeline to the 'Cuisines' column
df['Cuisines'] = pipeline1.fit_transform(df[['Cuisines']])


# Replace the '?' values with NaN
df = df.replace(['?', '*', '&'], np.nan)

pipeline2 = Pipeline([
    ("replace_non_numeric", SimpleImputer(strategy="constant", fill_value=np.nan)),
    ("impute_avg_cost", SimpleImputer(strategy="median", missing_values=np.nan)),
    ("impute_price_range", SimpleImputer(strategy="median", missing_values=-1)),
])

df[["Average Cost for two", "Price range"]] = pipeline2.fit_transform(df[["Average Cost for two", "Price range"]])

df.isnull().sum()


Restaurant ID           0
Country Code            0
Address                 0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Price range             0
Aggregate rating        0
Votes                   0
Country                 0
dtype: int64

In [3]:
pipeline1

In [4]:
pipeline2

# MAPPING CATEGORICAL COLUMNS

In [5]:
# create a dictionary to map the categorical variables
cat_mapping = {
    "Yes": 1,
    "No": 0
}

# Define the pipeline
def map_cats(X):
    return np.vectorize(cat_mapping.get)(X)

pipeline3 = Pipeline([
    ("map_cats", FunctionTransformer(map_cats)),
])

# Apply the pipeline to the dataframe
df[["Has Table booking", "Has Online delivery", "Is delivering now"]] = pipeline3.fit_transform(df[["Has Table booking", "Has Online delivery", "Is delivering now"]])


In [6]:
pipeline3

# CONVERTING REMAINING CATEGORICAL COLUMNS TO NUMERICAL COLUMNS

In [7]:
# Define the functions to factorize columns
factorize_columns = FunctionTransformer(lambda X: pd.DataFrame({
    "Address": pd.factorize(X["Address"])[0],
    "Cuisines": pd.factorize(X["Cuisines"])[0],
    "Currency": pd.factorize(X["Currency"])[0],
    "Country": pd.factorize(X["Country"])[0]
}, columns=["Address", "Cuisines", "Currency", "Country"]))

# Define the pipeline
pipeline4 = Pipeline([    ("factorize_columns", factorize_columns)])

# Apply the pipeline to the dataframe
df[["Address", "Cuisines", "Currency", "Country"]] = pipeline4.fit_transform(df)

In [8]:
pipeline4

# OUTLIERS TREATEMENT

In [9]:
def cap_data(df):
    for col in df.columns:
        print("capping the", col)
        if df[col].dtype == 'float64' or df[col].dtype == 'int64':
            percentiles = df[col].quantile([0.20, 0.80]).values
            df.loc[df[col] <= percentiles[0], col] = percentiles[0]
            df.loc[df[col] >= percentiles[1], col] = percentiles[1]
        else:
            df[col] = df[col]
    return df

pipeline5 = Pipeline([
    ("cap_data", FunctionTransformer(cap_data)),
])

df1 = pipeline5.fit_transform(df)

capping the Restaurant ID
capping the Country Code
capping the Address
capping the Longitude
capping the Latitude
capping the Cuisines
capping the Average Cost for two
capping the Currency
capping the Has Table booking
capping the Has Online delivery
capping the Is delivering now
capping the Price range
capping the Aggregate rating
capping the Votes
capping the Country


In [10]:
pipeline5

# CREATING PREPROCESSOR PIPELINE

In [11]:
from sklearn.pipeline import make_pipeline

preprocessor = make_pipeline(
    pipeline1,
    pipeline2,
    pipeline3,
    pipeline4,
    pipeline5,
    StandardScaler()
)


In [12]:
preprocessor

# INPUT OUTPUT COLUMN SEAPARATION

In [13]:
#splitting data into independent and dependant or target variables
X = df1.drop(['Aggregate rating'], axis=1)
y = df1['Aggregate rating']

# TRAIN TEST SPLIT

In [14]:
#from sklearn.model_selection import train_test_split
# assuming X is your feature data and y is your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [15]:
knn = KNeighborsRegressor(n_neighbors = 7)
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor(n_neighbors = 7))
])


In [16]:
knn_pipeline

In [17]:
#fitting the model
knn.fit(X_train,y_train)

In [18]:
y_pred = knn.predict(X_test)
y_pred

array([2.74285714, 1.35714286, 3.2       , ..., 1.87142857, 3.01428571,
       3.8       ])

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# Calculate RMSE
rmse = sqrt(mse)
print("RMSE:", rmse)


MSE: 1.2870737620008756
RMSE: 1.1344927333398287
