In [3]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import requests
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt 

# Suppress all warnings emitted after this point.
warnings.filterwarnings('ignore')

# Both workbooks sit in the same remote folder; only the file name differs,
# so the common prefix is spelled out once.
_DATASET_FOLDER_URL = (
    "https://github.com/FlipRoboTechnologies/ML-Datasets/raw/main/"
    "Restaurant%20Food%20Cost"
)
train_url = f"{_DATASET_FOLDER_URL}/Data_Train.xlsx"
test_url = f"{_DATASET_FOLDER_URL}/Data_Test.xlsx"

# Helper that reads an Excel file from a URL into a DataFrame
def read_excel_from_url(url, timeout=30):
    """Download an Excel workbook over HTTP and load it into a DataFrame.

    Args:
        url: Address of the Excel file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The DataFrame produced by ``pd.read_excel`` from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of handing an error page to
    # the Excel parser, which would surface as a confusing parse failure.
    response.raise_for_status()
    return pd.read_excel(BytesIO(response.content))

# Download the training and test workbooks (in that order) into DataFrames
train, test = (read_excel_from_url(link) for link in (train_url, test_url))

# Pre-processing of the data
def preprocess_data(df):
    """Return a copy of *df* with every missing value replaced by ``'missing'``.

    The frame passed in is left untouched; previously it was modified in
    place *and* returned, so a caller that kept a reference to the raw data
    silently lost its NaN markers.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with the same columns as *df* and no missing values.
    """
    # NOTE: the same string sentinel is applied to every column, so a numeric
    # column that contains NaN will no longer hold purely numeric values.
    return df.fillna('missing')

# Extract features from the data
def _count_cuisines(value):
    """Count the non-blank comma-separated entries in one CUISINES cell."""
    if not isinstance(value, str):
        # NaN / None / numeric cells carry no cuisine names; calling
        # ``.split`` on them would raise AttributeError.
        return 0
    # Skip blank tokens so stray or trailing commas ("a, , b,") do not
    # inflate the count.
    return sum(1 for part in value.split(',') if part.strip())


def extract_features(df):
    """Return a copy of *df* with engineered feature columns added.

    Currently adds ``NUM_CUISINES``: the number of non-blank, comma-separated
    entries in the ``CUISINES`` column. Non-string cells count as 0. The
    input frame is not modified.

    Args:
        df: DataFrame containing a ``CUISINES`` column.

    Returns:
        A new DataFrame with the additional ``NUM_CUISINES`` column.

    Raises:
        KeyError: If *df* has no ``CUISINES`` column.
    """
    # More sophisticated feature extraction can be added here.
    return df.assign(NUM_CUISINES=df['CUISINES'].apply(_count_cuisines))

# Clean each dataset, then derive the engineered feature columns from it
train = extract_features(preprocess_data(train))
test = extract_features(preprocess_data(test))

# This is for Model Building
def build_model(X, y, n_estimators=100, random_state=42):
    """Fit a random-forest regressor on the given features and target.

    Args:
        X: Feature matrix passed to ``RandomForestRegressor.fit``.
        y: Target values aligned with *X*.
        n_estimators: Number of trees in the forest. Defaults to the value
            that used to be hard-coded.
        random_state: Seed forwarded to the estimator so repeated runs give
            the same forest. Defaults to the value that used to be hard-coded.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    forest.fit(X, y)
    return forest

# Evaluation will be done now
def evaluate_model(model, X, y):
    """Return the root-mean-squared error of *model*'s predictions on X vs. y."""
    predicted = model.predict(X)
    # RMSE is the square root of the mean squared error.
    return sqrt(mean_squared_error(y, predicted))

# Prepare data for modeling.
# The feature list is declared once so the training and test matrices are
# always built from the same columns; extend it as more features are added.
feature_columns = ['NUM_CUISINES']
target_column = 'COST'
X = train[feature_columns]
y = train[target_column]

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split and report the error on the held-out split.
model = build_model(X_train, y_train)
rmse = evaluate_model(model, X_val, y_val)
print(f"Validation RMSE: {rmse}")

# Predict on the test set using the same feature columns as training.
X_test = test[feature_columns]
predictions = model.predict(X_test)
predictions_df = pd.DataFrame({'Predicted_COST': predictions})
print(predictions_df.head())

Validation RMSE: 578.5086505995472
   Predicted_COST
0      872.316418
1      872.316418
2      872.316418
3      525.528778
4      553.376345
