In [12]:
import pandas as pd

# Raw-download links for the Restaurant Food Cost workbooks hosted on GitHub
train_url = "https://github.com/FlipRoboTechnologies/ML-Datasets/blob/main/Restaurant%20Food%20Cost/Data_Train.xlsx?raw=true"
test_url = "https://github.com/FlipRoboTechnologies/ML-Datasets/blob/main/Restaurant%20Food%20Cost/Data_Test.xlsx?raw=true"

# Read each workbook into its own DataFrame
train_data = pd.read_excel(train_url)
test_data = pd.read_excel(test_url)

# Preview the top rows of both frames, training split first
for heading, frame in (("Training Data:", train_data), ("\nTest Data:", test_data)):
    print(heading)
    print(frame.head())


Training Data:
               TITLE  RESTAURANT_ID  \
0      CASUAL DINING           9438   
1  CASUAL DINING,BAR          13198   
2      CASUAL DINING          10915   
3        QUICK BITES           6346   
4     DESSERT PARLOR          15387   

                                     CUISINES  \
0                 Malwani, Goan, North Indian   
1              Asian, Modern Indian, Japanese   
2  North Indian, Chinese, Biryani, Hyderabadi   
3                            Tibetan, Chinese   
4                                    Desserts   

                                     TIME     CITY        LOCALITY RATING  \
0  11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)    Thane  Dombivali East    3.6   
1                    6pm – 11pm (Mon-Sun)  Chennai       Ramapuram    4.2   
2     11am – 3:30pm, 7pm – 11pm (Mon-Sun)  Chennai      Saligramam    3.8   
3                 11:30am – 1am (Mon-Sun)   Mumbai     Bandra West    4.1   
4                    11am – 1am (Mon-Sun)   Mumbai     Lower Parel    

In [13]:
# Per-column null counts, reported for the training split and then the test split
for heading, frame in (
    ("\nMissing values in training data:", train_data),
    ("\nMissing values in test data:", test_data),
):
    print(heading)
    print(frame.isnull().sum())

# Summary statistics for the numeric training columns
print("\nBasic statistics of numerical features:")
print(train_data.describe())

# Cardinality of every object-typed training column
print("\nUnique values of categorical features:")
object_columns = train_data.select_dtypes(include=['object'])
for column, n_unique in object_columns.nunique().items():
    print(column, ":", n_unique)



Missing values in training data:
TITLE            1003
RESTAURANT_ID       0
CUISINES            0
TIME                0
CITY              112
LOCALITY           98
RATING              2
VOTES            1204
COST                0
dtype: int64

Missing values in test data:
TITLE            284
RESTAURANT_ID      0
CUISINES           0
TIME               0
CITY              35
LOCALITY          30
RATING             2
VOTES            402
dtype: int64

Basic statistics of numerical features:
       RESTAURANT_ID          COST
count   12690.000000  12690.000000
mean     7759.134121    655.252246
std      4504.874150    627.003540
min         0.000000     20.000000
25%      3863.250000    300.000000
50%      7740.000000    500.000000
75%     11688.750000    800.000000
max     15573.000000  14000.000000

Unique values of categorical features:
TITLE : 112
CUISINES : 4155
TIME : 2689
CITY : 359
LOCALITY : 1416
RATING : 32
VOTES : 1847


In [14]:
def _clean_rating_and_votes(frame):
    """Return a copy of ``frame`` with RATING gaps filled and VOTES made numeric.

    Missing RATING entries take the most frequent RATING value of ``frame``.
    Missing VOTES entries are treated as '0 votes'; the first run of digits in
    each VOTES entry is then extracted and converted to a numeric dtype.
    """
    cleaned = frame.copy()
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # a column selection: that form is chained assignment, which warns on
    # recent pandas and does not update the frame under Copy-on-Write.
    cleaned['RATING'] = cleaned['RATING'].fillna(cleaned['RATING'].mode()[0])
    # Raw string so that \d reaches the regex engine as written.
    vote_digits = cleaned['VOTES'].fillna('0 votes').str.extract(r'(\d+)', expand=False)
    cleaned['VOTES'] = pd.to_numeric(vote_digits)
    return cleaned


def _one_hot_with_train_categories(train_frame, test_frame):
    """One-hot encode both frames using the category levels of ``train_frame``.

    Every object-typed column of ``train_frame`` is converted to a categorical,
    and the matching column of ``test_frame`` is given the same levels. Both
    frames therefore produce the same dummy columns and ``drop_first=True``
    drops the same baseline level in each. Test values that never occur in the
    training column become missing and encode as all zeros.

    Returns the encoded ``(train, test)`` pair; the inputs are not modified.
    """
    train_frame = train_frame.copy()
    test_frame = test_frame.copy()
    for column in train_frame.select_dtypes(include=['object']).columns:
        train_frame[column] = train_frame[column].astype('category')
        if column in test_frame.columns:
            test_frame[column] = pd.Categorical(
                test_frame[column],
                categories=train_frame[column].cat.categories,
            )
    return (
        pd.get_dummies(train_frame, drop_first=True),
        pd.get_dummies(test_frame, drop_first=True),
    )


# Fill missing values in 'RATING' and 'VOTES', then convert 'VOTES' to numeric
train_data = _clean_rating_and_votes(train_data)
test_data = _clean_rating_and_votes(test_data)

# Encode categorical variables using one-hot encoding with a shared set of levels.
# NOTE: train_data / test_data are rebound to the encoded frames here, so this
# cell must be preceded by a fresh load of the raw data if it is run again.
train_data, test_data = _one_hot_with_train_categories(train_data, test_data)

# Separate features and target variable in the training data
X_train = train_data.drop('COST', axis=1)
y_train = train_data['COST']

# Separate features in the test data, in exactly the column order the model is
# fitted on; a KeyError here means the test frame lacks a training feature.
test_data = test_data[X_train.columns]
X_test = test_data


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor with a fixed seed: the forest draws
# bootstrap samples and feature subsets at random, so without random_state the
# fitted model (and every prediction) changes from one run to the next.
rf_regressor = RandomForestRegressor(random_state=42)

# Train the model on the encoded training features and the COST target
rf_regressor.fit(X_train, y_train)


In [None]:
# Score the test features with the fitted forest
predictions = rf_regressor.predict(X_test)

# Collect the predicted costs in a single-column frame named COST
output_df = pd.Series(predictions, name='COST').to_frame()

# Write the frame to disk without the row index, then confirm on stdout
output_df.to_csv("restaurant_food_cost_predictions.csv", index=False)
print("Predictions saved to restaurant_food_cost_predictions.csv")
