<a href="https://colab.research.google.com/github/atharva14-svg/AquaWatch-Insights/blob/main/Restaurant_Rating_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
print("Importing the dataset from 'Dataset .csv'.")
try:
    df = pd.read_csv("Dataset .csv")
except FileNotFoundError:
    print("Error: 'Dataset .csv' not found. Using simulated data for demonstration.")
    # Small stand-in dataset, written one restaurant per row so each record
    # can be read at a glance; it is transposed into column lists below.
    sample_columns = (
        'Cuisines',
        'Average Cost for two',
        'Has Online delivery',
        'Has Table booking',
        'Votes',
        'Aggregate rating',
    )
    sample_rows = [
        ('Italian', 500, 'Yes', 'Yes', 150, 4.5),
        ('Mexican', 300, 'No', 'No', 80, 3.2),
        ('Indian', 800, 'Yes', 'No', 250, 4.8),
        ('Italian', 600, 'Yes', 'Yes', 200, 4.2),
        ('Mexican', 450, 'Yes', 'No', 120, 3.5),
        ('Indian', 900, 'No', 'Yes', 300, 4.9),
        ('Italian', 700, 'Yes', 'Yes', 180, 4.6),
        ('Mexican', 350, 'No', 'No', 90, 3.4),
        ('Indian', 750, 'Yes', 'Yes', 270, 4.7),
    ]
    # Column name -> list of values, the layout pd.DataFrame consumes directly.
    data = {
        name: list(values)
        for name, values in zip(sample_columns, zip(*sample_rows))
    }
    df = pd.DataFrame(data)

In [None]:
print("\nPreprocessing the dataset.")
# Numerical predictors are used as-is; the categorical ones are one-hot
# encoded further down and appended to the feature matrix.
features = ['Average Cost for two', 'Votes']
target = 'Aggregate rating'
categorical_features = ['Cuisines', 'Has Online delivery', 'Has Table booking']

# Handle missing values before one-hot encoding.
# Only the columns actually present are inspected, so one absent optional
# column no longer disables imputation for all the others.
cols_to_check_for_nulls = ['Average Cost for two', 'Votes', 'Has Online delivery', 'Has Table booking', 'Cuisines', 'Aggregate rating']
cols_present = [col for col in cols_to_check_for_nulls if col in df.columns]
if cols_present and df[cols_present].isnull().any().any():
    print("Warning: Missing values detected. Handling them now.")
    # Categorical columns get a fixed placeholder; every other checked column
    # (the numerical features and the target) is filled with its column mean.
    # NOTE(review): mean-imputing the target is kept from the original
    # behaviour; dropping rows without a rating may be preferable.
    categorical_fill = {
        'Has Online delivery': 'No',
        'Has Table booking': 'No',
        'Cuisines': 'Unknown',
    }
    for col in cols_present:
        fill_value = categorical_fill[col] if col in categorical_fill else df[col].mean()
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated in pandas 2.x and leaves
        # `df` unchanged under Copy-on-Write.
        df[col] = df[col].fillna(fill_value)

# One-hot encode whichever categorical columns are present; drop_first removes
# one level per column so the dummies are not perfectly collinear.
categorical_features_existing = [col for col in categorical_features if col in df.columns]
if categorical_features_existing:
    df = pd.get_dummies(df, columns=categorical_features_existing, drop_first=True)

# Select features (X) and target (y) after one-hot encoding: the original
# numerical features plus every dummy column produced above.
dummy_prefixes = tuple(f"{col}_" for col in categorical_features)
dummy_columns = [col for col in df.columns if col.startswith(dummy_prefixes)]
X = df[features + dummy_columns]
y = df[target]

# Ensure X and y have the same number of rows
if X.shape[0] != y.shape[0]:
    print("Error: Mismatch in number of rows between features (X) and target (y).")
else:
    print(f"Shape of features (X): {X.shape}")
    print(f"Shape of target (y): {y.shape}")

In [None]:
print("\nSplitting the data.")
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"Shape of training data: {X_train.shape}")
print(f"Shape of testing data: {X_test.shape}")

In [None]:
print("\nTraining the machine learning model.")
# Fit a decision-tree regressor on the training split; the seed is fixed so
# repeated runs build the same tree.
model = DecisionTreeRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
print("\nEvaluating the model's performance.")
# Predict on the held-out rows, then compute both metrics against the same
# (y_test, y_pred) pair.
y_pred = model.predict(X_test)
mse, r2 = (score(y_test, y_pred) for score in (mean_squared_error, r2_score))

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

In [None]:
print("\nAnalyzing the most influential features.")
# Label each importance score with its feature column, then rank them from
# most to least influential.
feature_importances = pd.Series(data=model.feature_importances_, index=X.columns)
most_important_features = feature_importances.sort_values(ascending=False)

print(
    "Top influential features affecting restaurant ratings:",
    most_important_features,
    sep="\n",
)

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere - confirm before running
# locally. Nothing in the cells visible here reads from the mounted path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the README found in the runtime's sample_data directory, or report
# that it is missing.
try:
    # Explicit encoding so decoding does not depend on the platform's locale.
    with open('/content/sample_data/README.md', 'r', encoding='utf-8') as f:
        readme_content = f.read()
except FileNotFoundError:
    print("Error: README.md not found in /content/sample_data/")
else:
    # Only reached when the file was read successfully.
    print(readme_content)

This directory includes a few sample datasets to get you started.

*   `california_housing_data*.csv` is California housing data from the 1990 US
    Census; more information is available at:
    https://docs.google.com/document/d/e/2PACX-1vRhYtsvc5eOR2FWNCwaBiKL6suIOrxJig8LcSBbmCbyYsayia_DvPOOBlXZ4CAlQ5nlDD8kTaIDRwrN/pub

*   `mnist_*.csv` is a small sample of the
    [MNIST database](https://en.wikipedia.org/wiki/MNIST_database), which is
    described at: http://yann.lecun.com/exdb/mnist/

*   `anscombe.json` contains a copy of
    [Anscombe's quartet](https://en.wikipedia.org/wiki/Anscombe%27s_quartet); it
    was originally described in

    Anscombe, F. J. (1973). 'Graphs in Statistical Analysis'. American
    Statistician. 27 (1): 17-21. JSTOR 2682899.

    and our copy was prepared by the
    [vega_datasets library](https://github.com/altair-viz/vega_datasets/blob/4f67bdaad10f45e3549984e17e1b3088c731503d/vega_datasets/_data/anscombe.json).

