<a href="https://colab.research.google.com/github/akatjizeu/Data-Science/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the data
train_data = pd.read_csv('Train_dataset.csv')
test_data = pd.read_csv('Test_dataset.csv')



In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3493 entries, 0 to 3492
Data columns (total 34 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Registration Number            3493 non-null   int64  
 1   Annual Turnover                3493 non-null   int64  
 2   Cuisine                        3493 non-null   object 
 3   City                           3493 non-null   object 
 4   Restaurant Location            3493 non-null   object 
 5   Opening Day of Restaurant      3493 non-null   object 
 6   Facebook Popularity Quotient   3394 non-null   float64
 7   Endorsed By                    3493 non-null   object 
 8   Instagram Popularity Quotient  3437 non-null   float64
 9   Fire Audit                     3493 non-null   int64  
 10  Liquor License Obtained        3493 non-null   int64  
 11  Situated in a Multi Complex    3493 non-null   int64  
 12  Dedicated Parking              3493 non-null   i

In [6]:
# Separate the target variable
X_train = train_data.drop('Annual Turnover', axis=1)  # Assuming 'Turnover' is the target variable
y_train = train_data['Annual Turnover']
X_test = test_data.copy()

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for numerical data: Imputation and scaling
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())                  # Feature scaling
])

# Preprocessing for categorical data: Imputation and one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encoding
])

# Bundle preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [9]:
# Lasso Regression pipeline
lasso_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('lasso', Lasso())])

# Hyperparameter tuning for Lasso
lasso_param_grid = {
    'lasso__alpha': [0.01, 0.1, 1, 10, 100]  # L1 penalty values
}

# Use cross-validation to find the best hyperparameters
lasso_search = GridSearchCV(lasso_pipeline, lasso_param_grid, cv=5, scoring='neg_root_mean_squared_error')
lasso_search.fit(X_train, y_train)

# Best parameters and RMSE for Lasso
best_lasso_params = lasso_search.best_params_
lasso_rmse = -lasso_search.best_score_

print("Best Lasso Parameters:", best_lasso_params)
print("Best Lasso RMSE:", lasso_rmse)


Best Lasso Parameters: {'lasso__alpha': 100}
Best Lasso RMSE: 24285159.328400295


In [None]:
# Random Forest pipeline
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('rf', RandomForestRegressor(random_state=42))])

# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'rf__n_estimators': [50,100, 200, 300],
    'rf__max_depth': [5,10, 20, 30],
    'rf__min_samples_split': [2, 5, 10,15],
    'rf__min_samples_leaf': [1, 2, 4, 8]
}

# Use cross-validation to find the best hyperparameters
rf_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='neg_root_mean_squared_error')
rf_search.fit(X_train, y_train)

# Best parameters and RMSE for Random Forest
best_rf_params = rf_search.best_params_
rf_rmse = -rf_search.best_score_

print("Best Random Forest Parameters:", best_rf_params)
print("Best Random Forest RMSE:", rf_rmse)
