In [1]:
# Import required libraries/packages
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Import warning to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load tips dataset
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
# Define features and target
X = df.drop('tip',axis=1)
y = df['tip']
X.shape, y.shape

((244, 6), (244,))

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Identify categorical and numerical features
cat_features = X.select_dtypes(include='category').columns.tolist()
num_features = X.select_dtypes(exclude='category').columns.tolist()
cat_features,num_features

(['sex', 'smoker', 'day', 'time'], ['total_bill', 'size'])

In [8]:
# Preprocessing for numerical data
numerical_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [9]:
numerical_processor

In [10]:
# Preprocessing for categorical data
categorical_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
categorical_processor

In [12]:
# Combine preprocessing
preprocessor = ColumnTransformer([
    ('num', numerical_processor, num_features),
    ('cat', categorical_processor, cat_features)
])

In [13]:
preprocessor

In [14]:
# Define the pipeline with RandomForestRegressor
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [15]:
pipe

In [16]:
# Fit the model
pipe.fit(X_train, y_train)

In [17]:
# Predict and evaluate
y_pred = pipe.predict(X_test)
performance_before = {
    'MAE': mean_absolute_error(y_test, y_pred),
    'MSE': mean_squared_error(y_test, y_pred),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
    'R2': r2_score(y_test, y_pred)
}

errors = abs(y_pred - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape

print("Accuracy:",accuracy)


Accuracy: 68.17979665958615


## Hyperparameter Tuning

In [19]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# # Number of features to consider at every split
# max_features = ['auto', 'sqrt','log2']

# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(4, 110, num = 11)]
# max_depth.append(None)

# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]

# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]

# # Method of selecting samples for training each tree
# bootstrap = [True, False]


# param_dist = {
#     'regressor__n_estimators':n_estimators,
#     'regressor__max_features':max_features,
#     'regressor__max_depth':max_depth,
#     'regressor__min_samples_leaf':min_samples_leaf,
#     'regressor__min_samples_split':min_samples_split,
#     'regressor__bootstrap': bootstrap
# }
# param_dist

# Set up hyperparameter search space
param_dist = {
    'regressor__n_estimators': [100, 200, 300],  # Narrower range
    'regressor__max_features': ['auto', 'sqrt'],  # Fewer options
    'regressor__max_depth': [None, 10, 20],  # Simpler choices
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
    'regressor__bootstrap': [True]
}
param_dist

{'regressor__n_estimators': [100, 200, 300],
 'regressor__max_features': ['auto', 'sqrt'],
 'regressor__max_depth': [None, 10, 20],
 'regressor__min_samples_split': [2, 5],
 'regressor__min_samples_leaf': [1, 2],
 'regressor__bootstrap': [True]}

In [20]:
# RandomizedSearchCV with cross-validation and simpler parameters
random_search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, verbose=1, random_state=42)

In [21]:
# Fit the optimized pipeline
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [22]:
# Predict and evaluate
y_pred_opt = random_search.predict(X_test)
performance_after = {
    'MAE': mean_absolute_error(y_test, y_pred_opt),
    'MSE': mean_squared_error(y_test, y_pred_opt),
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_opt)),
    'R2': r2_score(y_test, y_pred_opt)
}

errors = abs(y_pred_opt - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape

print("Accuracy:",accuracy)
print("Best parameters found: ", random_search.best_params_)
print("Performance after optimization:", performance_after)


Accuracy: 68.48345597448491
Best parameters found:  {'regressor__n_estimators': 200, 'regressor__min_samples_split': 5, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'auto', 'regressor__max_depth': 10, 'regressor__bootstrap': True}
Performance after optimization: {'MAE': 0.7686839890329623, 'MSE': 1.0039485016716314, 'RMSE': 1.0019723058406511, 'R2': 0.19682278680354692}


In [23]:
# Make a final pipeline
final_pipe = Pipeline(
    steps=[('Preprocessor',preprocessor),
          ('regressor',RandomForestRegressor(max_depth=10, max_features='auto', min_samples_split=5, min_samples_leaf=1, n_estimators=200, bootstrap=True, random_state=42))]
)



In [24]:
final_pipe.fit(X_train,y_train)

In [25]:
# Predict and print
y_pred = pipe.predict(X_test)
y_pred

array([2.8276, 2.418 , 3.8549, 3.3711, 1.9156, 3.3462, 4.3726, 1.818 ,
       2.3221, 2.734 , 3.3567, 2.059 , 1.8892, 2.5235, 1.7063, 2.9127,
       3.2356, 3.9649, 2.5726, 6.4015, 3.4574, 3.3501, 2.4659, 1.872 ,
       3.8822, 2.2887, 2.2218, 3.2396, 2.7256, 7.4529, 5.3994, 2.2925,
       2.4718, 3.3374, 2.0047, 3.8222, 2.021 , 4.2526, 1.9509, 3.4453,
       2.0512, 2.2332, 3.5632, 2.0762, 1.9577, 1.6346, 2.0092, 3.0746,
       2.1973])