### ***Importing Libraries***

In [2]:

import pandas as pd
import numpy as np
from scipy import stats
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, median_absolute_error, mean_absolute_percentage_error


### ***Reading Dataset***

In [3]:
# Load the tips dataset from a project-relative CSV and preview the first rows.
DATA_PATH = 'dataset/tips.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### ***Pre-processing***

In [155]:
# Count missing values per column.
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [156]:
# Dimensions of the frame as (rows, columns).
df.shape

(244, 7)

In [4]:
# Convert the categorical columns to integer codes.
# The same encoder object is refit on each column in turn, so after this cell
# it only holds the classes of the last column listed in `cols`.
encoder = LabelEncoder()
cols = ['sex', 'smoker', 'day', 'time']
df[cols] = df[cols].apply(encoder.fit_transform)

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [5]:
# Outlier treatment via z-score comparison.
# Only the genuinely numeric columns are scored: the z-score of a label-encoded
# category code (sex, smoker, day, time) has no meaningful magnitude, so those
# columns must not decide which rows are dropped.
# NOTE(review): the filter also uses the target `tip` and runs before the
# train/test split -- confirm that this is intended.
Z_THRESHOLD = 3
numeric_cols = ['total_bill', 'tip', 'size']
z = np.abs(stats.zscore(df[numeric_cols]))
# Keep a row only if every scored column lies within the threshold.
df = df[(z < Z_THRESHOLD).all(axis=1)]
df.shape

(236, 7)

In [6]:
# Split the frame into the feature matrix and the target.
# The target is selected with a list, so `y` is a single-column DataFrame.
feature_cols = ['total_bill', 'sex', 'smoker', 'day', 'time', 'size']
target_cols = ['tip']
x = df[feature_cols]
y = df[target_cols]
x

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,0,0,2,0,2
1,10.34,1,0,2,0,3
2,21.01,1,0,2,0,3
3,23.68,1,0,2,0,2
4,24.59,0,0,2,0,4
...,...,...,...,...,...,...
239,29.03,1,0,1,0,3
240,27.18,0,1,1,0,2
241,22.67,1,1,1,0,2
242,17.82,1,0,1,0,2


In [31]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=69,
)

### ***Machine Learning***

In [33]:
# Candidate regressors, all fitted and scored on the same train/test split.
# NOTE(review): SVR is fitted on unscaled features here; StandardScaler is
# imported at the top of the notebook but never applied -- confirm whether
# scaling was intended.
models = {
    'RFR': RandomForestRegressor(n_estimators = 12, random_state = 12),
    'Linear': LinearRegression(),
    'SVR': SVR()
}

# The target arrives as a single-column frame. scikit-learn estimators expect a
# 1-D target and otherwise emit a DataConversionWarning, which the previous
# version of this cell hid by silencing *all* warnings globally. Flattening the
# target removes the cause, so no blanket warning filter is needed.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

results = []

for name, model in models.items():
    model.fit(x_train, y_train_1d)
    model_test_pred = model.predict(x_test)

    # Compute each raw metric once; MSE feeds RMSE and MAPE feeds the hit rate.
    mse_raw = mean_squared_error(y_test_1d, model_test_pred)
    mape_raw = mean_absolute_percentage_error(y_test_1d, model_test_pred)

    mae = np.round(mean_absolute_error(y_test_1d, model_test_pred), 2)
    medae = np.round(median_absolute_error(y_test_1d, model_test_pred), 2)
    mse = np.round(mse_raw, 2)
    rmse = np.round(np.sqrt(mse_raw), 2)
    mape = np.round(mape_raw, 2)
    r2 = np.round(r2_score(y_test_1d, model_test_pred), 2)
    # "hit" is the complement of the unrounded MAPE, reported as a percentage.
    hit = 1 - mape_raw

    results.append([name, mae, medae, mse, rmse, mape, r2, f'{hit*100:.2f}%'])

res = pd.DataFrame(results, columns = ['Model', 'MAE', 'MedAE', 'MSE', 'RMSE', 'MAPE', 'R2_score', 'hit'])
res

Unnamed: 0,Model,MAE,MedAE,MSE,RMSE,MAPE,R2_score,hit
0,RFR,0.54,0.39,0.6,0.78,0.18,0.37,81.60%
1,Linear,0.61,0.47,0.66,0.81,0.2,0.31,79.59%
2,SVR,0.55,0.44,0.54,0.73,0.18,0.44,82.14%


*Overall, the Support Vector Regressor (SVR) achieved the best MSE, RMSE, R² and hit rate of the three models, while the Random Forest Regressor was slightly better on MAE and MedAE.*