In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sklearn

In [3]:
import statsmodels

In [4]:
from sklearn import linear_model

In [5]:
df_tips = sns.load_dataset('tips') ## Load the 'tips' example dataset through seaborn

## Quick EDA

In [6]:
# Preview the first five rows to get a feel for the columns and their values.
df_tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# Count missing entries per column; every count in the output is 0,
# so no imputation or row filtering is needed.
df_tips.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

## Feature Engineering

In [8]:
from sklearn.preprocessing import PolynomialFeatures

In [9]:
# Inspect column dtypes to see which columns are numeric and which are categorical.
df_tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [10]:
# Keep only the non-categorical columns (total_bill, tip, size); the categorical
# ones (sex, smoker, day, time) are not used by the model below.
# Selecting by dtype and rebinding, instead of `drop(..., inplace=True)`, makes
# this cell safe to re-run: the in-place drop raised KeyError on a second
# execution because the named columns had already been removed.
df_tips = df_tips.select_dtypes(exclude='category')

In [11]:
# Separate the target ('tip') from the predictors (every remaining column).
y = df_tips.loc[:, 'tip']
X = df_tips.drop(columns=['tip'])

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=42,
)

In [14]:
# Feature expander: interaction terms only, no constant (bias) column.
# NOTE(review): X has only two columns here, so with interaction_only=True the
# expansion is just the two originals plus their product (see the 3-column
# preview below) - no squared or cubic terms are produced and degree=3 adds
# nothing over degree=2. Confirm this is what is intended for a "polynomial" model.
Poli = PolynomialFeatures(
    degree=3,
    interaction_only=True,
    include_bias=False,
)

In [16]:
# Fit the feature expander on the training split only, so nothing about the
# test rows is used when setting it up; Poli is rebound to the value fit returns.
Poli = Poli.fit(X_train)

In [17]:
# Apply the already-fitted expansion to both splits in one pass.
X_trainPoli, X_testPoli = (Poli.transform(split) for split in (X_train, X_test))

In [18]:
# Wrap the expanded training features in a DataFrame (default integer column
# labels) and preview the first five rows.
df_XtrainPoli = pd.DataFrame(data=X_trainPoli)
df_XtrainPoli.head(n=5)

Unnamed: 0,0,1,2
0,13.28,2.0,26.56
1,24.27,2.0,48.54
2,27.28,2.0,54.56
3,31.71,4.0,126.84
4,15.98,2.0,31.96


In [19]:
# Wrap the expanded test features in a DataFrame (default integer column
# labels) and preview the first five rows.
df_XtestPoli = pd.DataFrame(data=X_testPoli)
df_XtestPoli.head(n=5)

Unnamed: 0,0,1,2
0,19.82,2.0,39.64
1,8.77,2.0,17.54
2,24.55,4.0,98.2
3,25.89,4.0,103.56
4,13.0,2.0,26.0


## Model Training Polynomial Regression

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
# Linear regression with default settings; it is trained on the expanded
# (interaction) features in the cells that follow.
Model_PF = LinearRegression()

In [23]:
# Train the regression on the expanded training features against the tip amounts.
Model_PF.fit(X=df_XtrainPoli, y=y_train)

In [24]:
# Predict tips for both splits so train and test metrics can be compared.
PF_train = Model_PF.predict(X=df_XtrainPoli)
PF_test = Model_PF.predict(X=df_XtestPoli)

## Evaluation Metrics

### Training

In [27]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, median_absolute_error

In [28]:
# R2 of the polynomial-features model's predictions on the training split.
r2_PF = r2_score(y_true=y_train, y_pred=PF_train)
r2_PF

0.45787215528923264

In [29]:
# Mean absolute error on the training split.
MAE_PF = mean_absolute_error(y_true=y_train, y_pred=PF_train)
MAE_PF

0.7642752099601574

In [30]:
# Mean squared error on the training split.
MSE_PF = mean_squared_error(y_true=y_train, y_pred=PF_train)
MSE_PF

1.1014451782959163

In [31]:
# Root mean squared error on the training split: the square root of the
# training MSE.
RMSE_PF = np.sqrt(MSE_PF)
RMSE_PF

1.0494975837494416

### Testing

In [33]:
# R2 of the model's predictions on the held-out test split.
r2_PF_test = r2_score(y_true=y_test, y_pred=PF_test)
r2_PF_test

0.3910987381318939

In [34]:
# Mean absolute error on the held-out test split.
MAE_PF_test = mean_absolute_error(y_true=y_test, y_pred=PF_test)
MAE_PF_test

0.7141891098244093

In [35]:
# Mean squared error on the held-out test split.
MSE_PF_test = mean_squared_error(y_true=y_test, y_pred=PF_test)
MSE_PF_test

0.761109129435584

In [36]:
# Root mean squared error on the test split: the square root of the test MSE.
RMSE_PF_test = np.sqrt(MSE_PF_test)
RMSE_PF_test

0.8724156861471394

In [37]:
# Collect the train and test metrics side by side, one row per metric, so the
# two splits can be compared at a glance.
eva_data = dict([
    ('Polynomial Training', [r2_PF, MAE_PF, MSE_PF, RMSE_PF]),
    ('Polynomial Testing', [r2_PF_test, MAE_PF_test, MSE_PF_test, RMSE_PF_test]),
])
pd.DataFrame(data=eva_data, index=['R2', 'MAE', 'MSE', 'RMSE'])

Unnamed: 0,Polynomial Training,Polynomial Testing
R2,0.457872,0.391099
MAE,0.764275,0.714189
MSE,1.101445,0.761109
RMSE,1.049498,0.872416
