<a href="https://colab.research.google.com/github/assaabriiii/AI/blob/main/tip_prediction_best_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od

In [55]:
!pip install opendatasets



In [56]:
# Fetch the Kaggle "tipping" data set; files land in ./tipping and the
# download is skipped when they are already present.
dataset_url = "https://www.kaggle.com/datasets/jsphyg/tipping"
od.download(dataset_url)

Skipping, found downloaded files in "./tipping" (use force=True to force download)


In [57]:
# Load the downloaded tips table.
tips_csv = "tipping/tips.csv"
df = pd.read_csv(tips_csv)

# Outlier trimming: keep only rows with tip <= 7 and total_bill <= 45.
within_limits = df['tip'].le(7) & df['total_bill'].le(45)
df = df.loc[within_limits]

In [58]:
# Display the filtered frame for a quick visual check.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [59]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so every category is represented by k-1 indicators.
categorical_cols = ['sex', 'smoker', 'time', 'day']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_encoded

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,time_Lunch,day_Sat,day_Sun,day_Thur
0,16.99,1.01,2,False,False,False,False,True,False
1,10.34,1.66,3,True,False,False,False,True,False
2,21.01,3.50,3,True,False,False,False,True,False
3,23.68,3.31,2,True,False,False,False,True,False
4,24.59,3.61,4,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,True,False,False,True,False,False
240,27.18,2.00,2,False,True,False,True,False,False
241,22.67,2.00,2,True,True,False,True,False,False
242,17.82,1.75,2,True,False,False,True,False,False


In [60]:
# Target is the tip amount; the features are every other encoded column.
target_col = 'tip'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

In [61]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [62]:
# Standardise every feature column: learn the statistics, then apply them.
# NOTE(review): the train/test split cell passes the unscaled X, so X_scaled
# is only displayed here and never reaches the models. The scaler is also
# fitted on all rows before the split -- if X_scaled is ever used for
# training, fit the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[-0.26920511, -0.58032022, -1.31743394, ..., -0.73176763,
         1.48869615, -0.59352568],
       [-1.11491954,  0.50720901,  0.75905134, ..., -0.73176763,
         1.48869615, -0.59352568],
       [ 0.2420388 ,  0.50720901,  0.75905134, ..., -0.73176763,
         1.48869615, -0.59352568],
       ...,
       [ 0.45314947, -0.58032022,  0.75905134, ...,  1.36655402,
        -0.67172875, -0.59352568],
       [-0.16364978, -0.58032022,  0.75905134, ...,  1.36655402,
        -0.67172875, -0.59352568],
       [-0.04156168, -0.58032022, -1.31743394, ..., -0.73176763,
        -0.67172875,  1.68484708]])

In [63]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [64]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [65]:
# Random-forest hyperparameters, gathered in one place for easy tuning.
rf_params = dict(
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=1,
    random_state=42,
)
rf_model = RandomForestRegressor(**rf_params)

# Kept as the cell's last expression so the notebook displays its result.
rf_model.fit(x_train, y_train)

In [66]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

# Predict on both partitions with the fitted random forest.
train_pred, test_pred = (rf_model.predict(part) for part in (x_train, x_test))

# Training metrics: MAPE and R2 on the rows the model was fitted on.
train_mape, train_r2 = (
    mean_absolute_percentage_error(y_train, train_pred),
    r2_score(y_train, train_pred),
)

# Validation metrics: the same two scores on the held-out rows.
test_mape, test_r2 = (
    mean_absolute_percentage_error(y_test, test_pred),
    r2_score(y_test, test_pred),
)

# Both scores are reported as percentages (value * 100).
print("--- TRAINING RESULTS ---")
print(f"Training Error (MAPE): {train_mape * 100:.2f}%")
print(f"Training R2 Score: {train_r2 * 100:.2f}%")

print("\n--- VALIDATION (TEST) RESULTS ---")
print(f"Validation Error (MAPE): {test_mape * 100:.2f}%")
print(f"Validation R2 Score: {test_r2 * 100:.2f}%")

--- TRAINING RESULTS ---
Training Error (MAPE): 24.82%
Training R2 Score: 50.57%

--- VALIDATION (TEST) RESULTS ---
Validation Error (MAPE): 23.82%
Validation R2 Score: 55.75%


In [67]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting hyperparameters, gathered in one place for easy tuning.
gb_params = dict(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=2,
    subsample=0.8,
    random_state=42,
)
gb_model = GradientBoostingRegressor(**gb_params)

# Kept as the cell's last expression so the notebook displays its result.
gb_model.fit(x_train, y_train)

In [68]:
from sklearn.metrics import mean_squared_error

# Predict on both partitions with the fitted gradient-boosting model.
train_pred, test_pred = (gb_model.predict(part) for part in (x_train, x_test))

# Training metrics: MAPE and R2 on the rows the model was fitted on.
train_mape, train_r2 = (
    mean_absolute_percentage_error(y_train, train_pred),
    r2_score(y_train, train_pred),
)

# Validation metrics: the same two scores on the held-out rows.
test_mape, test_r2 = (
    mean_absolute_percentage_error(y_test, test_pred),
    r2_score(y_test, test_pred),
)

# Both scores are reported as percentages (value * 100).
print("--- TRAINING RESULTS ---")
print(f"Training Error (MAPE): {train_mape * 100:.2f}%")
print(f"Training R2 Score: {train_r2 * 100:.2f}%")

print("\n--- VALIDATION (TEST) RESULTS ---")
print(f"Validation Error (MAPE): {test_mape * 100:.2f}%")
print(f"Validation R2 Score: {test_r2 * 100:.2f}%")

--- TRAINING RESULTS ---
Training Error (MAPE): 23.47%
Training R2 Score: 57.09%

--- VALIDATION (TEST) RESULTS ---
Validation Error (MAPE): 25.78%
Validation R2 Score: 50.65%


# without deleting the outliers:

## forest:
```
--- TRAINING RESULTS ---
Training Error (MAPE): 24.59%
Training R2 Score: 62.72%

--- VALIDATION (TEST) RESULTS ---
Validation Error (MAPE): 35.74%
Validation R2 Score: 21.52%
```

## gradient boosting:
```
--- TRAINING RESULTS ---
Training Error (MAPE): 22.91%
Training R2 Score: 68.77%

--- VALIDATION (TEST) RESULTS ---
Validation Error (MAPE): 39.33%
Validation R2 Score: 8.75%
```

# deleting outliers:

## forest:
```
--- TRAINING RESULTS ---
Training Error (MAPE): 24.82%
Training R2 Score: 50.57%

--- VALIDATION (TEST) RESULTS ---
Validation Error (MAPE): 23.82%
Validation R2 Score: 55.75%
```

## gradient boosting:

```
--- TRAINING RESULTS ---
Training Error (MAPE): 23.47%
Training R2 Score: 57.09%

--- VALIDATION (TEST) RESULTS ---
Validation Error (MAPE): 25.78%
Validation R2 Score: 50.65%
```