In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [18]:
df = sns.load_dataset('tips')

In [19]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [20]:

categorical_features = ['sex', 'smoker', 'day', 'time']
numerical_features = ['total_bill', 'size']


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipe = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, numerical_features),
    ('cat', cat_pipe, categorical_features)
]
)

In [22]:
X = df[categorical_features + numerical_features]
y = df['tip']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.linear_model import LinearRegression

reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

reg_pipeline.fit(X_train, y_train)

y_pred = reg_pipeline.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score
print("\nRegression Results:")
print(f"RMSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R² Score: {r2_score(y_test, y_pred):.2f}")


Regression Results:
RMSE: 0.70
R² Score: 0.44


## Classification: Predict smoker

In [31]:
categorical_features = ['sex', 'day', 'time']
numerical_features = ['total_bill', 'size']

X_clf = df[categorical_features + numerical_features]
y_clf = df['smoker'].map({"Yes":1,"No":0})


X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)


num_pipe = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, numerical_features),
    ('cat', cat_pipe, categorical_features)
]
)


from sklearn.linear_model import LogisticRegression
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

clf_pipeline.fit(X_train, y_train)

y_pred_clf = clf_pipeline.predict(X_test)


from sklearn.metrics import accuracy_score,classification_report
print("\nClassification Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_clf):.2f}")
print(classification_report(y_test, y_pred_clf))


Classification Results:
Accuracy: 0.71
              precision    recall  f1-score   support

           0       0.70      0.97      0.81        31
           1       0.83      0.28      0.42        18

    accuracy                           0.71        49
   macro avg       0.77      0.62      0.61        49
weighted avg       0.75      0.71      0.67        49



In [32]:
y_clf.value_counts()

smoker
0    151
1     93
Name: count, dtype: int64

In [35]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [36]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_train_transformed, y_train)

y_sm.value_counts()

smoker
1    120
0    120
Name: count, dtype: int64

In [37]:

clf_pipeline.fit(X_sm, y_sm)

y_pred_clf = clf_pipeline.predict(X_test)


from sklearn.metrics import accuracy_score,classification_report
print("\nClassification Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_clf):.2f}")
print(classification_report(y_test, y_pred_clf))

ValueError: Specifying the columns using strings is only supported for dataframes.