In [36]:
import pandas as pd 
import catboost as cb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv('data/tips.csv')

In [3]:
df.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [4]:
df.tail(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
243,18.78,3.0,Female,No,Thur,Dinner,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null object
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [7]:
df.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


Naive model

In [8]:
df['15%_tip'] = (df['total_bill']*(.15))

In [9]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,15%_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,2.5485
1,10.34,1.66,Male,No,Sun,Dinner,3,1.5510
2,21.01,3.50,Male,No,Sun,Dinner,3,3.1515
3,23.68,3.31,Male,No,Sun,Dinner,2,3.5520
4,24.59,3.61,Female,No,Sun,Dinner,4,3.6885
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,4.3545
240,27.18,2.00,Female,Yes,Sat,Dinner,2,4.0770
241,22.67,2.00,Male,Yes,Sat,Dinner,2,3.4005
242,17.82,1.75,Male,No,Sat,Dinner,2,2.6730


In [10]:
target = 'tip'
y = df[target]
X = df[['15%_tip']]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.4401350211140492

In [13]:
mean_squared_error(y_test, model.predict(X_test))**(1/2)

0.8630517194793621

Basic Model

In [14]:
target = 'tip'
y = df[target]
X = df.drop(target, axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
mapper = DataFrameMapper([
    (['total_bill'], StandardScaler()),
    ('sex', LabelBinarizer()),
    ('smoker', LabelBinarizer()),
    ('day', LabelEncoder()),
    ('time', LabelEncoder()),
    (['size'], StandardScaler())
    ], df_out= True
)

In [17]:
Z_train = mapper.fit_transform(X_train)

In [18]:
Z_test = mapper.transform(X_test)

In [19]:
model = LinearRegression().fit(Z_train, y_train)
model.score(Z_test, y_test)

0.3470695928047467

In [20]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

0.9320272674782091

Ridge

In [21]:
model = Ridge()
params = {
    'alpha': [20, 25, 30],
    'fit_intercept': [True, False],
}

In [22]:
grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1)

In [23]:
grid.fit(Z_train, y_train)
grid.best_score_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   14.2s finished


0.4234584219585732

In [24]:
grid.best_params_

{'alpha': 20, 'fit_intercept': True}

In [25]:
model = grid.best_estimator_
model.score(Z_test, y_test)

0.39681352057252256

In [26]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

0.8958204455010627

CatBoost

In [28]:
df = pd.read_csv('data/tips.csv')

In [29]:
target = 'tip'
y = df[target]
X = df.drop(target, axis=1)

In [30]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
mapper = DataFrameMapper([
    (['total_bill'], StandardScaler()),
    ('sex', LabelBinarizer()),
    ('smoker', LabelBinarizer()),
    ('day', LabelEncoder()),
    ('time', LabelEncoder()),
    (['size'], StandardScaler())
    ], df_out= True
)

In [None]:
Z_train = mapper.fit_transform(X_train)

In [None]:
Z_val = mapper.transform(X_val)

In [None]:
model = cb.CatBoostClassifier(
    iterations=100, 
    early_stopping_rounds=10,
    custom_loss=['AUC', 'Accuracy']
)

model.fit(
    Z_train, 
    y_train,
    eval_set=(Z_val, y_val),
    verbose=False,
    plot=True)