In [1]:
import pandas as pd 
import catboost as cb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv('data/tips.csv')

In [3]:
df.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


In [4]:
df.tail(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
243,18.78,3.0,Female,No,Thur,Dinner,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null object
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [7]:
df.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


Naive model

In [8]:
df['15%_tip'] = (df['total_bill']*(.15))

In [9]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,15%_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,2.5485
1,10.34,1.66,Male,No,Sun,Dinner,3,1.5510
2,21.01,3.50,Male,No,Sun,Dinner,3,3.1515
3,23.68,3.31,Male,No,Sun,Dinner,2,3.5520
4,24.59,3.61,Female,No,Sun,Dinner,4,3.6885
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,4.3545
240,27.18,2.00,Female,Yes,Sat,Dinner,2,4.0770
241,22.67,2.00,Male,Yes,Sat,Dinner,2,3.4005
242,17.82,1.75,Male,No,Sat,Dinner,2,2.6730


In [10]:
target = 'tip'
y = df[target]
X = df[['15%_tip']]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
model = LinearRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.4401350211140492

In [13]:
mean_squared_error(y_test, model.predict(X_test))**(1/2)

0.8630517194793621

Basic Model

In [8]:
target = 'tip'
y = df[target]
X = df.drop(target, axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
mapper = DataFrameMapper([
    (['total_bill'], StandardScaler()),
    ('sex', LabelBinarizer()),
    ('smoker', LabelBinarizer()),
    ('day', LabelEncoder()),
    ('time', LabelEncoder()),
    (['size'], StandardScaler())
    ], df_out= True
)

In [11]:
Z_train = mapper.fit_transform(X_train)

In [12]:
Z_test = mapper.transform(X_test)

In [15]:
viewing = mapper

In [None]:
viewing

In [19]:
model = LinearRegression().fit(Z_train, y_train)
model.score(Z_test, y_test)

0.3470695928047467

In [20]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

0.9320272674782091

Ridge

In [21]:
model = Ridge()
params = {
    'alpha': [20, 25, 30],
    'fit_intercept': [True, False],
}

In [22]:
grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1)

In [23]:
grid.fit(Z_train, y_train)
grid.best_score_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    4.5s finished


0.4234584219585732

In [24]:
grid.best_params_

{'alpha': 20, 'fit_intercept': True}

In [25]:
model = grid.best_estimator_
model.score(Z_test, y_test)

0.39681352057252256

In [26]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

0.8958204455010627

Lasso

In [27]:
model = Lasso()
params = {
    'alpha': [.01, .1, 1],
    'fit_intercept': [True, False],
}

In [28]:
grid = GridSearchCV(model, params, cv=3, n_jobs=-1, verbose=1)

In [29]:
grid.fit(Z_train, y_train)
grid.best_score_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished


Fitting 3 folds for each of 6 candidates, totalling 18 fits


0.42258799195737845

In [30]:
grid.best_params_

{'alpha': 0.1, 'fit_intercept': True}

In [31]:
model = grid.best_estimator_
model.score(Z_test, y_test)

0.4473621862600412

In [33]:
mean_squared_error(y_test, model.predict(Z_test))**(1/2)

0.8574631598621847

CatBoost

In [53]:
df = pd.read_csv('data/tips.csv')

In [54]:
target = 'tip'
y = df[target]
X = df.drop(target, axis=1)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [56]:
mapper = DataFrameMapper([
    (['total_bill'], StandardScaler()),
    ('sex', LabelBinarizer()),
    ('smoker', LabelBinarizer()),
    ('day', LabelEncoder()),
    ('time', LabelEncoder()),
    (['size'], StandardScaler())
    ], df_out= True
)

In [57]:
Z_train = mapper.fit_transform(X_train)

In [58]:
Z_test = mapper.transform(X_test)

In [59]:
model = cb.CatBoostRegressor(
    iterations=100, 
    early_stopping_rounds=10,
)

model.fit(Z_train, y_train)

0:	learn: 1.4230518	total: 66.9ms	remaining: 6.63s
1:	learn: 1.4095445	total: 68.2ms	remaining: 3.34s
2:	learn: 1.3960678	total: 69ms	remaining: 2.23s
3:	learn: 1.3813156	total: 69.7ms	remaining: 1.67s
4:	learn: 1.3698852	total: 70.7ms	remaining: 1.34s
5:	learn: 1.3561161	total: 71.7ms	remaining: 1.12s
6:	learn: 1.3414718	total: 72.6ms	remaining: 964ms
7:	learn: 1.3290094	total: 73.5ms	remaining: 845ms
8:	learn: 1.3195516	total: 74.1ms	remaining: 749ms
9:	learn: 1.3098737	total: 74.8ms	remaining: 673ms
10:	learn: 1.2976481	total: 75.7ms	remaining: 613ms
11:	learn: 1.2858576	total: 76.6ms	remaining: 562ms
12:	learn: 1.2731927	total: 77.5ms	remaining: 518ms
13:	learn: 1.2632882	total: 78.3ms	remaining: 481ms
14:	learn: 1.2541331	total: 79.1ms	remaining: 448ms
15:	learn: 1.2434327	total: 79.9ms	remaining: 419ms
16:	learn: 1.2343088	total: 80.5ms	remaining: 393ms
17:	learn: 1.2237547	total: 81.4ms	remaining: 371ms
18:	learn: 1.2140657	total: 82.2ms	remaining: 350ms
19:	learn: 1.2053453	tot

<catboost.core.CatBoostRegressor at 0x1a1f560290>

In [60]:
model.score(Z_test, y_test)

0.4013664530303154