# Source:
https://www.kaggle.com/datasets/ravi72munde/uber-lyft-cab-prices?datasetId=195655&sortBy=relevance

# Libraries:

In [41]:
import pandas as pd
import numpy as np
# Silence warnings
#import warnings
#warnings.filterwarnings('ignore')

from category_encoders.target_encoder import TargetEncoder

from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier


from xgboost import XGBClassifier, XGBRFClassifier

# Prepare data

In [14]:
# Absolute, machine-specific location of the cab rides CSV.
# NOTE(review): consider a path relative to a configurable data directory.
CAB_RIDES_CSV = '/home/antonius/Projects/DS_Projects/learn_XGBoost/data/cab_rides.csv'

# Load the rides table and preview its first rows.
df = pd.read_csv(CAB_RIDES_CSV)
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL


In [15]:
len(df)

693071

In [16]:
df.describe()

Unnamed: 0,distance,time_stamp,price,surge_multiplier
count,693071.0,693071.0,637976.0,693071.0
mean,2.18943,1544046000000.0,16.545125,1.01387
std,1.138937,689192500.0,9.324359,0.091641
min,0.02,1543204000000.0,2.5,1.0
25%,1.28,1543444000000.0,9.0,1.0
50%,2.16,1543737000000.0,13.5,1.0
75%,2.92,1544828000000.0,22.5,1.0
max,7.86,1545161000000.0,97.5,3.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693071 entries, 0 to 693070
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   distance          693071 non-null  float64
 1   cab_type          693071 non-null  object 
 2   time_stamp        693071 non-null  int64  
 3   destination       693071 non-null  object 
 4   source            693071 non-null  object 
 5   price             637976 non-null  float64
 6   surge_multiplier  693071 non-null  float64
 7   id                693071 non-null  object 
 8   product_id        693071 non-null  object 
 9   name              693071 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 52.9+ MB


In [18]:
df.isna().mean().sort_values(ascending=False)

price               0.079494
distance            0.000000
cab_type            0.000000
time_stamp          0.000000
destination         0.000000
source              0.000000
surge_multiplier    0.000000
id                  0.000000
product_id          0.000000
name                0.000000
dtype: float64

In [19]:
# Drop every row that has at least one missing value, modifying df in place.
# Per the missing-value shares shown above, `price` is the only column with nulls.
df.dropna(inplace=True)

In [20]:
df.isna().mean().sort_values(ascending=False)

distance            0.0
cab_type            0.0
time_stamp          0.0
destination         0.0
source              0.0
price               0.0
surge_multiplier    0.0
id                  0.0
product_id          0.0
name                0.0
dtype: float64

In [23]:
# `time_stamp` holds Unix epoch milliseconds. Without an explicit unit,
# pd.to_datetime reads integers as nanoseconds and produces dates in 1970,
# so the unit is stated here.
df['date'] = pd.to_datetime(df['time_stamp'], unit='ms')
df.dtypes

distance                   float64
cab_type                    object
time_stamp                   int64
destination                 object
source                      object
price                      float64
surge_multiplier           float64
id                          object
product_id                  object
name                        object
date                datetime64[ns]
dtype: object

In [24]:
# Convert epoch milliseconds to datetimes. unit='ms' replaces the manual
# multiplication by 10**6 (milliseconds -> nanoseconds), which could silently
# overflow int64 for large values instead of raising.
df['date'] = pd.to_datetime(df['time_stamp'], unit='ms')
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223


In [26]:
import datetime as dt

# Derive calendar features from the parsed ride timestamp through the pandas
# datetime accessor (an attribute of the Series, unrelated to the `dt` module
# alias imported above).
date_parts = df['date'].dt
df['month'] = date_parts.month
df['hour'] = date_parts.hour
df['dayofweek'] = date_parts.dayofweek  # Monday=0 ... Sunday=6

In [27]:
def weekend(row):
    """Return 1 when the row's ``dayofweek`` is 5 or 6 (Saturday/Sunday), else 0."""
    return int(row['dayofweek'] in (5, 6))

# Vectorised weekend flag: yields the same 0/1 values as applying `weekend`
# to each row, without a Python-level call per row.
df['weekend'] = df['dayofweek'].isin([5, 6]).astype('int64')

In [28]:
def rush_hour(row):
    """Return 1 for a peak hour (6-9 or 15-18) on a non-weekend row, else 0.

    Reads the row's ``hour`` and ``weekend`` entries.
    """
    in_peak_hour = row['hour'] in (6, 7, 8, 9, 15, 16, 17, 18)
    on_weekday = row['weekend'] == 0
    return 1 if in_peak_hour and on_weekday else 0

# Vectorised rush-hour flag: peak hours (6-9, 15-18) on non-weekend rows.
# Yields the same 0/1 values as applying `rush_hour` to each row, without a
# Python-level call per row.
peak_hours = [6, 7, 8, 9, 15, 16, 17, 18]
df['rush_hour'] = (df['hour'].isin(peak_hours) & df['weekend'].eq(0)).astype('int64')

In [29]:
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890,12,9,6,1,0
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677,11,2,1,0,0
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198,11,1,2,0,0
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749,11,4,4,0,0
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223,11,3,3,0,0


In [30]:
df.tail()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour
693065,1.0,Uber,1543708385534,North End,West End,9.5,1.0,353e6566-b272-479e-a9c6-98bd6cb23f25,9a0e7b09-b92b-4c41-9779-2ad22b4d779d,WAV,2018-12-01 23:53:05.534,12,23,5,1,0
693066,1.0,Uber,1543708385534,North End,West End,13.0,1.0,616d3611-1820-450a-9845-a9ff304a4842,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,2018-12-01 23:53:05.534,12,23,5,1,0
693067,1.0,Uber,1543708385534,North End,West End,9.5,1.0,633a3fc3-1f86-4b9e-9d48-2b7132112341,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,2018-12-01 23:53:05.534,12,23,5,1,0
693069,1.0,Uber,1543708385534,North End,West End,27.0,1.0,727e5f07-a96b-4ad1-a2c7-9abc3ad55b4e,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,2018-12-01 23:53:05.534,12,23,5,1,0
693070,1.0,Uber,1543708385534,North End,West End,10.0,1.0,e7fdc087-fe86-40a5-a3c3-3b2a8badcbda,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,2018-12-01 23:53:05.534,12,23,5,1,0


In [31]:
df['cab_type'].value_counts()

Uber    330568
Lyft    307408
Name: cab_type, dtype: int64

In [32]:
df.groupby('cab_type')['cab_type'].transform('count')

0         307408
1         307408
2         307408
3         307408
4         307408
           ...  
693065    330568
693066    330568
693067    330568
693069    330568
693070    330568
Name: cab_type, Length: 637976, dtype: int64

In [33]:
# For each row, the number of rides that share its cab_type.
rides_per_cab_type = df.groupby('cab_type')['cab_type'].transform('count')
df['cab_freq'] = rides_per_cab_type

In [35]:
# Share of all rides belonging to each row's cab_type. Recomputed from the
# group counts instead of dividing the existing column, so re-running this
# cell cannot divide the values a second time.
df['cab_freq'] = df.groupby('cab_type')['cab_type'].transform('count') / len(df)

In [36]:
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour,cab_freq
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890,12,9,6,1,0,0.481849
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677,11,2,1,0,0,0.481849
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198,11,1,2,0,0,0.481849
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749,11,4,4,0,0,0.481849
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223,11,3,3,0,0,0.481849


## Category encoding 

In [39]:
# Target-encode cab_type using price as the target; in the preview below the
# displayed Lyft rows all receive the same encoded value.
# NOTE(review): the encoder is fitted on the full frame, so each row's encoding
# is derived from data that includes its own price — fit on the training split
# only if this feature feeds a model.
encoder = TargetEncoder()
df['cab_type_mean'] = encoder.fit_transform(df['cab_type'], df['price'])

In [40]:
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,date,month,hour,dayofweek,weekend,rush_hour,cab_freq,cab_type_mean
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,2018-12-16 09:30:07.890,12,9,6,1,0,0.481849,17.351396
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,2018-11-27 02:00:23.677,11,2,1,0,0,0.481849,17.351396
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,2018-11-28 01:00:22.198,11,1,2,0,0,0.481849,17.351396
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,2018-11-30 04:53:02.749,11,4,4,0,0,0.481849,17.351396
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,2018-11-29 03:49:20.223,11,3,3,0,0,0.481849,17.351396


## Classifier comparison on the breast cancer dataset (`load_breast_cancer`)

In [42]:
X, y = load_breast_cancer(return_X_y=True)

In [43]:
kfold = StratifiedKFold(n_splits=5)

In [44]:
def classification_model(model):
    """Return the mean cross-validation score of ``model``.

    Scoring uses ``cross_val_score`` on the notebook-level ``X`` and ``y`` with
    the shared ``kfold`` splitter.
    """
    return cross_val_score(model, X, y, cv=kfold).mean()

In [45]:
classification_model(XGBClassifier())

0.9771619313771154

In [46]:
classification_model(XGBClassifier(booster='gblinear'))

0.6274181027790716

In [47]:
classification_model(XGBClassifier(booster='dart', one_drop=True))

0.9683744760130415

In [48]:
classification_model(RandomForestClassifier(random_state=2))

0.9666356155876418

In [49]:
classification_model(LogisticRegression(max_iter=10000))

0.9525694767893184

In [50]:
# Cross-validate an XGBClassifier with 800 estimators, max depth 4 and
# colsample_bylevel=0.8.
tuned_xgb = XGBClassifier(n_estimators=800, max_depth=4, colsample_bylevel=0.8)
classification_model(tuned_xgb)

0.9789163173420276

## Split data

In [51]:
# Hold out a test split using train_test_split's default proportions; the
# fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Predictions per model

In [52]:
def y_pred(model):
    """Fit ``model`` on the training split and return its test-set predictions.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the test accuracy as a side effect.

    Returns:
        The labels predicted by ``model`` for ``X_test``.
    """
    model.fit(X_train, y_train)
    # Named `predictions` so the local no longer shadows this function's name.
    predictions = model.predict(X_test)
    # accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the
    # printed value is unchanged, but the order now matches the sklearn API.
    score = accuracy_score(y_test, predictions)
    print(score)
    return predictions

In [53]:
y_pred_gbtree = y_pred(XGBClassifier())

0.965034965034965


In [54]:
y_pred_dart = y_pred(XGBClassifier(booster='dart', one_drop=True))

0.965034965034965


In [55]:
y_pred_forest = y_pred(RandomForestClassifier(random_state=42))

0.965034965034965


In [56]:
y_pred_logistic = y_pred(LogisticRegression(max_iter=10000))

0.965034965034965


In [57]:
y_pred_xgb = y_pred(
    XGBClassifier(max_depth=2,
                  n_estimators=500,
                  learning_rate=0.1))

0.965034965034965


In [58]:
# One column of test-set predictions per model, in a fixed column order.
prediction_columns = ['gbtree', 'dart', 'forest', 'logistic', 'xgb']
stacked_predictions = np.c_[y_pred_gbtree, y_pred_dart, y_pred_forest,
                            y_pred_logistic, y_pred_xgb]
df_pred = pd.DataFrame(data=stacked_predictions, columns=prediction_columns)

In [59]:
df_pred.corr()

Unnamed: 0,gbtree,dart,forest,logistic,xgb
gbtree,1.0,1.0,0.970021,0.910063,0.942602
dart,1.0,1.0,0.970021,0.910063,0.942602
forest,0.970021,0.970021,1.0,0.910063,0.942602
logistic,0.910063,0.910063,0.910063,1.0,0.883457
xgb,0.942602,0.942602,0.942602,0.883457,1.0
