In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [7]:
# Load the taxi fare data, parsing pickup_datetime as a datetime column,
# and discard every row that has a missing value.
df = pd.read_csv(
    'https://raw.githubusercontent.com/amongnikol/belajaraDataScienceJCOp/refs/heads/main/course3-regresi%26klasifikasiLanjutan/data/taxi_fare.csv',
    parse_dates=['pickup_datetime'],
).dropna()

# Break the pickup timestamp into calendar features, then drop the raw column.
# Note: `day` holds the day of the week, not the day of the month.
pickup_dt = df.pickup_datetime.dt
df = df.assign(
    year=pickup_dt.year,
    month=pickup_dt.month,
    day=pickup_dt.day_of_week,
    hour=pickup_dt.hour,
).drop(columns='pickup_datetime')

# Trip distance = |longitude difference| + |latitude difference|, expressed in
# the coordinates' own units; the four raw coordinate columns are then removed.
longitude_gap = (df.pickup_longitude - df.dropoff_longitude).abs()
latitude_gap = (df.pickup_latitude - df.dropoff_latitude).abs()
df = df.assign(distance=longitude_gap + latitude_gap).drop(
    columns=['pickup_longitude', 'dropoff_longitude', 'pickup_latitude', 'dropoff_latitude']
)

df.head()

Unnamed: 0,fare_amount,passenger_count,year,month,day,hour,distance
0,4.5,1.0,2009,6,0,17,0.011742
1,16.9,1.0,2010,1,1,16,0.107481
2,5.7,2.0,2011,8,3,0,0.019212
3,7.7,1.0,2012,4,5,4,0.029386
4,5.3,1.0,2010,3,1,7,0.027194


In [8]:
# Keep only the rows whose distance and fare are both strictly positive.
df = df.loc[df.distance.gt(0) & df.fare_amount.gt(0)]

# correlation matrix

pearson
- linear relationship
- syarat: berdistribusi normal

spearman
- rank relationship
- syarat: subjek yang berbeda
- contoh: nilai yang diberikan oleh guru A vs guru B
- contoh: harga saham A vs saham B

kendall
- rank relationship
- syarat: subjek yang sama
- contoh: korelasi nilai quiz vs nilai ujian untuk murid A



In [9]:
from jcopml.plot import plot_correlation_matrix

# Correlation of the selected numeric features, plotted against fare_amount.
plot_correlation_matrix(
    df,
    'fare_amount',
    numeric_col=['passenger_count', 'year', 'distance'],
)

interactive(children=(ToggleButtons(description='method', options=('spearman', 'kendall', 'pearson', 'pearson_…

# dataset splitting

In [10]:
# Target is fare_amount; every remaining column is a feature.
y = df['fare_amount']
x = df.drop(columns=['fare_amount'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((14705, 6), (3677, 6), (14705,), (3677,))

# training

In [11]:
# The train/test split was already created in the "dataset splitting" section
# with a fixed random_state, so recomputing it here was a copy-pasted
# duplicate. Validate the existing split instead of rebuilding it.
assert len(x_train) + len(x_test) == len(x), "split does not cover every row"
assert x_train.index.equals(y_train.index), "train features and target are misaligned"
assert x_test.index.equals(y_test.index), "test features and target are misaligned"
# No row may appear on both sides of the split.
assert x_train.index.intersection(x_test.index).empty, "rows shared between train and test"

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((14705, 6), (3677, 6), (14705,), (3677,))

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from jcopml.tuning import random_search_params as rsp
from jcopml.tuning.space import Integer, Real

In [13]:
# Preprocessing: the numeric columns go through jcopml's numeric pipeline
# (poly=2, yeo-johnson transform); month/day/hour are one-hot encoded.
# NOTE(review): the recorded run emitted a warning from a log-variance
# computation (`np.log(x_trans.var())`) — presumably from the yeo-johnson
# step; confirm which column triggers it.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe(poly=2, transform='yeo-johnson'), ['passenger_count', 'year', 'distance']),
        ('categoric', cat_pipe(encoder='onehot'), ['month', 'day', 'hour']),
    ]
)

# Full model: preprocessing followed by an XGBoost regressor.
pipeline = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('algo', XGBRegressor(random_state=42)),
    ]
)

# Randomized hyperparameter search: 50 sampled candidates, each scored with
# 3-fold cross-validation, using all available cores.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rsp.xgb_params,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(x_train, y_train)

print(model.best_params_)

# Report the train score, the best cross-validation score, and the test score.
train_score = model.score(x_train, y_train)
cv_score = model.best_score_
test_score = model.score(x_test, y_test)
print(train_score, cv_score, test_score)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  loglike = -n_samples / 2 * np.log(x_trans.var())


{'algo__colsample_bytree': 0.6024641082463879, 'algo__gamma': 7, 'algo__learning_rate': 0.04764963542138517, 'algo__max_depth': 3, 'algo__n_estimators': 118, 'algo__reg_alpha': 0.8013508750140631, 'algo__reg_lambda': 3.8765111709116367, 'algo__subsample': 0.7435432121325587}
0.8122455180536658 0.7835487219299475 0.7909408458872208
