In [1]:
from catboost import CatBoostClassifier
import nbimporter
from ColumnTransformers import *
from sklearn.model_selection import GridSearchCV

In [None]:
def CatBoostPipeline(Numerical=['Transaction.Amount', 'Customer.Age','Account.Age.Days','Quantity'],
                    cat_features=["Payment.Method",'browser','Product.Category','Device.Used','source',"sex"]):
    """Build the column-wise preprocessing step that feeds the CatBoost model.

    Parameters
    ----------
    Numerical : list of str
        Columns handed to ``StandardScaler``.
    cat_features : list of str
        Categorical columns. They are forwarded untouched and always placed
        LAST in the transformed output, so callers can address them as the
        trailing ``len(cat_features)`` columns.

    Returns
    -------
    ColumnTransformer
        An unfitted transformer (presumably scikit-learn's, star-imported
        through ``ColumnTransformers`` -- TODO confirm).
    """
    column_transformer = ColumnTransformer([
        ('time_features', TimeTransformer(), ["Transaction.Date", "Transaction.Hour"]),
        ("high_amount", HighAmountTransformer(), ["Transaction.Amount"]),
        # list(...) copies the argument so the shared default list is never
        # handed to (and possibly mutated by) the transformer.
        ("numerical", StandardScaler(), list(Numerical)),
        ("age", AgeTransfomer(), ["Customer.Age"]),
        # Previously `cat_features` was accepted but unused, so none of the
        # categorical columns were routed into the output. Keep this entry
        # last: downstream code locates the categorical columns by position.
        ("categorical", "passthrough", list(cat_features)),
    ])
    return column_transformer


X, y = KCrossData()
cat_features = ["Payment.Method", 'browser', 'Product.Category', 'Device.Used', 'source', "sex"]

CBP = CatBoostPipeline(cat_features=cat_features)
CBP.fit(X)
# Was `CBP.transform(x)`: lowercase `x` is not defined anywhere in this notebook.
X_transformed = CBP.transform(X)

# The transformed matrix is not guaranteed to carry column names, so the
# categorical columns are passed to CatBoost by index. They are the trailing
# columns because the passthrough step is the last one in the transformer.
n_columns = X_transformed.shape[1]
cat_feature_indices = list(range(n_columns - len(cat_features), n_columns))

model = CatBoostClassifier(iterations=100)  # number of trees
model.fit(X_transformed, y, cat_features=cat_feature_indices, verbose=10)

In [None]:

# Imported in this cell so it runs on a fresh kernel without relying on
# imports that only appear in later cells.
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# This cell used X_train / y_train before any cell had created them. The split
# uses the same arguments as the dedicated split cell further down, so
# re-running that cell yields identical partitions.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Options that are valid for every bootstrap type and grow policy.
common_space = {
    'iterations': [100, 300, 500],           # Number of boosting iterations (trees)
    'learning_rate': [0.001, 0.01, 0.1],     # Step size shrinkage
    'depth': [4, 6, 8],                      # Depth of each tree
    'l2_leaf_reg': [1, 3, 5],                # L2 regularization
    'border_count': [32, 64, 128],           # Number of splits for numerical features
    'random_strength': [1, 5, 10],           # How random score is used for splits
    # NOTE(review): the overfitting detector needs a validation set, which the
    # search below does not pass to fit() -- these two are likely no-ops here.
    'od_type': ['Iter', 'IncToDec'],         # Type of overfitting detector
    'od_wait': [20, 50],                     # Rounds to wait before early stopping
    'scale_pos_weight': [1, 2, 5],           # Class weight scaling (for imbalanced data)
    'colsample_bylevel': [0.5, 1.0],         # Subsample ratio of columns for each split level
}

# CatBoost documents bagging_temperature as usable only with Bayesian bootstrap.
bootstrap_spaces = [
    {'bootstrap_type': ['Bayesian'], 'bagging_temperature': [0, 1, 5]},
    {'bootstrap_type': ['Bernoulli', 'MVS']},
]

# min_data_in_leaf is documented for Depthwise/Lossguide only, max_leaves for
# Lossguide only.
grow_spaces = [
    {'grow_policy': ['SymmetricTree']},
    {'grow_policy': ['Depthwise'], 'min_data_in_leaf': [1, 10, 30]},
    {'grow_policy': ['Lossguide'], 'min_data_in_leaf': [1, 10, 30], 'max_leaves': [31, 64, 128]},
]

# One sub-space per (bootstrap, grow policy) family, so every sampled candidate
# is a combination CatBoost documents as valid.
param_grid = [
    {**common_space, **bootstrap_space, **grow_space}
    for bootstrap_space in bootstrap_spaces
    for grow_space in grow_spaces
]

clf = CatBoostClassifier(
    iterations=50,
    cat_features=cat_features,
    verbose=20
)

# The original exhaustive grid had 3**12 * 2**3 = 4,251,528 candidates (times
# 3 folds), which cannot finish. Sample a fixed, reproducible number instead;
# raise n_iter for a more thorough search.
grid_search = RandomizedSearchCV(
    clf,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    random_state=0,
)
grid_search.fit(X_train, y_train)

grid_search.best_estimator_.get_params()

In [None]:
from catboost import Pool 
# Bundle the full dataset, its labels and the categorical column list into a
# single CatBoost Pool.
pool = Pool(
    X,
    label=y,
    cat_features=cat_features,
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical across re-runs.
data = train_test_split(X, y, random_state=0, test_size=0.2)
X_train, X_validation, y_train, y_validation = data

# Wrap each partition in a Pool so features, labels and the categorical
# column list travel together.
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
validation_pool = Pool(X_validation, label=y_validation, cat_features=cat_features)


In [None]:
# Short 5-iteration run with the CrossEntropy loss (the loss intended for
# targets given as probabilities), evaluated on the validation pool.
model = CatBoostClassifier(iterations=5, learning_rate=0.1, loss_function='CrossEntropy')
model.fit(train_pool, verbose=False, eval_set=validation_pool)

# Report the fitted flag and the parameters the model was built with.
print(
    "Model is fitted: {}".format(model.is_fitted()),
    "Model params: {}".format(model.get_params()),
    sep="\n",
)


In [None]:
# Train with AUC as the evaluation metric and track Accuracy alongside it.
model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy'],
    eval_metric="AUC",
    # The constructor argument is `early_stopping_rounds`; the previous
    # `early_stopping=20` is not an accepted CatBoostClassifier keyword.
    early_stopping_rounds=20
)

model.fit(
    train_pool,
    eval_set=validation_pool,  # early stopping is judged on this pool
    verbose=False,
    plot=True
)
# TODO: select the classification threshold

In [None]:
from catboost import cv

params = {
    'loss_function': 'Logloss',
    'iterations': 100,
    'custom_loss': 'AUC',
    'learning_rate': 0.2
}

# 3-fold cross-validation on the training pool. Stratification is requested
# explicitly, and the fixed partition seed keeps the folds reproducible.
cv_data = cv(
    params=params,
    pool=train_pool,
    fold_count=3,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=True,
    verbose=False
)

# Locate the iteration with the lowest mean test Logloss using the frame's own
# methods, so the cell does not rely on `np` being imported elsewhere.
logloss_mean = cv_data['test-Logloss-mean']
best_iter = logloss_mean.idxmin()
best_value = logloss_mean.loc[best_iter]

# The message used to say "not stratified" although stratified=True is passed.
print(
    "Best validation Logloss score, stratified: {:.4f}±{:.4f} on step {}".format(
        best_value,
        cv_data['test-Logloss-std'].loc[best_iter],
        best_iter
    )
)

# Kept as the last expression so the notebook actually renders the preview;
# in the middle of the cell it produced no output.
cv_data.head(10)


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the learning rate only (three candidates).
param_grid = dict(learning_rate=[0.001, 0.01, 0.5])

# Base estimator: 50 trees, progress logged every 20 iterations.
clf = CatBoostClassifier(iterations=50, cat_features=cat_features, verbose=20)

# 3-fold cross-validated search on the training partition.
grid_search = GridSearchCV(estimator=clf, cv=3, param_grid=param_grid)
grid_search.fit(X_train, y_train)

# Parameters of the best candidate, shown as the cell output.
grid_search.best_estimator_.get_params()

In [None]:
# Longer run (1000 iterations); the hyper-parameters are written out as
# literals here rather than read from `grid_search`.
tunned_model = CatBoostClassifier(
    iterations=1000, learning_rate=0.03, depth=6,
    l2_leaf_reg=3, random_strength=1, bagging_temperature=1,
)

# Fit on the training partition and monitor the held-out validation
# partition, with the interactive training plot enabled.
tunned_model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    plot=True,
    verbose=False,
)