## Import modules

In [149]:
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## Read data


In [150]:
train_data = pd.read_csv("./Breast_cancer_data.csv")
train_data

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0
1,20.57,17.77,132.90,1326.0,0.08474,0
2,19.69,21.25,130.00,1203.0,0.10960,0
3,11.42,20.38,77.58,386.1,0.14250,0
4,20.29,14.34,135.10,1297.0,0.10030,0
...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0
565,20.13,28.25,131.20,1261.0,0.09780,0
566,16.60,28.08,108.30,858.1,0.08455,0
567,20.60,29.33,140.10,1265.0,0.11780,0


In [151]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB


In [152]:
train_data.diagnosis.value_counts()

1    357
0    212
Name: diagnosis, dtype: int64

In [153]:
x_data = train_data.drop(['diagnosis'], axis=1)
y_data = train_data.diagnosis

In [154]:
from sklearn.model_selection import train_test_split

In [155]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0)

## Train

In [171]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [172]:
gkf = KFold(n_splits=5, shuffle=True, random_state=42).split(X=x_train, y=y_train)

In [173]:
params_lgb_grid = {
    'num_leaves': [31, 127],
    'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [30, 50, 100, 300, 400],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1]
    }

In [174]:
lgb_clf = LGBMClassifier(boosting_type='gbdt',  objective='binary', num_boost_round=2000, learning_rate=0.01, metric='auc')
lgb_grid_search = GridSearchCV(estimator=lgb_clf, param_grid=params_lgb_grid, n_jobs=4, cv=gkf, verbose=3)

In [175]:
lgb_grid_search.fit(x_train, y_train)
lgb_clf = lgb_grid_search.best_estimator_

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 176 tasks      | elapsed:   11.0s
[Parallel(n_jobs=4)]: Done 477 tasks      | elapsed:   30.4s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:   37.2s finished


In [212]:
xgb_clf = XGBClassifier(max_depth=6, min_child_weight=11, learning_rate=0.01, n_estimators=600, objective='binary:logistic')

In [None]:
xgb_clf.fit(x_train, y_train)

In [None]:
'loss_function':'Logloss', # objective function
          'eval_metric':'AUC', # metric
          'verbose': 200, # output to stdout info about training process every 200 iterations
          'random_seed': SEED

catboost_clf = CatBoostClassifier(loss_function='Logloss', eval_metric='AUC')

In [None]:
catboost_clf.fit(x_train, y_train)

## Evaluate

In [147]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, roc_auc_score

In [148]:
y_pred = lgb_clf.predict(x_test)
print('Accuracy', accuracy_score(y_test, y_pred))
print('F1', f1_score(y_test, y_pred).astype(float))
print('Precision', precision_score(y_test, y_pred))
print('Recall', recall_score(y_test, y_pred).astype(float))

Accuracy 0.9239766081871345
F1 0.9400921658986177
Precision 0.9357798165137615
Recall 0.9444444444444444


In [94]:
y_pred = xgb_clf.predict(x_test)
print('Accuracy', accuracy_score(y_test, y_pred))
print('F1', f1_score(y_test, y_pred).astype(float))
print('Precision', precision_score(y_test, y_pred))
print('Recall', recall_score(y_test, y_pred).astype(float))

Accuracy 0.935672514619883
F1 0.9497716894977168
Precision 0.9369369369369369
Recall 0.9629629629629629


In [None]:
y_pred = catboost_clf.predict(x_test)
print('Accuracy', accuracy_score(y_test, y_pred))
print('F1', f1_score(y_test, y_pred).astype(float))
print('Precision', precision_score(y_test, y_pred))
print('Recall', recall_score(y_test, y_pred).astype(float))

In [97]:
xgb_clf.save_model('xgboost_clf_model.json')
lgb_clf.booster_.save_model('lightgbm_clf_model.txt')
catboost_clf.save_model("catboost_clf.moodel")

<lightgbm.basic.Booster at 0x7f34ae10d2e0>