In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('D:/ds/leopard-challenge/2. classification/train.csv')

In [3]:
df_test = pd.read_csv('D:/ds/leopard-challenge/2. classification/test.csv')

# Machine learning моделирование

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train, test = train_test_split(df,train_size=0.8,random_state=42,stratify=df['smoking'])

In [6]:
from catboost import CatBoostClassifier

In [7]:
y  = ['smoking']
X = list(df.drop(columns = ['smoking']).columns)
cat_features = list(df.select_dtypes(include = ['object']).columns)

In [41]:
from catboost import Pool

train_data = Pool(data=train[X],
                  label=train[y],
                  cat_features=cat_features
                 )

test_data = Pool(data=test[X],
                  label=test[y],
                  cat_features=cat_features
                 )

In [42]:
model = CatBoostClassifier(verbose=False)
model.fit(train_data)

<catboost.core.CatBoostClassifier at 0x29fbd4fcbe0>

In [43]:
y_pred = model.predict(test[X])

In [44]:
from sklearn.metrics import classification_report
print(classification_report(test[y], y_pred))

              precision    recall  f1-score   support

           0       0.81      0.98      0.89      2213
           1       0.60      0.12      0.20       560

    accuracy                           0.81      2773
   macro avg       0.71      0.55      0.54      2773
weighted avg       0.77      0.81      0.75      2773



In [45]:
from sklearn.metrics import accuracy_score
print(accuracy_score(test[y], y_pred))

0.8059862964298593


# optuna

In [46]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False
    )
    model.fit(train_data)
    y_pred = model.predict(test[X])
    return accuracy_score(test[y], y_pred)

In [48]:
import optuna
from optuna.samplers import TPESampler


optuna.logging.set_verbosity(optuna.logging.WARNING)

from tqdm.notebook import trange, tqdm
for i in tqdm(range(100), colour='blue'):

        sampler = TPESampler(seed=1)
        study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
        study.optimize(objective, n_trials=5)
        pass

  0%|          | 0/100 [00:00<?, ?it/s]

In [54]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  5
Best trial:
  Value:  0.8070681572304363
  Params: 
    iterations: 475
    learning_rate: 0.027583475549166746
    depth: 4
    l2_leaf_reg: 1.0551779964424746e-05
    bootstrap_type: Bayesian
    random_strength: 2.0931628460945333e-07
    bagging_temperature: 0.923385947687978
    od_type: Iter
    od_wait: 26


In [121]:
params_one = {
    'learning_rate': 0.027583475549166746,
    'depth': 4,
    'l2_leaf_reg': 1.0551779964424746e-05,
    'loss_function': 'Logloss',
    'bootstrap_type': 'Bayesian',
    'random_strength': 2.0931628460945333e-07,
    'bagging_temperature': 0.923385947687978,
    'od_type': 'Iter',
    'od_wait': 26,
    'eval_metric':'F1',
    'auto_class_weights':'Balanced'
    }

In [122]:
model = CatBoostClassifier(**params_one, verbose=False)
model.fit(train_data)
y_pred = model.predict(test[X])

In [127]:
print(classification_report(test[y], y_pred))

              precision    recall  f1-score   support

           0       0.88      0.70      0.78      2213
           1       0.35      0.62      0.45       560

    accuracy                           0.69      2773
   macro avg       0.61      0.66      0.61      2773
weighted avg       0.77      0.69      0.71      2773



In [91]:
print(accuracy_score(test[y], y_pred))

0.6858997475658132


In [125]:
df_test['smoking'] = model.predict(df_test[X])

In [93]:
df_result = df_test[['ID','smoking']]
df_result.to_csv('my_sub_smoking.csv', index=False)