In [11]:
# Read the data
import pandas as pd
from sklearn.metrics import mean_squared_error
import joblib
import json
import yaml
import os
from sklearn.model_selection import train_test_split

# Load the churn dataset from the CSV file.
data = pd.read_csv('dataframe/churn_data.csv')

# Separate the predictors from the 'target' column before splitting.
feature_frame = data.drop(columns=['target'])
target_series = data['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

# Preview the first two training rows and their labels.
print(X_train.head(2))
print(y_train.head(2))


         id  begin_date end_date            type paperless_billing  \
1452   8471  2018-07-01      NaN  Month-to-month               Yes   
3269  10289  2017-01-01      NaN        One year               Yes   

                 payment_method  monthly_charges  total_charges  \
1452  Bank transfer (automatic)             45.0         865.85   
3269    Credit card (automatic)            104.5        3778.00   

     internet_service online_security online_backup device_protection  \
1452              DSL              No            No                No   
3269      Fiber optic              No           Yes                No   

     tech_support streaming_tv streaming_movies gender  senior_citizen  \
1452           No           No               No   Male               0   
3269          Yes          Yes              Yes   Male               0   

     partner dependents multiple_lines  
1452      No         No             No  
3269      No         No            Yes  
1452    0
3269    0
N

In [14]:
# Train the model

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from category_encoders import CatBoostEncoder
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

# Main logic of the training step.

# Object-typed columns are treated as categorical. Columns with exactly two
# distinct values are one-hot encoded; the remaining ones go through
# CatBoostEncoder.
cat_features = X_train.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2

binary_column_index = potential_binary_features[potential_binary_features].index
other_column_index = potential_binary_features[~potential_binary_features].index

binary_cat_features = cat_features[binary_column_index]
other_cat_features = cat_features[other_column_index]

# Float-typed columns are standardised.
num_features = X_train.select_dtypes(['float'])

# (name, transformer, columns) triples; any column not listed is dropped.
column_transformers = [
    ('binary', OneHotEncoder(drop='if_binary'), binary_cat_features.columns.tolist()),
    ('cat', CatBoostEncoder(return_df=False), other_cat_features.columns.tolist()),
    ('num', StandardScaler(), num_features.columns.tolist()),
]
preprocessor = ColumnTransformer(
    column_transformers,
    remainder='drop',
    verbose_feature_names_out=False,
)

model = CatBoostClassifier(auto_class_weights='Balanced', verbose=0)

# Chain preprocessing and the classifier so both are fitted in one call.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)
pipeline.fit(X_train, y_train)

print(model)

<catboost.core.CatBoostClassifier object at 0x7fbfec3baf20>


In [15]:
# Predict labels for the held-out test set with the fitted pipeline.
prediction = pipeline.predict(X_test)
print(prediction)

[0 0 0 ... 1 0 0]


In [19]:
# Save the model

import os
import mlflow
import mlflow.sklearn
import pandas as pd

EXPERIMENT_NAME = "churn"
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_arvas"

# S3-compatible endpoint MLflow uses for artifact storage.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# SECURITY: storage credentials must never be written into the notebook.
# They are expected to be present in the environment before the kernel starts
# (e.g. exported in the shell or loaded from an untracked .env file).
# NOTE(review): keys that were previously committed here should be treated as
# compromised and rotated.
_missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
if _missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Export them before starting the notebook kernel."
    )

mlflow.set_tracking_uri("http://localhost:5000")

# Only the CatBoost model (not the whole pipeline) is logged below, so the
# signature and input example are built from the preprocessed feature matrix.
X_transformed_test = pipeline.named_steps['preprocessor'].transform(X_test)

pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_transformed_test, prediction)
input_example = X_transformed_test[:10]
metadata = {'model_type': 'monthly'}

print(model)

# get_experiment_by_name returns None for an unknown experiment; create it in
# that case instead of failing with an AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # Log the fitted classifier and register it as a new version of
    # REGISTRY_MODEL_NAME in the model registry.
    model_info = mlflow.catboost.log_model(
        cb_model=model,  # the fitted CatBoostClassifier instance
        artifact_path='models',
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        pip_requirements=pip_requirements,
        await_registration_for=60  # seconds to wait for the version to be created
    )

<catboost.core.CatBoostClassifier object at 0x7fbfec3baf20>


Registered model 'churn_model_arvas' already exists. Creating a new version of this model...
2025/09/10 05:38:56 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_arvas, version 2
Created version '2' of model 'churn_model_arvas'.


In [18]:
# Inspect the preprocessed test feature matrix.
print(X_transformed_test)

[[ 1.          0.          0.         ...  0.17317271 -0.97151
   0.09356225]
 [ 1.          0.          1.         ...  0.17317271 -0.0029449
   1.0870915 ]
 [ 1.          0.          0.         ...  0.15181801 -0.02294969
   0.322973  ]
 ...
 [ 1.          1.          0.         ...  0.15181801  0.46883465
   0.24764112]
 [ 1.          1.          0.         ...  0.20609188 -1.51330631
  -0.80325745]
 [ 0.          1.          1.         ...  0.15181801  1.71246557
   0.71750625]]
