In [58]:
# чтение данных из файла
import pandas as pd
from sklearn.metrics import mean_squared_error
import joblib
import json
import yaml
import os
from sklearn.model_selection import train_test_split

df = pd.read_csv('dataframe/churn_data.csv')

df[:2]

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2017-10-01,,Month-to-month,No,Bank transfer (automatic),64.45,1867.6,DSL,No,...,Yes,Yes,No,No,Female,1,No,No,Yes,0
1,7021,2018-10-01,,Month-to-month,No,Electronic check,69.65,1043.3,Fiber optic,No,...,No,No,No,No,Male,1,No,No,No,0


In [60]:
# чтение данных из базы

import os, psycopg
from dotenv import load_dotenv
load_dotenv()

TABLE_NAME = "users_churn"# таблица с данными

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df[:2]

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2043,7361-YPXFS,2017-10-01,NaT,Month-to-month,No,Bank transfer (automatic),64.45,1867.6,DSL,...,Yes,Yes,No,No,Female,1,No,No,Yes,0
1,2044,6557-BZXLQ,2018-10-01,NaT,Month-to-month,No,Electronic check,69.65,1043.3,Fiber optic,...,No,No,No,No,Male,1,No,No,No,0


In [87]:
# ===== Трансформация данных (тупая) =====
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from category_encoders import CatBoostEncoder
from sklearn.pipeline import Pipeline

# разделяем признаки на категории и числа
cat_columns = [
    'type',
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines'
]
num_columns = [
    'monthly_charges',
    'total_charges'
]

# выбираем энкодеры
encoder_ss = StandardScaler()
encoder_oh = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse_output=False, drop='first')

# а теперь все вместе
preprocessor = ColumnTransformer(
    transformers=[
        ('num', encoder_ss, num_columns),  
        ('cat', encoder_oh, cat_columns)
    ], 
    n_jobs=-1
)

encoded_features = preprocessor.fit_transform(df)
transformed_df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())

transformed_df = pd.concat([transformed_df, df['target']], axis=1)
pd.set_option('display.max_columns', None)  # Отображать все колонки
transformed_df.head(10)

Unnamed: 0,num__monthly_charges,num__total_charges,cat__type_One year,cat__type_Two year,cat__paperless_billing_Yes,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__online_security_Yes,cat__online_security_None,cat__online_backup_Yes,cat__online_backup_None,cat__device_protection_Yes,cat__device_protection_None,cat__tech_support_Yes,cat__tech_support_None,cat__streaming_tv_Yes,cat__streaming_tv_None,cat__streaming_movies_Yes,cat__streaming_movies_None,cat__gender_Male,cat__senior_citizen_1,cat__partner_Yes,cat__dependents_Yes,cat__multiple_lines_Yes,cat__multiple_lines_None,target
0,-0.010359,-0.183402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
1,0.162467,-0.547073,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
2,-1.504315,-0.950627,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,1.520156,2.281888,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,-1.331488,-0.996466,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
5,0.420046,-0.916413,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
6,1.064823,-0.111621,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
7,0.685934,1.582142,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
8,-0.570385,0.482879,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
9,-1.507638,-0.413437,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0


In [75]:
# разделяем выборки для обучения и для тестирования
X_train, X_test, y_train, y_test = train_test_split(
    transformed_df.drop(columns=['target']), transformed_df['target'], test_size=0.2, random_state=42
)
print(X_train[:2])
print(y_train[:2])

      num__monthly_charges  num__total_charges  cat__type_One year  \
2142              1.141266           -0.378054                 0.0   
1623             -1.309885           -0.192248                 0.0   

      cat__type_Two year  cat__paperless_billing_Yes  \
2142                 0.0                         1.0   
1623                 1.0                         0.0   

      cat__payment_method_Credit card (automatic)  \
2142                                          1.0   
1623                                          1.0   

      cat__payment_method_Electronic check  cat__payment_method_Mailed check  \
2142                                   0.0                               0.0   
1623                                   0.0                               0.0   

      cat__internet_service_Fiber optic  cat__internet_service_None  ...  \
2142                                1.0                         0.0  ...   
1623                                0.0                         1.0

In [76]:
# обучение модели

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from category_encoders import CatBoostEncoder
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

model = CatBoostClassifier(auto_class_weights='Balanced', verbose=0)


model.fit(X_train, y_train) 

print (model)

<catboost.core.CatBoostClassifier object at 0x7f6d5e6a49a0>


In [81]:
prediction = model.predict(X_test)
prediction_proba = model.predict_proba(X_test)[:, 1]
print(prediction)
print(prediction_proba)

[0 0 0 ... 0 0 1]
[0.448959   0.26744113 0.02755057 ... 0.03537866 0.01214334 0.66147966]


In [82]:
# Вычисление метрик
from sklearn.metrics import f1_score, roc_auc_score

f1 = f1_score(y_test, prediction)
roc_auc = roc_auc_score(y_test, prediction_proba)

print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

F1 Score: 0.6266968325791855
ROC AUC: 0.8466227080984005


In [86]:
# Сохранение модели и метрик

import os
import mlflow
import mlflow.sklearn
import pandas as pd

EXPERIMENT_NAME = "churn"
RUN_NAME = "model_base"
REGISTRY_MODEL_NAME = "churn_model_arvas"


os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = "YCAJE3Nlz8iDILW5VTYM1ihQB"
os.environ["AWS_SECRET_ACCESS_KEY"] = "YCPjvS7uwhvJpUj3bKm8X-IX4QAwBIVsvX61IL44"

mlflow.set_tracking_uri("http://localhost:5000")

pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]
metadata = {'model_learn': 'base'}

print (model)
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # ваш код здесь
    model_info = mlflow.catboost.log_model(
        cb_model=model,  # Ваш обученный экземпляр модели DecisionTreeClassifier
        artifact_path='models',
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        pip_requirements=pip_requirements,
        await_registration_for=60
    )
    # Логирование метрик
    mlflow.log_metric("F1_Score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)


<catboost.core.CatBoostClassifier object at 0x7f6d5e6a49a0>


Registered model 'churn_model_arvas' already exists. Creating a new version of this model...
2025/09/13 12:04:44 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_arvas, version 4
Created version '4' of model 'churn_model_arvas'.


In [19]:
# Сохранение модели

import os
import mlflow
import mlflow.sklearn
import pandas as pd

EXPERIMENT_NAME = "churn"
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_arvas"


os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = "YCAJE3Nlz8iDILW5VTYM1ihQB"
os.environ["AWS_SECRET_ACCESS_KEY"] = "YCPjvS7uwhvJpUj3bKm8X-IX4QAwBIVsvX61IL44"

mlflow.set_tracking_uri("http://localhost:5000")

X_transformed_test = pipeline.named_steps['preprocessor'].transform(X_test)

pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_transformed_test, prediction)
input_example = X_transformed_test[:10]
metadata = {'model_type': 'monthly'}

print (model)
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # ваш код здесь
    model_info = mlflow.catboost.log_model(
        cb_model=model,  # Ваш обученный экземпляр модели DecisionTreeClassifier
        artifact_path='models',
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        pip_requirements=pip_requirements,
        await_registration_for=60
    )

<catboost.core.CatBoostClassifier object at 0x7fbfec3baf20>


Registered model 'churn_model_arvas' already exists. Creating a new version of this model...
2025/09/10 05:38:56 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_arvas, version 2
Created version '2' of model 'churn_model_arvas'.


In [18]:
print (X_transformed_test)

[[ 1.          0.          0.         ...  0.17317271 -0.97151
   0.09356225]
 [ 1.          0.          1.         ...  0.17317271 -0.0029449
   1.0870915 ]
 [ 1.          0.          0.         ...  0.15181801 -0.02294969
   0.322973  ]
 ...
 [ 1.          1.          0.         ...  0.15181801  0.46883465
   0.24764112]
 [ 1.          1.          0.         ...  0.20609188 -1.51330631
  -0.80325745]
 [ 0.          1.          1.         ...  0.15181801  1.71246557
   0.71750625]]
