In [2]:

import os
import mlflow
import boto3
import numpy as np
from dotenv import load_dotenv
from botocore.exceptions import ClientError
import psycopg
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from mlflow.models.signature import infer_signature

# Constants
TABLE_NAME = "users_churn"
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
EXPERIMENT_NAME = "churn_ivan_panchenko"
RUN_NAME = "feature_selection"
REGISTRY_MODEL_NAME = "churn_model_ivan_panchenko"
FS_ASSETS = "fs_assets"

# Load environment variables
load_dotenv()

def get_env_variable(var_name):
    value = os.getenv(var_name)
    if not value:
        raise ValueError(f"Environment variable {var_name} is not set in the .env file")
    return value

# Get environment variables
S3_ENDPOINT_URL = get_env_variable('S3_ENDPOINT_URL')
S3_BUCKET_NAME = get_env_variable('S3_BUCKET_NAME')
AWS_ACCESS_KEY_ID = get_env_variable('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = get_env_variable('AWS_SECRET_ACCESS_KEY')

# Database connection setup
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv('DB_DESTINATION_HOST'),
    "port": os.getenv('DB_DESTINATION_PORT'),
    "dbname": os.getenv('DB_DESTINATION_NAME'),
    "user": os.getenv('DB_DESTINATION_USER'),
    "password": os.getenv('DB_DESTINATION_PASSWORD')
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])
connection.update(postgres_credentials)

# Fetch data from database
with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)
print(f"Table size: {df.shape[0]} rows; {df.shape[1]} columns")

# Define features and target
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"
test_size = 0.2

# Check if 'split' column exists, if not, create a random split
if 'split' not in df.columns:
    print("'split' column not found. Creating a random split.")
    df['split'] = np.random.choice(['train', 'test'], size=len(df), p=[1-test_size, test_size])

# Split the data
df_train = df[df['split'] == 'train']
df_test = df[df['split'] == 'test']

X_train = df_train[features]
y_train = df_train[target]
X_test = df_test[features]
y_test = df_test[target]

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Model parameters
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'learning_rate': [0.01, 0.1, 0.3],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'iterations': [iterations],
    'loss_function': [loss_function],
    'task_type': [task_type],
    'random_seed': [random_seed],
    'verbose': [verbose]
}

# Initialize model and perform grid search
model = CatBoostClassifier(
    loss_function=loss_function,
    task_type=task_type,
    iterations=iterations,
    verbose=verbose,
    random_seed=random_seed
)

cv = GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=-1)
clf = cv.fit(X_train, y_train)

# Set up MLflow
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Process results
cv_results = pd.DataFrame(clf.cv_results_)
best_params = clf.best_params_

model_best = CatBoostClassifier(
    # loss_function=loss_function,
    # task_type=task_type,
    # iterations=iterations,
    # verbose=verbose,
    # random_seed=random_seed,
    **best_params
)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

# Calculate metrics
metrics = {}
tn, fp, fn, tp = confusion_matrix(y_test, prediction).ravel()
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics.update({
    "err1": err1, "err2": err2, "auc": auc, "precision": precision,
    "recall": recall, "f1": f1, "logloss": logloss,
    "mean_fit_time": cv_results['mean_fit_time'].mean(),
    "std_fit_time": cv_results['std_fit_time'].mean(),
    "mean_test_score": cv_results['mean_test_score'].mean(),
    "std_test_score": cv_results['std_test_score'].mean(),
    "best_score": clf.best_score_
})

# Log model with MLflow
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    mlflow.log_params(best_params)
    mlflow.log_metrics(metrics)
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    run_id = run.info.run_id

print(f"Run ID: {run_id}")


Table size: 7043 rows; 22 columns
'split' column not found. Creating a random split.
Training set size: (5645, 3)
Test set size: (1398, 3)


  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_ivan_panchenko' already exists. Creating a new version of this model...
2025/08/11 18:39:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_ivan_panchenko, version 11
Created version '11' of model 'churn_model_ivan_panchenko'.


Run ID: 88fedc3f5a6a462fa182e41cc30ced86


In [5]:

import os
import mlflow
import numpy as np
from dotenv import load_dotenv
import psycopg
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from mlflow.models.signature import infer_signature

# Constants
TABLE_NAME = "users_churn"
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
EXPERIMENT_NAME = "churn_ivan_panchenko"
RUN_NAME = "feature_selection"
REGISTRY_MODEL_NAME = "churn_model_ivan_panchenko"
FS_ASSETS = "fs_assets"

# Load environment variables
load_dotenv()

# Database connection and data fetching (unchanged)
# ...

# Define features and target
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"
split_column = "split"
stratify_column = "target"
test_size = 0.2

df = df.sort_values(by=[split_column])
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Model parameters
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

param_distributions = {
    'learning_rate': [0.01, 0.1, 0.3],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'iterations': [iterations],
    'loss_function': [loss_function],
    'task_type': [task_type],
    'random_seed': [random_seed],
    'verbose': [verbose]
}

# Initialize model and perform random search
model = CatBoostClassifier(
    loss_function=loss_function,
    task_type=task_type,
    iterations=iterations,
    verbose=verbose,
    random_seed=random_seed
)

cv = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=20, cv=2, n_jobs=-1, random_state=random_seed)
clf = cv.fit(X_train, y_train)

# Process results
cv_results = pd.DataFrame(clf.cv_results_)
best_params = clf.best_params_

model = CatBoostClassifier(
    # loss_function=loss_function,
    # task_type=task_type,
    # iterations=iterations,
    # verbose=verbose,
    # random_seed=random_seed,
    **best_params
)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

# Calculate metrics
metrics = {}
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics.update({
    "err1": err1, "err2": err2, "auc": auc, "precision": precision,
    "recall": recall, "f1": f1, "logloss": logloss,
    "mean_fit_time": cv_results['mean_fit_time'].mean(),
    "std_fit_time": cv_results['std_fit_time'].mean(),
    "mean_test_score": cv_results['mean_test_score'].mean(),
    "std_test_score": cv_results['std_test_score'].mean(),
    "best_score": clf.best_score_
})

# Log model with MLflow
pip_requirements = '../requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    mlflow.log_params(best_params)
    mlflow.log_metrics(metrics)
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    run_id = run.info.run_id

print(f"Run ID: {run_id}")


Training set size: (5634, 3)
Test set size: (1409, 3)


  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_ivan_panchenko' already exists. Creating a new version of this model...
2025/08/11 18:46:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_ivan_panchenko, version 12
Created version '12' of model 'churn_model_ivan_panchenko'.


Run ID: bd29c3f29adb47bfb89af5fb03602fc5
