In [1]:
import mlflow

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [23]:
import pandas as pd
df = pd.read_csv('users_churn.csv')
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,5851,6927-WTFIV,2018-04-01,2019-12-01,Month-to-month,No,Credit card (automatic),71.3,1389.2,Fiber optic,...,No,No,No,No,Male,1,No,No,No,1
1,5852,4118-CEVPF,2017-03-01,,One year,Yes,Bank transfer (automatic),110.8,3836.3,Fiber optic,...,Yes,Yes,Yes,Yes,Female,1,No,No,Yes,0
2,5853,3398-ZOUAA,2018-01-01,2019-10-01,Month-to-month,Yes,Electronic check,69.1,1474.75,Fiber optic,...,No,No,No,No,Male,1,Yes,No,No,1
3,5854,9114-VEPUF,2014-12-01,,One year,No,Electronic check,96.1,6001.45,Fiber optic,...,No,No,No,Yes,Male,0,Yes,No,Yes,0
4,5855,7876-BEUTG,2018-11-01,,Month-to-month,Yes,Mailed check,48.8,720.1,DSL,...,No,No,No,No,Female,0,No,No,Yes,0


In [3]:
import os


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

TABLE_NAME = 'users_churn'

EXPERIMENT_NAME = 'churn_volkovandrey_test'
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = 'baseline_model'

In [24]:
obj_df = df.select_dtypes(include="object")
obj_df.head() #вывод нечисловых колонок

Unnamed: 0,customer_id,begin_date,end_date,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines
0,6927-WTFIV,2018-04-01,2019-12-01,Month-to-month,No,Credit card (automatic),Fiber optic,No,No,No,No,No,No,Male,No,No,No
1,4118-CEVPF,2017-03-01,,One year,Yes,Bank transfer (automatic),Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Female,No,No,Yes
2,3398-ZOUAA,2018-01-01,2019-10-01,Month-to-month,Yes,Electronic check,Fiber optic,No,No,No,No,No,No,Male,Yes,No,No
3,9114-VEPUF,2014-12-01,,One year,No,Electronic check,Fiber optic,Yes,Yes,No,No,No,Yes,Male,Yes,No,Yes
4,7876-BEUTG,2018-11-01,,Month-to-month,Yes,Mailed check,DSL,No,No,No,No,No,No,Female,No,No,Yes


In [6]:
# определение категориальных колонок, которые будут преобразованы
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# создание объекта OneHotEncoder для преобразования категориальных переменных
# auto - автоматическое определение категорий
# ignore - игнорировать ошибки, если встречается неизвестная категория
# max_categories - максимальное количество уникальных категорий
# sparse_output - вывод в виде разреженной матрицы, если False, то в виде обычного массива
# drop="first" - удаляет первую категорию, чтобы избежать ловушки мультиколлинеарности
encoder_oh = OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first')

# применение OneHotEncoder к данным. Преобразование категориальных данных в массив
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# преобразование полученных признаков в DataFrame и установка названий колонок
# get_feature_names_out() - получение имён признаков после преобразования
encoded_df = pd.DataFrame(encoded_features, columns=encoder_oh.get_feature_names_out())

# конкатенация исходного DataFrame с новым DataFrame, содержащим закодированные категориальные признаки
# axis=1 означает конкатенацию по колонкам
obj_df = pd.concat([obj_df, encoded_df], axis=1)

obj_df.head(2)

Unnamed: 0,customer_id,begin_date,end_date,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,...,dependents,multiple_lines,x0_One year,x0_Two year,x1_Credit card (automatic),x1_Electronic check,x1_Mailed check,x2_Fiber optic,x2_nan,x3_Male
0,6927-WTFIV,2018-04-01,2019-12-01,Month-to-month,No,Credit card (automatic),Fiber optic,No,No,No,...,No,No,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,4118-CEVPF,2017-03-01,,One year,Yes,Bank transfer (automatic),Fiber optic,Yes,Yes,Yes,...,No,Yes,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
df['total_charges'] = df['total_charges'].fillna(df['total_charges'].mean())


In [32]:
df.isnull().sum()


id                      0
customer_id             0
begin_date              0
end_date             5174
type                    0
paperless_billing       0
payment_method          0
monthly_charges         0
total_charges           0
internet_service     1526
online_security      1526
online_backup        1526
device_protection    1526
tech_support         1526
streaming_tv         1526
streaming_movies     1526
gender                  0
senior_citizen          0
partner                 0
dependents              0
multiple_lines        682
target                  0
dtype: int64

In [44]:
import pandas as pd
from sklearn.preprocessing import SplineTransformer, QuantileTransformer, RobustScaler, PolynomialFeatures, KBinsDiscretizer

num_columns = ["monthly_charges", "total_charges"]
n_knots = 3
degree_spline = 4
n_quantiles = 100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

num_df = df[num_columns]

# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(encoded_features, columns=encoder_spl.get_feature_names_out(num_columns))
num_df = pd.concat([num_df, encoded_df], axis=1)

# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(encoded_features, columns=encoder_q.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(encoded_features, columns=encoder_rb.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(encoded_features, columns=encoder_pol.get_feature_names_out(num_columns))
# print(type(encoded_df.columns))
# encoded_df.columns = encoded_df.columns[1 + len(num_columns):]
encoded_df = encoded_df[encoded_df.columns[1 + len(num_columns):]]
num_df = pd.concat([num_df, encoded_df], axis=1)

# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(encoded_features, columns=encoder_kbd.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_bin" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

num_df.columns


Index(['monthly_charges', 'total_charges', 'monthly_charges_sp_0',
       'monthly_charges_sp_1', 'monthly_charges_sp_2', 'monthly_charges_sp_3',
       'monthly_charges_sp_4', 'monthly_charges_sp_5', 'total_charges_sp_0',
       'total_charges_sp_1', 'total_charges_sp_2', 'total_charges_sp_3',
       'total_charges_sp_4', 'total_charges_sp_5', 'monthly_charges_q_100',
       'total_charges_q_100', 'monthly_charges_robust', 'total_charges_robust',
       'monthly_charges^2', 'monthly_charges total_charges', 'total_charges^2',
       'monthly_charges^3', 'monthly_charges^2 total_charges',
       'monthly_charges total_charges^2', 'total_charges^3',
       'monthly_charges_bin', 'total_charges_bin'],
      dtype='object')

In [45]:
# Преобразования для числовых колонок
numeric_transformer = ColumnTransformer(
    transformers=[
        ('spl', encoder_spl, num_columns),
        ('q', encoder_q, num_columns),
        ('rb', encoder_rb, num_columns),
        ('pol', encoder_pol, num_columns),
        ('kbd', encoder_kbd, num_columns)
    ]
)


# Преобразования для категориальных колонок
categorical_transformer = Pipeline(
    steps=[
        ('encoder', encoder_oh)
    ]
)

# Объединение двух преобразований
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns)
    ],
    n_jobs=-1)

# Применение преобразований к данным
encoded_features = preprocessor.fit_transform(df)

# Преобразование результата в DataFrame
transformed_df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())

# Объединение преобразованного набора данных с изначальным
df = pd.concat([df, transformed_df], axis=1)
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_nan,cat__gender_Male
0,5851,6927-WTFIV,2018-04-01,2019-12-01,Month-to-month,No,Credit card (automatic),71.3,1389.2,Fiber optic,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,5852,4118-CEVPF,2017-03-01,,One year,Yes,Bank transfer (automatic),110.8,3836.3,Fiber optic,...,4.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [46]:
preprocessor

In [47]:
from dotenv import load_dotenv

# Load credentials from .env file
load_dotenv()

os.environ["MLFLOW_S3_ENDPOINT_URL"] = os.getenv('MLFLOW_S3_ENDPOINT_URL')
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer") 



In [None]:
df.columns


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_nan,cat__gender_Male
0,5851,6927-WTFIV,2018-04-01,2019-12-01,Month-to-month,No,Credit card (automatic),71.30,1389.20,Fiber optic,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,5852,4118-CEVPF,2017-03-01,,One year,Yes,Bank transfer (automatic),110.80,3836.30,Fiber optic,...,4.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,5853,3398-ZOUAA,2018-01-01,2019-10-01,Month-to-month,Yes,Electronic check,69.10,1474.75,Fiber optic,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,5854,9114-VEPUF,2014-12-01,,One year,No,Electronic check,96.10,6001.45,Fiber optic,...,3.0,3.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,5855,7876-BEUTG,2018-11-01,,Month-to-month,Yes,Mailed check,48.80,720.10,DSL,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,5846,4581-SSPWD,2019-10-01,2020-01-01,Month-to-month,Yes,Electronic check,75.80,246.30,Fiber optic,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7039,5847,3370-HXOPH,2019-11-01,,Month-to-month,No,Electronic check,76.10,257.60,Fiber optic,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,5848,9391-YZEJW,2014-12-01,,One year,Yes,Bank transfer (automatic),94.00,5757.20,Fiber optic,...,3.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7041,5849,9958-MEKUC,2014-02-01,,Two year,No,Credit card (automatic),103.95,7517.70,Fiber optic,...,4.0,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


In [54]:
df.columns

Index(['id', 'customer_id', 'begin_date', 'end_date', 'type',
       'paperless_billing', 'payment_method', 'monthly_charges',
       'total_charges', 'internet_service', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines',
       'target', 'num__spl__monthly_charges_sp_0',
       'num__spl__monthly_charges_sp_1', 'num__spl__monthly_charges_sp_2',
       'num__spl__monthly_charges_sp_3', 'num__spl__monthly_charges_sp_4',
       'num__spl__monthly_charges_sp_5', 'num__spl__total_charges_sp_0',
       'num__spl__total_charges_sp_1', 'num__spl__total_charges_sp_2',
       'num__spl__total_charges_sp_3', 'num__spl__total_charges_sp_4',
       'num__spl__total_charges_sp_5', 'num__q__monthly_charges',
       'num__q__total_charges', 'num__rb__monthly_charges',
       'num__rb__total_charges', 'num__pol__1', 'num__pol__monthly_charges',
       'num__pol

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.models import infer_signature


EXPERIMENT_NAME = 'churn_volkovandrey_test'
RUN_NAME = "preprocessing_new" 
REGISTRY_MODEL_NAME = 'new_model'

model = LogisticRegression(max_iter=1000, penalty='l2', solver='lbfgs')
# Добавляем гиперпараметр C, влияющий на сходимость алгоритма
model.C = 1.0

X = df.drop(['id', 'customer_id', 'begin_date', 'end_date', 'type',
       'paperless_billing', 'payment_method', 'monthly_charges',
       'total_charges', 'internet_service', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines',
       'target'], axis=1)
y = df['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

# Логирование модели в MLflow
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # Обучение модели
    model.fit(X_train, y_train)

    # Оценка качества модели на валидационной выборке
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)

    # Логирование метрик
    mlflow.log_metric("accuracy", accuracy)

    # Signature
    signature = infer_signature(X_val, y_val)

    # Sample
    input_example = X_train.sample(n=1)

    # Логирование модели
    # infer_signature Используйте метод, чтобы попытаться определить подпись непосредственно из входных 
    # и выходных данных
    mlflow.sklearn.log_model(model, 
                         artifact_path="classifier",
                         signature=signature,
                         input_example=input_example)

    # Вывод качества модели
    print(f'Accuracy на валидационной выборке: {accuracy:.2f}')


  outputs = _infer_schema(model_output) if model_output is not None else None


Accuracy на валидационной выборке: 0.79
