In [38]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

# --- Data source -----------------------------------------------------------
TABLE_NAME = "users_churn"  # database table that holds the data

# --- MLflow tracking server ------------------------------------------------
TRACKING_SERVER_HOST, TRACKING_SERVER_PORT = "127.0.0.1", 5000

# --- MLflow naming ---------------------------------------------------------
EXPERIMENT_NAME = "churn"  # name of your experiment
RUN_NAME = "preprocessing"
REGISTRY_MODEL_NAME = "churn_model_alexandervasilev"  # name of the registered model



In [39]:
import os, psycopg
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Database credentials are read from the environment, not stored in the notebook.
postgres_credentials = dict(
    host=os.getenv("DB_DESTINATION_HOST"),
    port=os.getenv("DB_DESTINATION_PORT"),
    dbname=os.getenv("DB_DESTINATION_NAME"),
    user=os.getenv("DB_DESTINATION_USER"),
    password=os.getenv("DB_DESTINATION_PASSWORD"),
)

# Fixed connection options first, then the credentials on top of them.
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

# Read the whole table; column names are taken from the cursor description.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    columns = [description[0] for description in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2043,7361-YPXFS,2017-10-01,NaT,Month-to-month,No,Bank transfer (automatic),64.45,1867.6,DSL,...,Yes,Yes,No,No,Female,1,No,No,Yes,0
1,2044,6557-BZXLQ,2018-10-01,NaT,Month-to-month,No,Electronic check,69.65,1043.3,Fiber optic,...,No,No,No,No,Male,1,No,No,No,0


In [40]:
from sklearn.impute import SimpleImputer

# Column groups consumed by the numeric and categorical transformers.
num_columns = ["monthly_charges", "total_charges"]
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# Fill missing numeric values with the column mean, overwriting the columns in `df`.
# NOTE(review): this imputation is applied to `df` directly and is not a step of
# the transformer objects created below, so those objects do not impute on their
# own — confirm that is intended for inference-time data.
mean_imputer = SimpleImputer(strategy="mean")
df[num_columns] = mean_imputer.fit_transform(df[num_columns])

# Hyperparameters, grouped by the transformer that uses them.
# SplineTransformer
n_knots = 3
degree_spline = 4
# QuantileTransformer
n_quantiles = 100
# PolynomialFeatures
degree = 3
# KBinsDiscretizer
n_bins = 5
encode = "ordinal"
strategy = "uniform"
subsample = None

# Numeric feature generators.
encoder_spl = SplineTransformer(
    n_knots=n_knots,
    degree=degree_spline,
)
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoder_rb = RobustScaler()
encoder_pol = PolynomialFeatures(degree=degree)
encoder_kbd = KBinsDiscretizer(
    n_bins=n_bins,
    encode=encode,
    strategy=strategy,
    subsample=subsample,
)

# Categorical encoder.
encoder_oh = OneHotEncoder(
    categories="auto",
    handle_unknown="ignore",
    max_categories=10,
    sparse_output=False,
    drop="first",
)

In [43]:
# Every numeric encoder is applied to the same numeric columns; their outputs
# are stacked side by side as separate feature groups.
numeric_transformer = ColumnTransformer(
    transformers=[
        ("spl", encoder_spl, num_columns),
        ("q", encoder_q, num_columns),
        ("rb", encoder_rb, num_columns),
        ("pol", encoder_pol, num_columns),
        ("kbd", encoder_kbd, num_columns),
    ]
)

# Categorical columns go through a single one-hot encoding step.
categorical_transformer = Pipeline(steps=[("encoder", encoder_oh)])

# Top-level preprocessor: numeric branch + categorical branch.
# Generated feature names are prefixed with the branch name ("num__", "cat__").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_columns),
        ("cat", categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
)

encoded_features = preprocessor.fit_transform(df)

# Reuse df's index so the row-wise concat below stays aligned even when `df`
# does not carry the default 0..n-1 index (e.g. after filtering or sampling);
# otherwise concat would align on labels and introduce NaN rows.
transformed_df = pd.DataFrame(
    encoded_features,
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)

# Original columns followed by the generated features.
new_df = pd.concat([df, transformed_df], axis=1)
pd.set_option("display.max_columns", None)  # show all columns
new_df.head(10)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,num__spl__monthly_charges_sp_0,num__spl__monthly_charges_sp_1,num__spl__monthly_charges_sp_2,num__spl__monthly_charges_sp_3,num__spl__monthly_charges_sp_4,num__spl__monthly_charges_sp_5,num__spl__total_charges_sp_0,num__spl__total_charges_sp_1,num__spl__total_charges_sp_2,num__spl__total_charges_sp_3,num__spl__total_charges_sp_4,num__spl__total_charges_sp_5,num__q__monthly_charges,num__q__total_charges,num__rb__monthly_charges,num__rb__total_charges,num__pol__1,num__pol__monthly_charges,num__pol__total_charges,num__pol__monthly_charges^2,num__pol__monthly_charges total_charges,num__pol__total_charges^2,num__pol__monthly_charges^3,num__pol__monthly_charges^2 total_charges,num__pol__monthly_charges total_charges^2,num__pol__total_charges^3,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__gender_Male
0,2043,7361-YPXFS,2017-10-01,NaT,Month-to-month,No,Bank transfer (automatic),64.45,1867.6,DSL,No,Yes,Yes,Yes,No,No,Female,1,No,No,Yes,0,2e-06,0.056804,0.496757,0.416666,0.0297723,0.0,0.004502,0.232796,0.595606,0.165716,0.001381,0.0,0.440187,0.577099,-0.108556,0.138002,1.0,64.45,1867.6,4153.8025,120366.82,3487930.0,267712.6,7757642.0,224797100.0,6514058000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2044,6557-BZXLQ,2018-10-01,NaT,Month-to-month,No,Electronic check,69.65,1043.3,Fiber optic,No,No,No,No,No,No,Male,1,No,No,No,0,0.0,0.037981,0.446766,0.469639,0.04561382,1.142973e-08,0.014163,0.332225,0.55675,0.096732,0.0001302212,0.0,0.48256,0.42082,-0.012879,-0.105559,1.0,69.65,1043.3,4851.1225,72665.845,1088475.0,337880.7,5061176.0,75812280.0,1135606000.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,2045,2550-QHZGP,2019-07-01,NaT,One year,No,Mailed check,19.5,128.6,,,,,,,,Male,0,No,No,No,0,0.037673,0.445748,0.470609,0.04597,1.595455e-08,0.0,0.037601,0.445511,0.470835,0.046053,1.71808e-08,0.0,0.030303,0.128695,-0.935603,-0.37583,1.0,19.5,128.6,380.25,2507.7,16537.96,7414.875,48900.15,322490.2,2126782.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,2046,7519-JTWQH,2014-05-01,NaT,Two year,Yes,Bank transfer (automatic),110.5,7455.45,Fiber optic,No,Yes,Yes,Yes,Yes,Yes,Female,0,No,No,Yes,0,0.0,3e-05,0.076385,0.531653,0.3715967,0.02033488,0.0,0.00027,0.111803,0.570269,0.3066895,0.010968,0.972832,0.971421,0.73873,1.789075,1.0,110.5,7455.45,12210.25,823827.225,55583730.0,1349233.0,91032910.0,6142003000.0,414401800000.0,4.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2047,2538-OIMXF,2020-01-01,NaT,Month-to-month,No,Mailed check,24.7,24.7,DSL,No,No,No,No,No,No,Female,0,No,Yes,,0,0.024051,0.391047,0.517404,0.067486,1.131054e-05,0.0,0.04144,0.457652,0.459014,0.041894,1.432327e-13,0.0,0.189033,0.023407,-0.839926,-0.40653,1.0,24.7,24.7,610.09,610.09,610.09,15069.22,15069.22,15069.22,15069.22,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,2048,8543-MSDMF,2019-11-01,NaT,Month-to-month,Yes,Electronic check,77.4,206.15,DSL,Yes,No,Yes,No,Yes,Yes,Male,0,No,No,No,0,0.0,0.019105,0.364548,0.536516,0.07979011,4.100195e-05,0.034914,0.436287,0.479445,0.049353,1.456293e-07,0.0,0.58461,0.169597,0.129715,-0.352916,1.0,77.4,206.15,5990.76,15956.01,42497.82,463684.8,1234995.0,3289331.0,8760926.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6,2049,9961-JBNMK,2018-03-01,2019-12-01,Month-to-month,Yes,Bank transfer (automatic),96.8,2030.3,Fiber optic,No,No,Yes,No,Yes,Yes,Male,1,No,No,No,1,0.0,0.001517,0.169995,0.596467,0.2278294,0.0041917,0.003433,0.214624,0.598159,0.181848,0.001935148,0.0,0.838699,0.596864,0.486661,0.186076,1.0,96.8,2030.3,9370.24,196533.04,4122118.0,907039.2,19024400.0,399021000.0,8369136000.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7,2050,1170-SASML,2014-05-01,NaT,Month-to-month,Yes,Bank transfer (automatic),85.4,5869.4,Fiber optic,No,Yes,Yes,No,No,No,Female,0,Yes,No,Yes,0,0.0,0.008084,0.278785,0.582393,0.1302051,0.0005330787,0.0,0.007427,0.271519,0.585067,0.1353605,0.000627,0.697811,0.893245,0.276909,1.320436,1.0,85.4,5869.4,7293.16,501246.76,34449860.0,622835.9,42806470.0,2942018000.0,202200000000.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,2051,4872-JCVCA,2014-03-01,NaT,Two year,Yes,Bank transfer (automatic),47.6,3377.8,DSL,Yes,Yes,Yes,Yes,No,No,Female,0,Yes,No,,0,0.001247,0.161238,0.594552,0.238113,0.004849276,0.0,0.000106,0.093231,0.553054,0.338561,0.01504783,0.0,0.307644,0.718938,-0.418583,0.584229,1.0,47.6,3377.8,2265.76,160783.28,11409530.0,107850.2,7653284.0,543093800.0,38539120000.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2052,5346-BZCHP,2014-05-01,NaT,Two year,No,Mailed check,19.4,1346.2,,,,,,,,Female,0,Yes,Yes,No,0,0.037981,0.446766,0.469639,0.045614,1.142973e-08,0.0,0.009646,0.294605,0.575871,0.11951,0.000366979,0.0,0.023569,0.487423,-0.937443,-0.016059,1.0,19.4,1346.2,376.36,26116.28,1812254.0,7301.384,506655.8,35157740.0,2439657000.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [35]:
# S3-compatible endpoint of the Yandex Cloud bucket.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The key id and secret of the bucket MLflow is connected to must already be
# present in the environment (e.g. loaded from .env). Re-assigning them from
# os.getenv() is a no-op when they are set and raises an opaque TypeError when
# they are not, so check explicitly and fail with a clear message instead.
missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if name not in os.environ
]
if missing_credentials:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_credentials)
    )

# Tracking and model registry are served from the same address.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(tracking_uri)

# get_experiment_by_name() returns None for an unknown experiment; create it in
# that case instead of failing with AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

# Log the fitted preprocessor as an sklearn model artifact and keep the run id.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer")



In [36]:
# Show the id of the MLflow run stored in `run_id`.
run_id

'5005dc7a5b3a40fdbea5078f91d5ee25'