In [1]:
from sklearn.model_selection import train_test_split
from autofeat import AutoFeatClassifier
import pandas as pd

import os
import mlflow
from mlflow.models import ModelSignature
from mlflow.types import Schema, ColSpec


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

from dotenv import load_dotenv
load_dotenv()

mlflow_s3_endpoint_url = os.getenv('MLFLOW_S3_ENDPOINT_URL')
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
os.environ["MLFLOW_S3_ENDPOINT_URL"] = mlflow_s3_endpoint_url 
os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id  
os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key  

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [3]:
!echo $DB_DESTINATION_USER

mle_20240824_ff21c1bdfa


In [4]:
df = pd.read_csv('data/initial_data.csv')
df

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2553,2016-03-01,,One year,No,Electronic check,26.90,1250.85,,,...,,,,,Female,0,Yes,No,Yes,0
1,5851,2018-04-01,2019-12-01,Month-to-month,No,Credit card (automatic),71.30,1389.20,Fiber optic,No,...,No,No,No,No,Male,1,No,No,No,1
2,5852,2017-03-01,,One year,Yes,Bank transfer (automatic),110.80,3836.30,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Female,1,No,No,Yes,0
3,5853,2018-01-01,2019-10-01,Month-to-month,Yes,Electronic check,69.10,1474.75,Fiber optic,No,...,No,No,No,No,Male,1,Yes,No,No,1
4,5854,2014-12-01,,One year,No,Electronic check,96.10,6001.45,Fiber optic,Yes,...,No,No,No,Yes,Male,0,Yes,No,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,5846,2019-10-01,2020-01-01,Month-to-month,Yes,Electronic check,75.80,246.30,Fiber optic,No,...,No,No,No,No,Female,0,No,No,Yes,1
7028,5847,2019-11-01,,Month-to-month,No,Electronic check,76.10,257.60,Fiber optic,No,...,No,No,No,No,Female,0,No,No,Yes,0
7029,5848,2014-12-01,,One year,Yes,Bank transfer (automatic),94.00,5757.20,Fiber optic,No,...,No,No,Yes,Yes,Female,0,No,No,Yes,0
7030,5849,2014-02-01,,Two year,No,Credit card (automatic),103.95,7517.70,Fiber optic,Yes,...,Yes,Yes,No,Yes,Male,0,Yes,Yes,Yes,0


In [5]:
from sklearn.preprocessing import OneHotEncoder
# Initialize the OneHotEncoder
ohe = OneHotEncoder()

# Fit and transform the data
encoded_df = ohe.fit_transform(df[['paperless_billing', 'payment_method', 
                                   'internet_service', 'online_security',
                                   'online_backup', 'device_protection',
                                   'tech_support', 'streaming_tv',
                                   'streaming_movies', 'gender', 
                                   'senior_citizen', 'partner', 
                                   'dependents', 'multiple_lines']])

# Get the encoded column names
encoded_columns = ohe.get_feature_names_out().tolist()

# Concatenate the original dataframe with the encoded columns
df_encoded = pd.concat([df[["monthly_charges", "total_charges",'target','begin_date','end_date']], pd.DataFrame(encoded_df.toarray(), columns=encoded_columns)], axis=1)

df = df_encoded
df

Unnamed: 0,monthly_charges,total_charges,target,begin_date,end_date,paperless_billing_No,paperless_billing_Yes,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,...,gender_Female,gender_Male,senior_citizen_0,senior_citizen_1,partner_No,partner_Yes,dependents_No,dependents_Yes,multiple_lines_No,multiple_lines_Yes
0,26.90,1250.85,0,2016-03-01,,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,71.30,1389.20,1,2018-04-01,2019-12-01,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,110.80,3836.30,0,2017-03-01,,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
3,69.10,1474.75,1,2018-01-01,2019-10-01,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,96.10,6001.45,0,2014-12-01,,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,75.80,246.30,1,2019-10-01,2020-01-01,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7028,76.10,257.60,0,2019-11-01,,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7029,94.00,5757.20,0,2014-12-01,,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7030,103.95,7517.70,0,2014-02-01,,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [6]:
df.columns

Index(['monthly_charges', 'total_charges', 'target', 'begin_date', 'end_date',
       'paperless_billing_No', 'paperless_billing_Yes',
       'payment_method_Bank transfer (automatic)',
       'payment_method_Credit card (automatic)',
       'payment_method_Electronic check', 'payment_method_Mailed check',
       'internet_service_DSL', 'internet_service_Fiber optic',
       'internet_service_nan', 'online_security_No', 'online_security_Yes',
       'online_security_nan', 'online_backup_No', 'online_backup_Yes',
       'online_backup_nan', 'device_protection_No', 'device_protection_Yes',
       'device_protection_nan', 'tech_support_No', 'tech_support_Yes',
       'tech_support_nan', 'streaming_tv_No', 'streaming_tv_Yes',
       'streaming_tv_nan', 'streaming_movies_No', 'streaming_movies_Yes',
       'streaming_movies_nan', 'gender_Female', 'gender_Male',
       'senior_citizen_0', 'senior_citizen_1', 'partner_No', 'partner_Yes',
       'dependents_No', 'dependents_Yes', 'multiple_lin

In [7]:
df['target'].value_counts()

target
0    5163
1    1869
Name: count, dtype: int64

In [8]:

cat_features = [
'paperless_billing_No',
       'paperless_billing_Yes', 'payment_method_Bank transfer (automatic)',
       'payment_method_Credit card (automatic)',
       'payment_method_Electronic check', 'payment_method_Mailed check',
       'internet_service_DSL', 'internet_service_Fiber optic',
       'internet_service_nan', 'online_security_No', 'online_security_Yes',
       'online_security_nan', 'online_backup_No', 'online_backup_Yes',
       'online_backup_nan', 'device_protection_No', 'device_protection_Yes',
       'device_protection_nan', 'tech_support_No', 'tech_support_Yes',
       'tech_support_nan', 'streaming_tv_No', 'streaming_tv_Yes',
       'streaming_tv_nan', 'streaming_movies_No', 'streaming_movies_Yes',
       'streaming_movies_nan', 'gender_Female', 'gender_Male',
       'senior_citizen_0', 'senior_citizen_1', 'partner_No', 'partner_Yes',
       'dependents_No', 'dependents_Yes', 'multiple_lines_No',
       'multiple_lines_Yes',
]
num_features = ["monthly_charges", "total_charges"]

features = cat_features + num_features
target = ['target']

split_column = "begin_date"
test_size = 0.2

# Разделение данных на обучающий и тестовый наборы
df = df.sort_values(by=[split_column])
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)
# Проверка на NaN в x_train и x_test
if y_train.isnull().any().any():
    print("x_train содержит NaN значения. Обработка пропущенных значений...")
    X_train = X_train.fillna(X_train.mean())  # Заполнение NaN средним значением

if y_test.isnull().any().any():
    print("y_test содержит NaN значения. Обработка пропущенных значений...")
    X_test = X_test.fillna(X_test.mean())  # Заполнение NaN средним значением
    
# Проверка на NaN в y_train и y_test
if y_train.isnull().any().any():
    print("y_train содержит NaN значения. Обработка пропущенных значений...")
    y_train = y_train.fillna(y_train.mean())  # Заполнение NaN средним значением

if y_test.isnull().any().any():
    print("y_test содержит NaN значения. Обработка пропущенных значений...")
    y_test = y_test.fillna(y_test.mean())  # Заполнение NaN средним значением

In [9]:
y_train.value_counts()

target
0         4429
1         1196
Name: count, dtype: int64

In [10]:
missing_values = X_train.isnull().sum()
print(missing_values)

paperless_billing_No                        0
paperless_billing_Yes                       0
payment_method_Bank transfer (automatic)    0
payment_method_Credit card (automatic)      0
payment_method_Electronic check             0
payment_method_Mailed check                 0
internet_service_DSL                        0
internet_service_Fiber optic                0
internet_service_nan                        0
online_security_No                          0
online_security_Yes                         0
online_security_nan                         0
online_backup_No                            0
online_backup_Yes                           0
online_backup_nan                           0
device_protection_No                        0
device_protection_Yes                       0
device_protection_nan                       0
tech_support_No                             0
tech_support_Yes                            0
tech_support_nan                            0
streaming_tv_No                   

In [11]:
# Drop rows with any missing values
X_train_clean = X_train.dropna()

# Fill missing values with the mean of each column
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)

In [12]:
# Определение трансформаций
transformations = ('1/', 'log', 'abs', 'sqrt')

# Создание экземпляра AutoFeatRegressor
afc = AutoFeatClassifier(categorical_cols=cat_features, transformations=transformations, feateng_steps=1, n_jobs=-1)

# Применение автоэнкодера признаков для данных для обучения
X_train_features = afc.fit_transform(X_train, y_train)

# Применение автоэнкодера признаков для данных для валидации/теста
X_test_features = afc.transform(X_test)

  y = column_or_1d(y, warn=True)


In [None]:
# artifact_path = "afc"
# experiment_id = mlflow.get_experiment_by_name("churn_volkovandrey_test").experiment_id
# print('experinment id: ',experiment_id)

# with mlflow.start_run(run_name="afc", experiment_id=experiment_id) as run:
#     run_id = run.info.run_id
#     print('run_id:',run_id)
#     afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)

In [14]:
df

Unnamed: 0,monthly_charges,total_charges,target,begin_date,end_date,paperless_billing_No,paperless_billing_Yes,payment_method_Bank transfer (automatic),payment_method_Credit card (automatic),payment_method_Electronic check,...,gender_Female,gender_Male,senior_citizen_0,senior_citizen_1,partner_No,partner_Yes,dependents_No,dependents_Yes,multiple_lines_No,multiple_lines_Yes
5710,92.45,6440.25,1,2013-10-01,2019-10-01,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4642,104.15,7689.95,1,2013-10-01,2019-10-01,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
5806,117.80,8684.80,1,2013-10-01,2019-10-01,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2231,108.05,7532.15,1,2013-11-01,2019-10-01,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4245,108.60,7690.90,1,2013-11-01,2019-10-01,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,44.05,44.05,0,2020-01-01,,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
385,19.25,19.25,0,2020-01-01,,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3114,45.55,45.55,0,2020-01-01,,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1582,44.80,44.80,0,2020-01-01,,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [15]:
df.columns

Index(['monthly_charges', 'total_charges', 'target', 'begin_date', 'end_date',
       'paperless_billing_No', 'paperless_billing_Yes',
       'payment_method_Bank transfer (automatic)',
       'payment_method_Credit card (automatic)',
       'payment_method_Electronic check', 'payment_method_Mailed check',
       'internet_service_DSL', 'internet_service_Fiber optic',
       'internet_service_nan', 'online_security_No', 'online_security_Yes',
       'online_security_nan', 'online_backup_No', 'online_backup_Yes',
       'online_backup_nan', 'device_protection_No', 'device_protection_Yes',
       'device_protection_nan', 'tech_support_No', 'tech_support_Yes',
       'tech_support_nan', 'streaming_tv_No', 'streaming_tv_Yes',
       'streaming_tv_nan', 'streaming_movies_No', 'streaming_movies_Yes',
       'streaming_movies_nan', 'gender_Female', 'gender_Male',
       'senior_citizen_0', 'senior_citizen_1', 'partner_No', 'partner_Yes',
       'dependents_No', 'dependents_Yes', 'multiple_lin

In [16]:

from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
df["monthly_charges"] = scaler.fit_transform(df["monthly_charges"].values.reshape(-1, 1))

df["total_charges"] = scaler.fit_transform(df["total_charges"].values.reshape(-1, 1))

In [17]:
# Save X to a CSV file
X = df.drop(['begin_date', 'end_date', 'target'], axis=1)
X.to_csv('data/X_autofeat.csv', index=False)

# Save y to a CSV file
y = df['target']
y.to_csv('data/y_autofeat.csv', index=False)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.models import infer_signature


EXPERIMENT_NAME = 'churn_volkovandrey_test'
RUN_NAME = "afc_regr_model" 
REGISTRY_MODEL_NAME = 'new_model2'

model = LogisticRegression(max_iter=1000, penalty='l2', solver='lbfgs')
# Добавляем гиперпараметр C, влияющий на сходимость алгоритма
model.C = 1.0

X = df.drop(['begin_date', 'end_date', 'target'], axis=1)
y = df['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

# Логирование модели в MLflow
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # Обучение модели
    model.fit(X_train, y_train)

    # Оценка качества модели на валидационной выборке
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)

    # Логирование метрик
    mlflow.log_metric("accuracy", accuracy)

    # Signature
    signature = infer_signature(X_val, y_val)

    # Sample
    input_example = X_train.sample(n=1)

    # Логирование модели
    # infer_signature Используйте метод, чтобы попытаться определить подпись непосредственно из входных 
    # и выходных данных
    mlflow.sklearn.log_model(model, 
                         artifact_path="classifier",
                         signature=signature,
                         input_example=input_example)

    # Вывод качества модели
    print(f'Accuracy на валидационной выборке: {accuracy:.2f}')


  outputs = _infer_schema(model_output) if model_output is not None else None


Accuracy на валидационной выборке: 0.79
