In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import phik 
from phik.report import plot_correlation_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# варианты моделей
from catboost import CatBoostClassifier
from xgboost import XGBRFClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from lightautoml import automl
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
# pyboost

# для ансамблирования 
from sklearn.ensemble import VotingClassifier

# заморозка сидов
import torch
import random

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU, plus all CUDA devices when CUDA is available), then switches cuDNN
    to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed handed to each generator.
    """
    # CPU-side generators all take the seed as their single argument.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

    # GPU generators are only touched when a CUDA device is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(1001)

In [None]:
# Load the training table; the first CSV column becomes the row index.
df = pd.read_csv(
    'train_1.csv',
    index_col=0,
)
df

In [None]:
# Order the training rows by index. Rebinding the name (instead of
# inplace=True) keeps the cell a plain "input -> output" step and avoids
# mutating an object that an earlier cell has already displayed.
df = df.sort_index()
df

In [None]:
# Load the table to predict on; the first CSV column becomes the row index.
pred_df = pd.read_csv(
    'test_1.csv',
    index_col=0,
)
pred_df

In [None]:
# Order the prediction rows by index. Rebinding the name (instead of
# inplace=True) keeps the cell a plain "input -> output" step and avoids
# mutating an object that an earlier cell has already displayed.
pred_df = pred_df.sort_index()
pred_df

In [None]:
# Columns that exist in the training frame but not in the prediction frame.
set(df.columns).difference(pred_df.columns)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
df

In [None]:
# Total number of missing cells across the whole training frame.
df.isnull().sum().sum()

In [None]:
# Total number of missing cells across the whole prediction frame.
pred_df.isnull().sum().sum()

In [None]:
# TODO: compare summary statistics of the train and test dataframes

In [None]:
# Frequency of each machine_type value in the train and prediction frames.
Counter(df['machine_type']), Counter(pred_df['machine_type'])

In [None]:
# Frequency of each failure_flag value in the training frame.
Counter(df.failure_flag)

In [None]:
# Replace the machine_type labels of the training frame with integer codes.
# The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(df['machine_type'])

df['machine_type'] = le.transform(df['machine_type'])
df

In [None]:
# Encode the prediction frame with the encoder that was fitted on the training
# frame, so each machine_type label gets the same integer code in both frames.
# Refitting here (fit_transform) would build a fresh mapping from the test
# labels alone and silently shift the codes whenever the two label sets differ.
# transform() raises ValueError for a label that was not seen during fit,
# which surfaces such a mismatch instead of hiding it.
pred_df['machine_type'] = le.transform(pred_df['machine_type'])

In [None]:
# Pairwise phik correlation between the columns of the training frame
# (the target column is still part of `df` at this point).
phik_matrix = df.phik_matrix()

plot_correlation_matrix(
    phik_matrix.to_numpy(),
    x_labels=phik_matrix.columns,
    y_labels=phik_matrix.index,
    title='phik correlation matrix',
    fontsize_factor=0.8,
    figsize=(11, 6),
)

In [None]:
# TODO: drop columns whose correlation is too high

In [None]:
# Pairwise phik correlation between the columns of the prediction frame.
# Note that this rebinds `phik_matrix`, replacing the training-frame matrix.
phik_matrix = pred_df.phik_matrix()

plot_correlation_matrix(
    phik_matrix.to_numpy(),
    x_labels=phik_matrix.columns,
    y_labels=phik_matrix.index,
    title='phik correlation matrix',
    fontsize_factor=0.8,
    figsize=(11, 6),
)

In [None]:
# TODO: compare the correlations across the two dataframes

In [None]:
# Feature matrix: every training column except the target.
X = df.drop(['failure_flag'], axis='columns')
X

In [None]:
# Target vector: the failure_flag column of the training frame.
y = df['failure_flag']
y

In [None]:
# TODO: choose a validation scheme

In [None]:
# Hold out 33% of the rows for evaluation, stratified on the target so both
# parts keep the same class proportions; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [None]:
# Two CatBoost configurations for the ensemble: a shallower model with a
# larger learning rate and a deeper one with more iterations and a smaller rate.
model1 = CatBoostClassifier(
    iterations=200,
    depth=6,
    learning_rate=0.1,
    verbose=0,
)
model2 = CatBoostClassifier(
    iterations=300,
    depth=8,
    learning_rate=0.05,
    verbose=0,
)

In [None]:
# Fit the first CatBoost model on its own; the feature-importance plot reads it.
model1.fit(X=X_train, y=y_train)

In [None]:
# Horizontal bar chart of model1's feature importances in ascending order.
feature_importance = model1.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(15, 10))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
# Label each bar with the column name that belongs to its importance value.
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(X_test.columns)[sorted_idx])
ax.set_title('Feature Importance')

In [None]:
# Создание ансамбля с голосованием
model = VotingClassifier(
    estimators=[
        ('cat1', model1), 
        ('cat2', model2)
     ],
    verbose=True,
    # n_jobs = -1,
    voting='hard'  # Можно использовать 'hard' или 'soft'
)

In [None]:
# Fit the voting ensemble on the training part of the split.
model.fit(X=X_train, y=y_train)

In [None]:
# Macro-averaged F1 of the ensemble on the held-out part of the split.
y_test_pred = model.predict(X_test)
f1_score(y_test, y_test_pred, average='macro')

In [None]:
# Ensemble predictions for the prediction frame.
pred = model.predict(X=pred_df)
pred

In [None]:
# Build the submission as a separate frame instead of adding the column to
# pred_df in place: pred_df is the input of model.predict(), so mutating it
# would give the predict cell an extra 'failure_flag' feature on any re-run.
# The file written is the same as before (all feature columns + failure_flag).
submission = pred_df.assign(failure_flag=pred)

# NOTE(review): index=False omits the index that was loaded with index_col=0;
# confirm the expected submission format does not need that id column.
submission.to_csv('sub.csv', index=False)