# Импорт библиотек

In [1]:
import joblib
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import os

In [26]:
from sklearn.multioutput import MultiOutputClassifier
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    confusion_matrix,
)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import hamming_loss


In [4]:
tqdm.pandas()
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
RANDOM_STATE = 1206

# Загрузка данных

## Справочная информация

In [5]:
file_path = "../data/mats/base_scores_dict.joblib"

if os.path.isfile(file_path):
    base_scores_dict = joblib.load(file_path)


In [6]:
base_scores_dict.head()

Unnamed: 0,item,base_score
0,1T,0.4
1,1S,0.4
2,1Lo,0.5
3,1Eu,0.5
4,1F,0.5


In [7]:
file_path = "../data/mats/nlp_dict.joblib"

if os.path.isfile(file_path):
    nlp_dict = joblib.load(file_path)


In [8]:
nlp_dict.head()

Unnamed: 0,item,nlp_item,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9
0,1T,Single Toeloop,3.158019,-1.617936,0.396813,-1.733884,1.948211,0.76528,-4.662311,2.644467,1.162508,0.099441
1,1S,Single Salchow,3.601343,-2.698863,-0.136379,-0.407949,-1.530264,-2.107504,-2.9064,1.774684,0.18298,-3.096374
2,1Lo,Single Loop,5.122801,-4.158421,-2.667444,0.345399,-0.499262,0.415096,-2.308891,3.640449,-1.090383,2.599226
3,1Eu,Single Euler (only used in jump combinations),0.762735,-3.136916,-0.003426,0.913632,-0.219761,-0.314236,0.426446,-0.346271,2.761745,0.762287
4,1F,Single Flip,4.872679,-4.149351,-1.611438,0.783791,0.61482,-0.363553,-3.880303,1.822208,-0.969,-0.667879


## Данные для моделирования

In [9]:
file_path = "../data/processed/final_data.joblib"

if os.path.isfile(file_path):
    final_data = joblib.load(file_path)


In [10]:
final_data.shape

(136126, 62)

In [11]:
final_data.columns

Index(['id', 'total_score_id', 'title', 'decrease', 'base_score', 'goe',
       'avg_score', 'unit_id', 'tournament_id', 'base_score_total_scores',
       'components_score', 'total_score', 'elements_score',
       'decreasings_score', 'starting_place', 'place', 'segment_name', 'info',
       'overall_place', 'overall_total_score', 'overall_place_str', 'color',
       'school_id', 'date_start', 'date_end', 'origin_id', 'sequences',
       'cascade', 'title_nlp', 'cascade_nlp', 'multiply',
       'tournament_duration', 'start_month', 'end_month', 'start_day_of_week',
       'end_day_of_week', 'start_is_weekend', 'end_is_weekend', 'start_season',
       'end_season', 'tournament_year', 'units_with_experience', 'falls',
       'components_score_per_element', 'custom_base_score',
       'avg_overall_place_last_year', 'avg_overall_total_score_last_year',
       'avg_components_score_last_year', 'avg_place_last_year',
       'avg_elements_score_last_year', 'avg_decreasings_score_last_year',


In [12]:
file_path = "../data/processed/final_elements_data.joblib"

if os.path.isfile(file_path):
    final_elements_data = joblib.load(file_path)


In [13]:
final_elements_data.shape

(168745, 92)

## Валидационные данные

In [14]:
file_path = "../data/processed/valid_unit_data.joblib"

if os.path.isfile(file_path):
    valid_unit_data = joblib.load(file_path)


In [15]:
file_path = "../data/processed/test_unit_data.joblib"

if os.path.isfile(file_path):
    test_unit_data = joblib.load(file_path)


In [16]:
file_path = "../data/processed/final_test_unit_data.joblib"

if os.path.isfile(file_path):
    final_test_unit_data = joblib.load(file_path)


In [17]:
file_path = "../data/processed/valid_unit_elements_data.joblib"

if os.path.isfile(file_path):
    valid_unit_elements_data = joblib.load(file_path)


In [18]:
file_path = "../data/processed/test_unit_elements_data.joblib"

if os.path.isfile(file_path):
    test_unit_elements_data = joblib.load(file_path)


In [19]:
file_path = "../data/processed/final_test_unit_elements_data.joblib"

if os.path.isfile(file_path):
    final_test_unit_elements_data = joblib.load(file_path)


In [46]:
file_path = "../data/temp/results_model_two.joblib"

if os.path.isfile(file_path):
    results_model_two = joblib.load(file_path)


# Описание идеи

Третья модель должна предсказывать ошибки на основании предсказанного GOE. Мультиклассовая классификация

# Моделирование на элементах

## Исходные GOE

In [20]:
final_elements_data.columns

Index(['id', 'total_score_id', 'title', 'decrease', 'base_score', 'goe',
       'avg_score', 'unit_id', 'tournament_id', 'base_score_total_scores',
       'components_score', 'total_score', 'elements_score',
       'decreasings_score', 'starting_place', 'place', 'segment_name', 'info',
       'overall_place', 'overall_total_score', 'overall_place_str', 'color',
       'school_id', 'date_start', 'date_end', 'origin_id', 'sequences',
       'cascade', 'title_nlp', 'cascade_nlp', 'multiply',
       'tournament_duration', 'start_month', 'end_month', 'start_day_of_week',
       'end_day_of_week', 'start_is_weekend', 'end_is_weekend', 'start_season',
       'end_season', 'tournament_year', 'units_with_experience', 'falls',
       'components_score_per_element', 'custom_base_score',
       'avg_overall_place_last_year', 'avg_overall_total_score_last_year',
       'avg_components_score_last_year', 'avg_place_last_year',
       'avg_elements_score_last_year', 'avg_decreasings_score_last_year',


In [21]:
data_e8 = final_elements_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        # "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
        "element",
        # "attr_element",
        "prev_element",
        # "attr_prev_element",
        "next_element",
        # "attr_next_element",
        "single_element",
        # "clear_prev_element",
        # "clear_next_element",
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
        "perfect_attr_prev_element",
        "q_attr_prev_element",
        "e_attr_prev_element",
        "l_attr_prev_element",
        "ll_attr_prev_element",
        "h_attr_prev_element",
        "v_attr_prev_element",
        "perfect_attr_next_element",
        "q_attr_next_element",
        "e_attr_next_element",
        "l_attr_next_element",
        "ll_attr_next_element",
        "h_attr_next_element",
        "v_attr_next_element",
    ]
].copy()
data_e8.shape

(168745, 52)

In [22]:
data_e8.duplicated(keep="first").sum()

791

### Разделение данных

In [24]:
X = data_e8.drop(
    [
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
    ],
    axis=1,
)
y = data_e8[
    [
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
    ]
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(126558, 45)
(126558, 7)
(42187, 45)
(42187, 7)


### Моделирование

In [25]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    # "cascade",
    # "title_nlp",
    # "cascade_nlp",
    "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
    "element",
    # "attr_element",
    "prev_element",
    # "attr_prev_element",
    "next_element",
    # "attr_next_element",
    "single_element",
    # "clear_prev_element",
    # "clear_next_element",
    # "perfect_attr_element",
    # "q_attr_element",
    # "e_attr_element",
    # "l_attr_element",
    # "ll_attr_element",
    # "h_attr_element",
    # "v_attr_element",
    "perfect_attr_prev_element",
    "q_attr_prev_element",
    "e_attr_prev_element",
    "l_attr_prev_element",
    "ll_attr_prev_element",
    "h_attr_prev_element",
    "v_attr_prev_element",
    "perfect_attr_next_element",
    "q_attr_next_element",
    "e_attr_next_element",
    "l_attr_next_element",
    "ll_attr_next_element",
    "h_attr_next_element",
    "v_attr_next_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

In [27]:
# Инициализация базового классификатора

base_classifier_e8 = CatBoostClassifier(
    verbose=50,
    n_estimators=50,
    cat_features=category_columns,
    random_state=RANDOM_STATE,
)

# Инициализация мультилейбл классификатора
multi_target_classifier_e8 = MultiOutputClassifier(base_classifier_e8, n_jobs=-1)

# Обучение модели
multi_target_classifier_e8.fit(X_train, y_train)

In [28]:
y_pred_multi_target_e8 = multi_target_classifier_e8.predict(X_test)

In [29]:
y_proba_multi_target_e8 = multi_target_classifier_e8.predict_proba(X_test)

In [30]:
print(classification_report(y_test, y_pred_multi_target_e8))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     33340
           1       0.69      0.33      0.44      2075
           2       0.55      0.08      0.14       206
           3       0.78      0.77      0.77      3699
           4       0.90      0.70      0.79      1712
           5       0.83      0.50      0.62      1137
           6       1.00      1.00      1.00       461

   micro avg       0.97      0.92      0.94     42630
   macro avg       0.82      0.63      0.68     42630
weighted avg       0.95      0.92      0.93     42630
 samples avg       0.92      0.92      0.92     42630



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
hamming_loss_score = hamming_loss(y_test, y_pred_multi_target_e8)
print(f"Hamming Loss: {hamming_loss_score}")

Hamming Loss: 0.016589402964352594


In [32]:
# Получение важности признаков для каждого классификатора
feature_importances = []

for idx, classifier in enumerate(multi_target_classifier_e8.estimators_):
    importances = classifier.get_feature_importance()
    feature_importances.append(importances)
    print(f"Feature importances for label {idx}: {importances}")

# Рассчет среднего значения важности признаков по всем меткам
average_feature_importances = np.mean(feature_importances, axis=0)

# Создание DataFrame для отображения важности признаков
feature_importance_df = pd.DataFrame(
    {"feature": X_train.columns, "importance": average_feature_importances}
).sort_values(by="importance", ascending=False)

feature_importance_df

Feature importances for label 0: [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 3.68262382e-01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.78187679e-01
 3.42139701e-01 6.29488498e-02 3.06448636e-01 9.04736449e-02
 0.00000000e+00 9.86515391e+01 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
Feature importances for label 1: [7.84778958e+00 1.39593455e-02 0.00000000e+00 3.66154755e-02
 6.81805541e-02 7.21957328e-01 1.19866151e-01 8.91649949e-03
 1.03445191e-04 1.94401646e-02 2.51946122e-02 0.00000000e+00
 9.52522368e-02 2.66434017e-02 8.03664220e-07 7.22591167e-02
 1

Unnamed: 0,feature,importance
25,target_clear_element,63.32049
26,difficulty,9.267289
0,goe,8.205073
27,element,6.013262
5,multiply,4.241507
28,prev_element,1.378394
18,avg_overall_total_score_last_year,1.204722
16,falls,1.08634
20,avg_place_last_year,0.972514
22,avg_decreasings_score_last_year,0.729702


## Без GOE

In [33]:
data_e9 = final_elements_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        # "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        # "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        # "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        # "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
        "element",
        # "attr_element",
        "prev_element",
        # "attr_prev_element",
        "next_element",
        # "attr_next_element",
        "single_element",
        # "clear_prev_element",
        # "clear_next_element",
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
        "perfect_attr_prev_element",
        "q_attr_prev_element",
        "e_attr_prev_element",
        "l_attr_prev_element",
        "ll_attr_prev_element",
        "h_attr_prev_element",
        "v_attr_prev_element",
        "perfect_attr_next_element",
        "q_attr_next_element",
        "e_attr_next_element",
        "l_attr_next_element",
        "ll_attr_next_element",
        "h_attr_next_element",
        "v_attr_next_element",
    ]
].copy()
data_e9.shape

(168745, 49)

In [34]:
data_e9.duplicated(keep="first").sum()

5038

### Разделение данных

In [35]:
X = data_e9.drop(
    [
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
    ],
    axis=1,
)
y = data_e9[
    [
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
    ]
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(126558, 42)
(126558, 7)
(42187, 42)
(42187, 7)


### Моделирование

In [36]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    # "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    # "cascade",
    # "title_nlp",
    # "cascade_nlp",
    "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    # "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
    "element",
    # "attr_element",
    "prev_element",
    # "attr_prev_element",
    "next_element",
    # "attr_next_element",
    "single_element",
    # "clear_prev_element",
    # "clear_next_element",
    # "perfect_attr_element",
    # "q_attr_element",
    # "e_attr_element",
    # "l_attr_element",
    # "ll_attr_element",
    # "h_attr_element",
    # "v_attr_element",
    "perfect_attr_prev_element",
    "q_attr_prev_element",
    "e_attr_prev_element",
    "l_attr_prev_element",
    "ll_attr_prev_element",
    "h_attr_prev_element",
    "v_attr_prev_element",
    "perfect_attr_next_element",
    "q_attr_next_element",
    "e_attr_next_element",
    "l_attr_next_element",
    "ll_attr_next_element",
    "h_attr_next_element",
    "v_attr_next_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

In [37]:
# Инициализация базового классификатора

base_classifier_e9 = CatBoostClassifier(
    verbose=50,
    n_estimators=50,
    cat_features=category_columns,
    random_state=RANDOM_STATE,
)

# Инициализация мультилейбл классификатора
multi_target_classifier_e9 = MultiOutputClassifier(base_classifier_e9, n_jobs=-1)

# Обучение модели
multi_target_classifier_e9.fit(X_train, y_train)

In [38]:
y_pred_multi_target_e9 = multi_target_classifier_e9.predict(X_test)

In [39]:
y_proba_multi_target_e9 = multi_target_classifier_e9.predict_proba(X_test)

In [40]:
print(classification_report(y_test, y_pred_multi_target_e9))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91     33340
           1       0.19      0.00      0.00      2075
           2       0.00      0.00      0.00       206
           3       0.47      0.09      0.15      3699
           4       0.46      0.10      0.16      1712
           5       0.09      0.00      0.01      1137
           6       0.41      0.11      0.17       461

   micro avg       0.85      0.76      0.80     42630
   macro avg       0.35      0.18      0.20     42630
weighted avg       0.75      0.76      0.73     42630
 samples avg       0.77      0.77      0.77     42630



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Показатели снизились, что ожидаемо. Оценим работу с расчетным GOE от Модели № 2.

## С расчетным GOE

In [41]:
data_e10 = final_elements_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        # "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        # "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        # "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
        "element",
        # "attr_element",
        "prev_element",
        # "attr_prev_element",
        "next_element",
        # "attr_next_element",
        "single_element",
        # "clear_prev_element",
        # "clear_next_element",
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
        "perfect_attr_prev_element",
        "q_attr_prev_element",
        "e_attr_prev_element",
        "l_attr_prev_element",
        "ll_attr_prev_element",
        "h_attr_prev_element",
        "v_attr_prev_element",
        "perfect_attr_next_element",
        "q_attr_next_element",
        "e_attr_next_element",
        "l_attr_next_element",
        "ll_attr_next_element",
        "h_attr_next_element",
        "v_attr_next_element",
    ]
].copy()
data_e10.shape

(168745, 50)

In [42]:
data_e10.duplicated(keep="first").sum()

1509

### Разделение данных

In [43]:
X = data_e10.drop(
    [
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
    ],
    axis=1,
)
y = data_e10[
    [
        "perfect_attr_element",
        "q_attr_element",
        "e_attr_element",
        "l_attr_element",
        "ll_attr_element",
        "h_attr_element",
        "v_attr_element",
    ]
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(126558, 43)
(126558, 7)
(42187, 43)
(42187, 7)


### Моделирование

In [44]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    # "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    # "cascade",
    # "title_nlp",
    # "cascade_nlp",
    "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    # "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
    "element",
    # "attr_element",
    "prev_element",
    # "attr_prev_element",
    "next_element",
    # "attr_next_element",
    "single_element",
    # "clear_prev_element",
    # "clear_next_element",
    # "perfect_attr_element",
    # "q_attr_element",
    # "e_attr_element",
    # "l_attr_element",
    # "ll_attr_element",
    # "h_attr_element",
    # "v_attr_element",
    "perfect_attr_prev_element",
    "q_attr_prev_element",
    "e_attr_prev_element",
    "l_attr_prev_element",
    "ll_attr_prev_element",
    "h_attr_prev_element",
    "v_attr_prev_element",
    "perfect_attr_next_element",
    "q_attr_next_element",
    "e_attr_next_element",
    "l_attr_next_element",
    "ll_attr_next_element",
    "h_attr_next_element",
    "v_attr_next_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

In [45]:
# Инициализация базового классификатора

base_classifier_e10 = CatBoostClassifier(
    verbose=50,
    n_estimators=50,
    cat_features=category_columns,
    random_state=RANDOM_STATE,
)

# Инициализация мультилейбл классификатора
multi_target_classifier_e10 = MultiOutputClassifier(base_classifier_e10, n_jobs=-1)

# Обучение модели
multi_target_classifier_e10.fit(X_train, y_train)

In [47]:
X_test["goe"] = results_model_two

In [50]:
y_pred_multi_target_e10 = multi_target_classifier_e10.predict(X_test)

In [51]:
y_proba_multi_target_e10 = multi_target_classifier_e10.predict_proba(X_test)

In [53]:
print(classification_report(y_test, y_pred_multi_target_e10))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     33340
           1       0.11      0.01      0.01      2075
           2       0.25      0.00      0.01       206
           3       0.38      0.16      0.23      3699
           4       0.28      0.08      0.13      1712
           5       0.17      0.18      0.17      1137
           6       0.17      0.17      0.17       461

   micro avg       0.81      0.75      0.78     42630
   macro avg       0.32      0.22      0.23     42630
weighted avg       0.73      0.75      0.73     42630
 samples avg       0.75      0.76      0.75     42630



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
hamming_loss_score = hamming_loss(y_test, y_pred_multi_target_e10)
print(f"Hamming Loss: {hamming_loss_score}")

Hamming Loss: 0.061677768032806314


Показатели ухудшились. Необходимо улучшать качество предыдущих моделей.

Сохраним модель для использования.

In [None]:
# joblib.dump(multi_target_classifier_e8, '../models/model_three.joblib')

['temp/model_three.joblib']