# Импорт библиотек

In [5]:
import joblib
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [33]:
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    confusion_matrix,
    mean_squared_error,
    root_mean_squared_error,
)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

In [7]:
tqdm.pandas()
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
RANDOM_STATE = 1206

# Загрузка данных

## Справочная информация

In [8]:
file_path = "../data/mats/base_scores_dict.joblib"

if os.path.isfile(file_path):
    base_scores_dict = joblib.load(file_path)


In [9]:
base_scores_dict.head()

Unnamed: 0,item,base_score
0,1T,0.4
1,1S,0.4
2,1Lo,0.5
3,1Eu,0.5
4,1F,0.5


In [10]:
file_path = "../data/mats/nlp_dict.joblib"

if os.path.isfile(file_path):
    nlp_dict = joblib.load(file_path)


In [11]:
nlp_dict.head()

Unnamed: 0,item,nlp_item,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9
0,1T,Single Toeloop,3.158019,-1.617936,0.396813,-1.733884,1.948211,0.76528,-4.662311,2.644467,1.162508,0.099441
1,1S,Single Salchow,3.601343,-2.698863,-0.136379,-0.407949,-1.530264,-2.107504,-2.9064,1.774684,0.18298,-3.096374
2,1Lo,Single Loop,5.122801,-4.158421,-2.667444,0.345399,-0.499262,0.415096,-2.308891,3.640449,-1.090383,2.599226
3,1Eu,Single Euler (only used in jump combinations),0.762735,-3.136916,-0.003426,0.913632,-0.219761,-0.314236,0.426446,-0.346271,2.761745,0.762287
4,1F,Single Flip,4.872679,-4.149351,-1.611438,0.783791,0.61482,-0.363553,-3.880303,1.822208,-0.969,-0.667879


## Данные для моделирования

In [12]:
file_path = "../data/processed/final_data.joblib"

if os.path.isfile(file_path):
    final_data = joblib.load(file_path)


In [13]:
final_data.shape

(136126, 62)

In [14]:
final_data.columns

Index(['id', 'total_score_id', 'title', 'decrease', 'base_score', 'goe',
       'avg_score', 'unit_id', 'tournament_id', 'base_score_total_scores',
       'components_score', 'total_score', 'elements_score',
       'decreasings_score', 'starting_place', 'place', 'segment_name', 'info',
       'overall_place', 'overall_total_score', 'overall_place_str', 'color',
       'school_id', 'date_start', 'date_end', 'origin_id', 'sequences',
       'cascade', 'title_nlp', 'cascade_nlp', 'multiply',
       'tournament_duration', 'start_month', 'end_month', 'start_day_of_week',
       'end_day_of_week', 'start_is_weekend', 'end_is_weekend', 'start_season',
       'end_season', 'tournament_year', 'units_with_experience', 'falls',
       'components_score_per_element', 'custom_base_score',
       'avg_overall_place_last_year', 'avg_overall_total_score_last_year',
       'avg_components_score_last_year', 'avg_place_last_year',
       'avg_elements_score_last_year', 'avg_decreasings_score_last_year',


In [15]:
file_path = "../data/processed/final_elements_data.joblib"

if os.path.isfile(file_path):
    final_elements_data = joblib.load(file_path)


In [16]:
final_elements_data.shape

(168745, 92)

## Валидационные данные

In [17]:
file_path = "../data/processed/valid_unit_data.joblib"

if os.path.isfile(file_path):
    valid_unit_data = joblib.load(file_path)


In [18]:
file_path = "../data/processed/test_unit_data.joblib"

if os.path.isfile(file_path):
    test_unit_data = joblib.load(file_path)


In [19]:
file_path = "../data/processed/final_test_unit_data.joblib"

if os.path.isfile(file_path):
    final_test_unit_data = joblib.load(file_path)


In [20]:
file_path = "../data/processed/valid_unit_elements_data.joblib"

if os.path.isfile(file_path):
    valid_unit_elements_data = joblib.load(file_path)


In [21]:
file_path = "../data/processed/test_unit_elements_data.joblib"

if os.path.isfile(file_path):
    test_unit_elements_data = joblib.load(file_path)


In [22]:
file_path = "../data/processed/final_test_unit_elements_data.joblib"

if os.path.isfile(file_path):
    final_test_unit_elements_data = joblib.load(file_path)


# Описание идеи

**Вторая модель должна предсказывать значение goe для элемента с ошибкой.**

В случае работы с наборами элементов (последовательности, каскады) будет предсказываться значение GOE для всего набора (дата-сет final_data).

В случае работы поэлементно будет предсказываться значение GOE с учетом его "расположения" в наборе (для этого введены признаки "следующий" и "предыдущий" элемент). 

Результаты работы модели (предсказания) будут использованы в предсказании типов ошибок с использованием третьей модели.

В текущей работе решается задачи регрессии.

# Моделирование

## Наборный подход - дата-сет `final_data`

In [23]:
# В данной модели раскомментируем признак GOe
data_e4 = final_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        # "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
    ]
].copy()
data_e4.shape


(136126, 27)

In [24]:
data_e4.duplicated(keep="first").sum()

736

### Разделение данных

In [25]:
X = data_e4.drop("goe", axis=1)
y = data_e4["goe"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(102094, 26)
(102094,)
(34032, 26)
(34032,)


### Моделирование

In [26]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    "cascade",
    # "title_nlp",
    # "cascade_nlp",
    # "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    # "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

#### Линейная регрессия

In [34]:
preprocessor_lr = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore"), ohe_columns),
        ("scaler", StandardScaler(), scale_columns),
    ]
)

model_lr = make_pipeline(preprocessor_lr, LinearRegression())


In [35]:
tscv = TimeSeriesSplit(n_splits=3)

In [37]:
cv_result = cross_val_score(
    model_lr,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................




[CV] END ............................... score: (test=-0.628) total time=   1.5s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    1.5s
[CV] START .....................................................................




[CV] END ............................... score: (test=-0.608) total time=   2.9s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    4.5s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.682) total time=   4.5s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    9.1s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    9.1s




In [38]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{cv_result.std():.2f}")


Среднее значение метрики: 0.64
Стандартное отклонение предсказаний: 0.03


#### KNeighborsRegressor

In [40]:
preprocessor_knn = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ohe_columns,
        ),
        ("scaler", StandardScaler(), scale_columns),
    ]
)

model_knn = make_pipeline(preprocessor_knn, KNeighborsRegressor())


In [41]:
cv_result = cross_val_score(
    model_knn,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................
[CV] END ............................... score: (test=-0.786) total time=   0.9s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    1.0s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.738) total time=   1.5s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    2.6s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.827) total time=   2.2s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    4.9s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    4.9s


In [42]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{cv_result.std():.2f}")


Среднее значение метрики: 0.78
Стандартное отклонение предсказаний: 0.04


#### RandomForestRegressor

In [44]:
preprocessor_rfr = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ohe_columns,
        ),
        ("scaler", StandardScaler(), scale_columns),
    ]
)

model_rfr = make_pipeline(preprocessor_rfr, RandomForestRegressor())


In [45]:
cv_result = cross_val_score(
    model_rfr,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................
[CV] END ............................... score: (test=-0.554) total time=  26.1s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   26.1s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.550) total time= 1.0min
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:  1.4min
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.617) total time= 1.6min
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:  3.1min
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:  3.1min


In [48]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{cv_result.std():.2f}")


Среднее значение метрики: 0.57
Стандартное отклонение предсказаний: 0.03


#### CatBoost

In [49]:
model_e4_ctr = CatBoostRegressor(
    verbose=50,
    n_estimators=150,
    cat_features=category_columns,
    random_state=RANDOM_STATE,
)


In [50]:
cv_result = cross_val_score(
    model_e4_ctr,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................
Learning rate set to 0.319388
0:	learn: 0.8642599	total: 192ms	remaining: 28.7s
50:	learn: 0.6196413	total: 2.56s	remaining: 4.97s
100:	learn: 0.5912465	total: 4.94s	remaining: 2.4s
149:	learn: 0.5705232	total: 7.31s	remaining: 0us
[CV] END ............................... score: (test=-0.557) total time=   7.7s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    7.7s
[CV] START .....................................................................
Learning rate set to 0.356351
0:	learn: 0.7767365	total: 58ms	remaining: 8.64s
50:	learn: 0.5710224	total: 2.71s	remaining: 5.27s
100:	learn: 0.5486285	total: 5.32s	remaining: 2.58s
149:	learn: 0.5308481	total: 8.12s	remaining: 0us
[CV] END ............................... score: (test=-0.551) total time=   8.6s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:   16.4s
[CV] START .....................................................................
Learning 

In [51]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{(-1)*cv_result.std():.2f}")


Среднее значение метрики: 0.57
Стандартное отклонение предсказаний: -0.03


#### Тестрирование

Для тестирования выберем KNNRegressor

In [63]:
model_knn.fit(X_train, y_train)


In [64]:
y_pred = model_knn.predict(X_test)

In [65]:
rmse_knn = root_mean_squared_error(y_test, y_pred)
rmse_knn


0.5863341123278639

## Элементный подход - дата-сет `final_elements_data`

In [72]:
data_e5_1 = final_elements_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        # "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        # "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
        "element",
        # "attr_element",
        "prev_element",
        "attr_prev_element",
        "next_element",
        "attr_next_element",
        "single_element",
        "clear_prev_element",
        "clear_next_element",
        # "perfect_attr_element",
        # "q_attr_element",
        # "e_attr_element",
        # "l_attr_element",
        # "ll_attr_element",
        # "h_attr_element",
        # "v_attr_element",
        # "perfect_attr_prev_element",
        # "q_attr_prev_element",
        # "e_attr_prev_element",
        # "l_attr_prev_element",
        # "ll_attr_prev_element",
        # "h_attr_prev_element",
        # "v_attr_prev_element",
        # "perfect_attr_next_element",
        # "q_attr_next_element",
        # "e_attr_next_element",
        # "l_attr_next_element",
        # "ll_attr_next_element",
        # "h_attr_next_element",
        # "v_attr_next_element",
    ]
].copy()
data_e5_1.shape

(168745, 34)

In [73]:
data_e5_1.duplicated(keep="first").sum()

828

### Разделение данных

In [74]:
X = data_e5_1.drop("goe", axis=1)
y = data_e5_1["goe"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(126558, 33)
(126558,)
(42187, 33)
(42187,)


### Моделирование

In [75]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    # "cascade",
    # "title_nlp",
    # "cascade_nlp",
    # "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    # "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
    "element",
    # "attr_element",
    "prev_element",
    "attr_prev_element",
    "next_element",
    "attr_next_element",
    "single_element",
    "clear_prev_element",
    "clear_next_element",
    # "perfect_attr_element",
    # "q_attr_element",
    # "e_attr_element",
    # "l_attr_element",
    # "ll_attr_element",
    # "h_attr_element",
    # "v_attr_element",
    # "perfect_attr_prev_element",
    # "q_attr_prev_element",
    # "e_attr_prev_element",
    # "l_attr_prev_element",
    # "ll_attr_prev_element",
    # "h_attr_prev_element",
    # "v_attr_prev_element",
    # "perfect_attr_next_element",
    # "q_attr_next_element",
    # "e_attr_next_element",
    # "l_attr_next_element",
    # "ll_attr_next_element",
    # "h_attr_next_element",
    # "v_attr_next_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

#### KNeighborsRegressor

In [76]:
preprocessor_knn = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ohe_columns,
        ),
        ("scaler", StandardScaler(), scale_columns),
    ]
)

model_knn = make_pipeline(preprocessor_knn, KNeighborsRegressor())


In [77]:
cv_result = cross_val_score(
    model_knn,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................
[CV] END ............................... score: (test=-0.824) total time=   1.4s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    1.4s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.747) total time=   2.5s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    4.0s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.850) total time=   3.7s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    7.8s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    7.8s


In [None]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{(-1)*cv_result.std():.2f}")


Среднее значение метрики: 0.57
Стандартное отклонение предсказаний: -0.03


#### Тестрирование

Для тестирования выберем KNNRegressor

In [None]:
model_knn.fit(X_train, y_train)


In [None]:
y_pred = model_knn.predict(X_test)

In [None]:
rmse_knn = root_mean_squared_error(y_test, y_pred)
rmse_knn


0.5863341123278639

## Наборный подход c перфектом - дата-сет `final_data`

Попробуем добавить признак, который, фактически может являться утечкой данных - target_clear. Если элемент выполнен идеально, то и GOE будет выше. Если наоборот - то ниже. Однако, в реальной работе модели мы этот параметр будем предсказывать первой моделью. В этой случае утечки не будет.

In [82]:
data_e6 = final_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
    ]
].copy()
data_e6.shape

(136126, 28)

In [83]:
data_e6.duplicated(keep="first").sum()

714

### Разделение данных

In [84]:
X = data_e6.drop("goe", axis=1)
y = data_e6["goe"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(102094, 27)
(102094,)
(34032, 27)
(34032,)


### Моделирование

In [85]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    "cascade",
    # "title_nlp",
    # "cascade_nlp",
    # "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

#### KNeighborsRegressor

In [90]:
preprocessor_knn = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ohe_columns,
        ),
        ("scaler", StandardScaler(), scale_columns),
    ]
)

model_knn = make_pipeline(preprocessor_knn, KNeighborsRegressor())


In [91]:
cv_result = cross_val_score(
    model_knn,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................
[CV] END ............................... score: (test=-0.775) total time=   0.8s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.8s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.723) total time=   1.6s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    2.5s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.809) total time=   2.2s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    4.8s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    4.8s


In [92]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{(-1)*cv_result.std():.2f}")


Среднее значение метрики: 0.77
Стандартное отклонение предсказаний: -0.04


#### Тестрирование

Для тестирования выберем KNNRegressor

In [93]:
model_knn.fit(X_train, y_train)


In [94]:
y_pred = model_knn.predict(X_test)

In [95]:
rmse_knn = root_mean_squared_error(y_test, y_pred)
rmse_knn


0.5655360846131128

## Элементный подход c перфектом - дата-сет `final_elements_data`

In [96]:
data_e7_1 = final_elements_data[
    [
        # "id",
        # "total_score_id",
        # "title",
        # "decrease",
        # "base_score",
        "goe",
        # "avg_score",
        # "unit_id",
        # "tournament_id",
        # "base_score_total_scores",
        # "components_score",
        # "total_score",
        # "elements_score",
        # "decreasings_score",
        # "starting_place",
        # "place",
        "segment_name",
        # "info",
        # "overall_place",
        # "overall_total_score",
        # "overall_place_str",
        "color",
        "school_id",
        # "date_start",
        # "date_end",
        "origin_id",
        # "sequences",
        # "cascade",
        # "title_nlp",
        # "cascade_nlp",
        "multiply",
        "tournament_duration",
        "start_month",
        "end_month",
        "start_day_of_week",
        "end_day_of_week",
        "start_is_weekend",
        "end_is_weekend",
        "start_season",
        "end_season",
        "tournament_year",
        # "units_with_experience",
        "falls",
        # "components_score_per_element",
        # "custom_base_score",
        "avg_overall_place_last_year",
        "avg_overall_total_score_last_year",
        "avg_components_score_last_year",
        "avg_place_last_year",
        "avg_elements_score_last_year",
        "avg_decreasings_score_last_year",
        "avg_total_score_last_year",
        "avg_falls_last_year",
        "target_clear_element",
        "difficulty",
        # "perfect_element",
        # "q_element",
        # "e_element",
        # "l_element",
        # "ll_element",
        # "h_element",
        # "v_element",
        "element",
        # "attr_element",
        "prev_element",
        "attr_prev_element",
        "next_element",
        "attr_next_element",
        "single_element",
        "clear_prev_element",
        "clear_next_element",
        # "perfect_attr_element",
        # "q_attr_element",
        # "e_attr_element",
        # "l_attr_element",
        # "ll_attr_element",
        # "h_attr_element",
        # "v_attr_element",
        # "perfect_attr_prev_element",
        # "q_attr_prev_element",
        # "e_attr_prev_element",
        # "l_attr_prev_element",
        # "ll_attr_prev_element",
        # "h_attr_prev_element",
        # "v_attr_prev_element",
        # "perfect_attr_next_element",
        # "q_attr_next_element",
        # "e_attr_next_element",
        # "l_attr_next_element",
        # "ll_attr_next_element",
        # "h_attr_next_element",
        # "v_attr_next_element",
    ]
].copy()
data_e7_1.shape

(168745, 35)

In [97]:
data_e7_1.duplicated(keep="first").sum()

803

### Разделение данных

In [98]:
X = data_e7_1.drop("goe", axis=1)
y = data_e7_1["goe"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=1206
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(126558, 34)
(126558,)
(42187, 34)
(42187,)


### Моделирование

In [99]:
category_columns = [
    # "id",
    # "total_score_id",
    # "title",
    # "decrease",
    # "base_score",
    # "goe",
    # "avg_score",
    # "unit_id",
    # "tournament_id",
    # "base_score_total_scores",
    # "components_score",
    # "total_score",
    # "elements_score",
    # "decreasings_score",
    # "starting_place",
    # "place",
    "segment_name",
    # "info",
    # "overall_place",
    # "overall_total_score",
    # "overall_place_str",
    "color",
    "school_id",
    # "date_start",
    # "date_end",
    "origin_id",
    # "sequences",
    # "cascade",
    # "title_nlp",
    # "cascade_nlp",
    # "multiply",
    # "tournament_duration",
    "start_month",
    "end_month",
    "start_day_of_week",
    "end_day_of_week",
    "start_is_weekend",
    "end_is_weekend",
    "start_season",
    "end_season",
    "tournament_year",
    # "units_with_experience",
    "falls",
    # "components_score_per_element",
    # # "custom_base_score",
    # "avg_overall_place_last_year",
    # "avg_overall_total_score_last_year",
    # "avg_components_score_last_year",
    # "avg_place_last_year",
    # "avg_elements_score_last_year",
    # "avg_decreasings_score_last_year",
    # "avg_total_score_last_year",
    # "avg_falls_last_year",
    "target_clear_element",
    # "difficulty",
    # "perfect_element",
    # "q_element",
    # "e_element",
    # "l_element",
    # "ll_element",
    # "h_element",
    # "v_element",
    "element",
    # "attr_element",
    "prev_element",
    "attr_prev_element",
    "next_element",
    "attr_next_element",
    "single_element",
    "clear_prev_element",
    "clear_next_element",
    # "perfect_attr_element",
    # "q_attr_element",
    # "e_attr_element",
    # "l_attr_element",
    # "ll_attr_element",
    # "h_attr_element",
    # "v_attr_element",
    # "perfect_attr_prev_element",
    # "q_attr_prev_element",
    # "e_attr_prev_element",
    # "l_attr_prev_element",
    # "ll_attr_prev_element",
    # "h_attr_prev_element",
    # "v_attr_prev_element",
    # "perfect_attr_next_element",
    # "q_attr_next_element",
    # "e_attr_next_element",
    # "l_attr_next_element",
    # "ll_attr_next_element",
    # "h_attr_next_element",
    # "v_attr_next_element",
]

ohe_columns = [x for x in category_columns if x in X.columns]
scale_columns = [x for x in X.columns if x not in category_columns]

#### KNeighborsRegressor

In [100]:
preprocessor_knn = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ohe_columns,
        ),
        ("scaler", StandardScaler(), scale_columns),
    ]
)

model_knn = make_pipeline(preprocessor_knn, KNeighborsRegressor())


In [101]:
cv_result = cross_val_score(
    model_knn,
    X_train,
    y_train,
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    verbose=500,
)


[CV] START .....................................................................
[CV] END ............................... score: (test=-0.808) total time=   1.5s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    1.5s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.730) total time=   2.5s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    4.1s
[CV] START .....................................................................
[CV] END ............................... score: (test=-0.830) total time=   3.9s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    8.1s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    8.1s


In [102]:
print("Среднее значение метрики: " f"{(-1)*cv_result.mean():.2f}")
print("Стандартное отклонение предсказаний: " f"{(-1)*cv_result.std():.2f}")


Среднее значение метрики: 0.79
Стандартное отклонение предсказаний: -0.04


#### Тестрирование

Для тестирования выберем KNNRegressor

In [103]:
model_knn.fit(X_train, y_train)


In [104]:
y_pred = model_knn.predict(X_test)

In [105]:
rmse_knn = root_mean_squared_error(y_test, y_pred)
rmse_knn


0.5701795153593854

In [None]:
# joblib.dump(y_pred, "../data/temp/results_model_two.joblib")

['temp/results_model_two.joblib']

In [65]:
# joblib.dump(model_knn, "../models/model_two.joblib")

['temp/model_two_7.joblib']

# Вывод по разработке "Модели № 2"