# Baseline

In [None]:
!pip install numba
!pip install catboost

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

## Загрузка данных

Обозначение категориальных и числовых признаков

In [8]:
# Load the raw train / test tables.
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")

# Columns that are treated as categorical features.
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

# Feature groups are derived from the training frame and applied to both frames.
numerical_features = train_df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_df.select_dtypes(include=['category']).columns

# Numeric gaps: impute with medians learned on the training data only, so the
# test set is transformed with exactly the same statistics as the training set.
train_medians = train_df[numerical_features].median()
train_df[numerical_features] = train_df[numerical_features].fillna(train_medians)
test_df[numerical_features] = test_df[numerical_features].fillna(train_medians)

# Categorical gaps: impute with the most frequent training value of each column.
# `.mode()` returns a DataFrame (one row per tied mode); `.iloc[0]` turns it into a
# per-column Series so that fillna matches by column name instead of by row index.
train_modes = train_df[categorical_features].mode().iloc[0]
for frame in (train_df, test_df):
    # Fill BEFORE casting to str: astype(str) would turn NaN into the literal
    # string "nan", after which fillna has nothing left to fill.
    # Going through object dtype allows values outside the frame's own categories.
    filled = frame[categorical_features].astype(object).fillna(train_modes)
    frame[categorical_features] = filled.astype(str).astype("category")

Создаем выборки для валидации и обучения

In [None]:
# Target and feature matrix; identifier, date and target columns are not features.
y = train_df["end_cluster"]
X = train_df.drop(columns=['id', 'date', 'end_cluster'])

# Absolute pairwise correlations between the int64 / float64 columns.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
corr_matrix = X[numerical_features].corr().abs()

# Collect every pair below the diagonal whose absolute correlation exceeds 0.95.
corr_columns = corr_matrix.columns
high_corr = [
    (corr_columns[row], corr_columns[col], corr_matrix.iloc[row, col])
    for row in range(len(corr_columns))
    for col in range(row)
    if corr_matrix.iloc[row, col] > 0.95
]

# From each correlated pair, discard the member with the smaller variance
# (the second member is discarded when the first one's variance is not smaller).
to_drop = {
    first if X[first].var() < X[second].var() else second
    for first, second, _ in high_corr
}

df_filtered = X.drop(columns=to_drop)

df_filtered.info()

In [10]:
# Rebind X to the feature frame with the highly correlated columns removed.
X = df_filtered

In [11]:
# Hold out 20% of the rows for validation.
# The split is stratified on the target so that every class is represented in the
# validation part; per-class ROC AUC cannot be computed for a class that is absent.
x_train, x_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Feature groups of the filtered frame (used when configuring the model).
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['category']).columns

In [None]:
# Un-normalised per-cluster weights, keyed by the "cluster" column of the spreadsheet.
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights.loc[:, "unnorm_weight"].to_dict()
weights_dict

## Обучение модели

Подбор наилучших параметров через GridSearch

In [None]:
# from catboost import CatBoostClassifier, Pool
# from sklearn.model_selection import GridSearchCV
#
# model = CatBoostClassifier(
#     iterations=1000,
#     early_stopping_rounds=100,
#     cat_features=list(categorical_features),
#     random_seed=42,
#     eval_metric='AUC',
#     verbose=10,
#     task_type='GPU'
# )
#
# param_grid = {
#     "depth": [4, 6, 8, 10],
#     "learning_rate": [0.1, 0.3, 0.5],
#     "l2_leaf_reg": [1, 3, 5]
# }
#
# grid_search = GridSearchCV(model, param_grid, cv=2, verbose=10)
# grid_search.fit(x_train, y_train)
#
# print("Лучшие параметры:", grid_search.best_params_)

Обучение модели CatBoost

In [None]:
# Best parameters reported earlier: {'depth': 10, 'l2_leaf_reg': 1, 'learning_rate': 0.3}
# NOTE(review): the learning rate used below (0.319) differs slightly from that value.

model = CatBoostClassifier(
    # tree / regularisation settings
    depth=10,
    l2_leaf_reg=1,
    learning_rate=0.319,
    # training length and early stopping on the eval set
    iterations=1000,
    early_stopping_rounds=100,
    # data description and evaluation
    cat_features=list(categorical_features),
    eval_metric='AUC',
    # run-time settings
    task_type='GPU',
    random_seed=42,
    verbose=10,
)

model.fit(x_train, y_train, eval_set=(x_val, y_val), plot=True)

Зададим функцию для взвешенной метрики roc auc

In [None]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted sum of per-class one-vs-rest ROC AUC scores.

    Args:
        y_true: true class labels.
        y_pred: per-class scores passed straight to ``roc_auc_score``
            (presumably one column per entry of ``labels``, in the same order).
        labels: class labels; each must be a key of ``weights_dict``.
        weights_dict: mapping label -> un-normalised weight.

    Returns:
        The per-class ROC AUC values combined with the weights of ``labels``
        after those weights are normalised to sum to one.

    Raises:
        KeyError: if a label has no entry in ``weights_dict``.
    """
    raw_weights = np.array([weights_dict[name] for name in labels])
    normalized = raw_weights / raw_weights.sum()
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        average=None,
        multi_class="ovr",
        labels=labels,
    )
    return sum(normalized * per_class_auc)
# Class probabilities for the validation split.
y_pred_proba = model.predict_proba(x_val)
# Weighted ROC AUC on the validation split, with the model's classes in sorted order.
weighted_roc_auc(
    y_true=y_val,
    y_pred=y_pred_proba,
    labels=sorted(model.classes_),
    weights_dict=weights_dict,
)

Проверка работы модели

Предсказываем стартовый кластер (`start_cluster`)


In [None]:
# Features / target for the auxiliary model that predicts start_cluster,
# built from all training rows.
X1 = train_df.drop(['id', 'date', 'end_cluster', 'start_cluster'], axis=1)
y1 = train_df["start_cluster"]

# From the test set keep only the rows outside "month_6", i.e. the rows
# that are used here as having a start cluster.
X2 = test_df[test_df["date"] != "month_6"]
y2 = X2["start_cluster"]
X2 = X2.drop(["id", "date", "start_cluster"], axis=1)

# Stack the training rows and the usable test rows into one dataset.
X_combined = pd.concat([X1, X2], ignore_index=True, axis=0)
y_combined = pd.concat([y1, y2], ignore_index=True, axis=0)
# Call info(): the bare attribute only displays the bound method, not the summary.
X_combined.info()

In [16]:
# Feature groups of the combined start_cluster dataset.
# NOTE(review): this rebinds `numerical_features` / `categorical_features`, the same
# names that were used to configure the end_cluster model; re-running that model cell
# after this one would pick up these values instead — confirm this is intended.
numerical_features = X_combined.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_combined.select_dtypes(include=['category']).columns


In [17]:
# 80/20 train/validation split of the combined start_cluster dataset.
x_train1, x_val1, y_train1, y_val1 = train_test_split(
    X_combined,
    y_combined,
    random_state=19,
    test_size=0.2,
)

Второй GridSearch

In [None]:
#from catboost import CatBoostClassifier, Pool
#from sklearn.model_selection import GridSearchCV
#
#categorical_features2 = X1.select_dtypes(include=['category']).columns
#
#model_for_start_cluster = CatBoostClassifier(
#    iterations=1000,
#    early_stopping_rounds=100,
#    cat_features=list(categorical_features2),
#    eval_metric='Accuracy',
#    #verbose=20,
#    random_seed=42,
#    task_type='GPU'
#)
#
#param_grid = {
#    "depth": [6, 8, 10, 12],
#    "learning_rate": [0.1, 0.3, 0.5],
#    "l2_leaf_reg": [1, 3, 5]
#}
#
#grid_search = GridSearchCV(model_for_start_cluster, param_grid, cv=2, verbose=10)
#grid_search.fit(x_train1, y_train1)
#
#print("Лучшие параметры:", grid_search.best_params_)

In [None]:
# Categorical columns are taken from X1 (the training part of the start_cluster features).
categorical_features2 = X1.select_dtypes(include=['category']).columns

# Auxiliary classifier that predicts start_cluster; monitored by accuracy on the eval set.
model_for_start_cluster = CatBoostClassifier(
    # tree / regularisation settings
    depth=12,
    l2_leaf_reg=1,
    learning_rate=0.3,
    # training length and early stopping on the eval set
    iterations=1000,
    early_stopping_rounds=100,
    # data description and evaluation
    cat_features=list(categorical_features2),
    eval_metric='Accuracy',
    # run-time settings
    task_type='GPU',
    random_seed=42,
    verbose=20,
)
model_for_start_cluster.fit(
    x_train1,
    y_train1,
    eval_set=(x_val1, y_val1),
    plot=True,
)

In [None]:
# accuracy_score / f1_score are imported in the notebook's import cell.
# Predicted start_cluster labels for the validation split.
start_cluster_y_pred = model_for_start_cluster.predict(x_val1)

# Accuracy, then macro-averaged F1, of the start_cluster model on validation.
print(accuracy_score(y_val1, start_cluster_y_pred))
print(f1_score(y_val1, start_cluster_y_pred, average='macro'))

## Прогноз на тестовой выборке

In [None]:
# Preview, for the first 3 ids, the start_cluster value recorded under each date.
test_df.pivot(index="id", columns="date", values="start_cluster").head(3)

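Для того чтобы сделать прогноз на тестовой выборке, нужно заполнить стартовый кластер. <br>
В базовом подходе все пропущенные стартовые кластеры заполнялись самым популярным кластером; в этом ноутбуке вместо этого стартовый кластер для последнего месяца предсказывается моделью `model_for_start_cluster`.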

In [20]:
# Cast start_cluster to the pandas category dtype.
test_df["start_cluster"] = test_df["start_cluster"].astype("category")

In [None]:
# Frequency of each start_cluster value in the test set.
test_df['start_cluster'].value_counts()

In [23]:
# Submission template; its class columns are later overwritten with predicted probabilities.
sample_submission_df = pd.read_csv("sample_submission.csv")

Для тестовой выборки будем использовать только последний месяц

In [None]:
# Keep only the "month_6" rows of the test set and remove the non-feature columns.
# The columns in `to_drop` are deliberately kept at this point: the start_cluster
# model is fed the unfiltered feature set, and they are removed in a later cell.
last_m_test_df = test_df.loc[test_df["date"] == "month_6"].drop(columns=["id", "date"])
last_m_test_df['start_cluster']

In [None]:
# Inputs of the start_cluster model: every remaining column except start_cluster itself
# (the columns in `to_drop` are intentionally still present here).
Xlm = last_m_test_df.drop(columns="start_cluster")
# Overwrite start_cluster with the model's prediction, flattened to one value per row.
last_m_test_df['start_cluster'] = model_for_start_cluster.predict(Xlm).flatten()
last_m_test_df['start_cluster']

Сохраняем тестовую выборку с предсказанным стартовым кластером

In [None]:
# last_m_test_df['start_cluster'].to_csv('lmtdfHugeSuperImba.csv')

Читаем из сохранения

In [None]:
# stcl = pd.read_csv('lmtdfHuge.csv')['start_cluster']
# last_m_test_df['start_cluster'] = stcl.to_numpy()
# last_m_test_df['start_cluster']

In [None]:
# Column dtypes and non-null counts of the last-month test frame.
last_m_test_df.info()

In [27]:
# Remove the highly correlated columns that were excluded from the end_cluster model.
# errors="ignore" keeps this cell idempotent: re-running it after the columns are
# already gone would otherwise raise a KeyError.
last_m_test_df = last_m_test_df.drop(columns=to_drop, errors="ignore")

In [28]:
# Class probabilities for the last-month test rows, one column per model class.
test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(data=test_pred_proba, columns=model.classes_)
# Put the class columns into sorted order.
sorted_classes = sorted(test_pred_proba_df.columns)
test_pred_proba_df = test_pred_proba_df.loc[:, sorted_classes]

In [None]:
# (number of rows, number of classes) of the prediction table.
test_pred_proba_df.shape

In [None]:
# First two rows of the predicted class probabilities.
test_pred_proba_df.head(2)

In [29]:
# Assigning a DataFrame aligns on the row index, so a row-count mismatch would
# silently produce NaN / dropped rows; fail loudly instead and copy values by position.
if len(test_pred_proba_df) != len(sample_submission_df):
    raise ValueError(
        f"Prediction rows ({len(test_pred_proba_df)}) do not match "
        f"submission rows ({len(sample_submission_df)})"
    )
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as the
# "month_6" rows of test_df — ids were dropped before prediction, so this is not checked.
sample_submission_df[sorted_classes] = test_pred_proba_df[sorted_classes].to_numpy()
sample_submission_df.to_csv("submission.csv", index=False)

In [None]:
# важность признаков
feature_importance = model.get_feature_importance()
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df[importance_df['importance'] < 1]
importance_df = importance_df.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 14))
sns.barplot(x='importance', y='feature', data=importance_df)
plt.title('CatBoost Feature Importance', fontsize=15)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.tight_layout()
plt.show()