# Imports and infrastructure

In [None]:
# !pip install --upgrade transformers==4.49.0
# !pip install optuna==2.10.0
# !pip install numpy==1.26.4 gensim==4.3.2
# !pip install scipy==1.12.0
# !pip install --upgrade pandas==2.2.2
# !pip install h3
# !pip install mlflow
# !pip install 'protobuf<4'
# !pip install selenium
# !pip install natasha
# !pip install pymystem3
# !pip install symspellpy
!rm -rf /content/Price-prediction-with-textual-data

In [None]:
!git clone https://github.com/anna-k-00/Price-prediction-with-textual-data.git

In [None]:
# Step 1: environment check and setup
import os
import sys
import importlib

# Step 2: clone the repository, or pull updates if it is already present
repo_url = 'https://github.com/anna-k-00/Price-prediction-with-textual-data.git'
repo_dir = 'Price-prediction-with-textual-data'

if not os.path.exists(repo_dir):
    !git clone {repo_url}
else:
    !cd {repo_dir} && git pull

# Step 3: add all required paths to sys.path
# NOTE(review): repo_dir above is checked relative to the current working
# directory, while the paths below are absolute under /content — confirm the
# notebook is always run with /content as the working directory.
paths_to_add = [
    f'/content/{repo_dir}',                     # files in the repo root (parser_avito.py)
    f'/content/{repo_dir}/main_methods',        # main modules
    f'/content/{repo_dir}/embeddings_generation', # embedding generation
    f'/content/{repo_dir}/preprocessors'        # preprocessors
]

# Only existing directories that are not yet on sys.path are added (at the front).
for path in paths_to_add:
    if os.path.exists(path) and path not in sys.path:
        sys.path.insert(0, path)
        print(f'Добавлен путь: {path}')

# Step 4: list of every module to import
all_modules = [
    # Main modules
    'resource_monitor', 'ANN', 'predict', 'test_pipeline',

    # Modules from embeddings_generation
    'embeddings_generation.rubert_fine_tuning',
    'embeddings_generation.tfidf_generator',
    'embeddings_generation.w2v_generator',
    'embeddings_generation.gate',

    # Modules from preprocessors
    'preprocessors.preprocessor_params_hex',
    'preprocessors.preprocessor_text',

    # Stand-alone files in the repo root
    'parser_avito'
]

# Step 5: import every module, recording successes and failures separately
imported_modules = {}
failed_modules = {}

for module_name in all_modules:
    try:
        module = importlib.import_module(module_name)
        imported_modules[module_name] = module
        print(f'✅ {module_name} успешно импортирован')
    except Exception as e:
        # Best effort: a failed import is reported and stored, not re-raised.
        failed_modules[module_name] = str(e)
        print(f'❌ Ошибка импорта {module_name}: {str(e)[:200]}')  # truncate long messages

In [None]:
import logging
import os
import time
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import optuna
from optuna.samplers import TPESampler
from tqdm import tqdm
import mlflow
import mlflow.sklearn
import warnings
from transformers import AutoModel, AutoTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from sklearn.svm import LinearSVR
from sklearn.base import BaseEstimator, TransformerMixin
import random
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import issparse
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
import torch
import torch.nn as nn
import os
import joblib
import transformers
import json
import joblib
from datetime import datetime
from sklearn.decomposition import PCA
import time
import sys
import platform
import psutil
import threading

try:
    import pynvml
    pynvml.nvmlInit()
    HAS_NVML = True
except Exception:
    HAS_NVML = False

from ANN import ANNRegressor
from resource_monitor import ResourceMonitor
from test_pipeline import PricePredictionExperiment
from preprocessor_params_hex import DataProcessingPipeline

from tfidf_generator import TfidfTransformer
from w2v_generator import Word2VecTransformer
from rubert_fine_tuning import RuBertTiny2Embedder

In [None]:
# Mount Google Drive at /content/drive (force_remount=True is passed to drive.mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# mkdir -p "/content/drive/My Drive/mlflow_data_DROP"

In [None]:
# Set the MLflow tracking URI; replace the path below with your own MLflow directory.

mlflow.set_tracking_uri('file:///content/drive/My Drive/mlflow_data_DROP')

In [None]:
# Instantiate an MlflowClient with default arguments.
from mlflow.tracking import MlflowClient
client = MlflowClient()

# Анализ

Так как папку MLflow с большим количеством runs становится сложно итерировать для поиска нужных runs, а Colab запрещает поднимать локальные серверы для работы с интерфейсом, в центральном пайплайне эксперимента мы дублировали сохранение основных файлов экспериментов (метрики, предсказания, параметры, модели, трансформеры, токенизаторы) в локальные папки. Архивы всех тренировок доступны в папке https://drive.google.com/drive/folders/10uxDBjledOSIg6biJpLv6WgCMVQqzesT?usp=sharing

Results_pca - https://drive.google.com/drive/folders/1kFCVdfSFN3nQHjhctiebiFOmlbGzYdDW?usp=share_link - основной фолдер с результатами первых 28 экспериментов
manual_text_features_negative - https://drive.google.com/drive/folders/1cUMsJQWtKzEUzXahFsHnDw6O1lNZAnkp?usp=sharing - фолдер с результатами доп. экспериментов только с негативными ручными текстовыми признаками
manual_text_features - https://drive.google.com/drive/folders/1uEKoHcxSOnZeDUjzoudHBJ2Uk99dFW2e?usp=share_link - фолдер с результатами доп. экспериментов со всеми ручными текстовыми признаками
Доступы открыты, и для простоты аналитики мы будем брать результаты основных экспериментов за март оттуда.




самый простой способ получить доступ к нашему архиву с результатами - по ссылке 'https://drive.google.com/drive/folders/10uxDBjledOSIg6biJpLv6WgCMVQqzesT?usp=sharing'

через интерфейс Google Drive:

Перейдите по нашей ссылке, будучи авторизованными (доступы уже открыты)
Откройте Google Drive
Найдите папку в разделе "Доступно мне" (Shared with me)
Правой кнопкой мыши → "Добавить ярлык на Диск" (Add shortcut to Drive)
Выберите расположение в "Мой диск" → Нажмите "Добавить"

## Ручная систематизация

In [None]:
# import os
# import pandas as pd
# import json
# from datetime import datetime

# def process_model_metrics(model_short, model_long):
#     base_path = f"/content/drive/MyDrive/price_prediction_data/Results_pca/{model_short}/metrics"
#     output_path = f"/content/drive/MyDrive/price_prediction_data/Results_pca/{model_short}"

#     combinations = [
#         "categorical-only_none_pca_False_gate_False",
#         "mixed_rubert_pca_True_gate_False",
#         "mixed_w2v_pca_True_gate_False",
#         "mixed_tfidf_pca_True_gate_False",
#         "text-only_rubert_pca_True_gate_False",
#         "text-only_w2v_pca_True_gate_False",
#         "text-only_tfidf_pca_True_gate_False"
#     ]

#     for combo in combinations:
#         try:
#             # 1. Загрузка данных
#             summary_file = f"summary_metrics_price_prediction_{model_long}_{combo}.csv"
#             summary_path = os.path.join(base_path, summary_file)
#             cv_file = f"cv_metrics_price_prediction_{model_long}_{combo}.json"
#             cv_path = os.path.join(base_path, cv_file)
#             optuna_file = f"optuna_trials_price_prediction_{model_long}_{combo}.csv"
#             optuna_path = os.path.join(base_path, optuna_file)

#             if not all(os.path.exists(p) for p in [summary_path, cv_path, optuna_path]):
#                 print(f"Some files not found for {combo}")
#                 continue

#             summary_df = pd.read_csv(summary_path)
#             with open(cv_path, 'r') as f:
#                 cv_data = json.load(f)
#             optuna_df = pd.read_csv(optuna_path)

#             # 2. Создаем финальный DataFrame
#             final_df = pd.DataFrame()

#             # 3. Добавляем лучшие параметры
#             best_params = optuna_df.loc[optuna_df['value'].idxmax()].filter(regex='^params_')
#             for param, value in best_params.items():
#                 param_name = param.replace('params_', '').replace('__', '/')
#                 final_df = pd.concat([final_df, pd.DataFrame({
#                     'type': ['best_param'],
#                     'metric': [param_name],
#                     'value': [value],
#                     'mean': [None], 'std': [None], 'conf_interval': [None],
#                     **{f'fold_{i}': [None] for i in range(5)}  # 5 фолдов как в примере
#                 })], ignore_index=True)

#             # 4. Добавляем CV метрики в строгом порядке
#             cv_metrics_order = ['r2', 'rmse', 'smape', 'medape']
#             for scale in ['log', 'orig']:
#                 cv_key = f"{scale}_metrics"
#                 if cv_key in cv_data:
#                     for metric in cv_metrics_order:
#                         if metric in cv_data[cv_key]:
#                             values = cv_data[cv_key][metric]['values']
#                             final_df = pd.concat([final_df, pd.DataFrame({
#                                 'type': [f'cv_{scale}'],
#                                 'metric': [metric],
#                                 'value': [None],
#                                 'mean': [cv_data[cv_key][metric]['mean']],
#                                 'std': [cv_data[cv_key][metric]['std']],
#                                 'conf_interval': [cv_data[cv_key][metric]['conf_interval']],
#                                 **{f'fold_{i}': [values[i] if i < len(values) else None] for i in range(5)}
#                             })], ignore_index=True)

#             # 5. Добавляем тестовые метрики в строгом порядке
#             test_metrics_order = ['r2', 'rmse', 'smape', 'medape']
#             for scale in ['log', 'orig']:
#                 for metric in test_metrics_order:
#                     test_row = summary_df[(summary_df['type'] == f'test_{scale}') &
#                                         (summary_df['metric'] == metric)]
#                     if not test_row.empty:
#                         final_df = pd.concat([final_df, pd.DataFrame({
#                             'type': [f'test_{scale}'],
#                             'metric': [metric],
#                             'value': [test_row['value'].values[0]],
#                             'mean': [None], 'std': [None], 'conf_interval': [None],
#                             **{f'fold_{i}': [None] for i in range(5)}
#                         })], ignore_index=True)

#             # 6. Формируем имя файла с учетом типа комбинации
#             parts = combo.split('_')
#             if parts[0] == 'categorical-only':
#                 combo_name = f"{parts[0]}_{parts[1]}"
#             elif parts[0] == 'text-only':
#                 combo_name = f"{parts[0]}_{parts[1]}"
#             elif parts[0] == 'mixed':
#                 combo_name = f"{parts[0]}_{parts[1]}"  # mixed_w2v, mixed_tfidf, mixed_rubert
#             else:
#                 combo_name = combo  # fallback

#             # 7. Сохраняем результат
#             output_file = f"metrics_{model_long}_{combo_name}_FINAL.csv"
#             output_file_path = os.path.join(output_path, output_file)

#             final_df.to_csv(output_file_path, index=False)
#             print(f"Successfully saved: {output_file_path}")

#         except Exception as e:
#             print(f"Error processing {combo} for {model_long}: {str(e)}")

# # Обработка всех моделей
# models = [
#     ("RFR", "RandomForestRegressor"),
#     ("LinearSVR", "LinearSVR"),
#     ("ANN", "ANNRegressor"),
#     ("XGBR", "XGBRegressor")
# ]

# for model_short, model_long in models:
#     print(f"\nProcessing model: {model_long}")
#     process_model_metrics(model_short, model_long)

In [None]:
import os
import pandas as pd
from pathlib import Path

# Path to the root folder with the results
base_path = "/content/drive/MyDrive/price_prediction_data/Results_pca"

# List of models (each is joined to base_path as a sub-folder further below)
models = ["ANN", "LinearSVR", "RFR", "XGBR"]

# Format a metric value (and optional ± term) for the summary tables
def format_metric(metric, value, conf_interval=None, scale='log'):
    """Format a metric value, optionally followed by "± conf_interval".

    Args:
        metric: Metric name; only ``'rmse'`` is treated specially.
        value: Metric value. Numeric strings are converted with ``float``.
        conf_interval: Optional ± term. Numeric strings are converted;
            unparsable strings and NaN/NA are treated as "no interval".
        scale: ``'log'`` rounds everything to 3 decimals. Any other value
            rounds ``'rmse'`` to an integer and other metrics to 3 decimals.

    Returns:
        ``"NA"`` when ``value`` is missing or not parseable; the rounded
        number when there is no interval; otherwise the string
        ``"<value> ± <conf_interval>"``.
    """
    if isinstance(value, str):
        try:
            value = float(value)
        except ValueError:
            return "NA"  # value cannot be interpreted as a number

    if isinstance(conf_interval, str):
        try:
            conf_interval = float(conf_interval)
        except ValueError:
            conf_interval = None

    if pd.isna(value):
        return "NA"

    # An empty CSV cell arrives as NaN. Without this guard the result would be
    # "x ± nan", or int(round(nan)) would raise on the integer RMSE branch.
    if conf_interval is not None and pd.isna(conf_interval):
        conf_interval = None

    if scale != 'log' and metric == 'rmse':
        def _round(number):
            return int(round(number))
    else:
        def _round(number):
            return round(number, 3)

    if conf_interval is None:
        return _round(value)
    return f"{_round(value)} ± {_round(conf_interval)}"

# Extract run information from a metrics file name
def parse_filename(filename):
    """Return ``(model, data_type, embedding)`` parsed from a file name.

    The name is split on ``'_'``: token 1 is taken as the model and, for
    text-based runs, token 3 as the embedding. The data type is decided by
    which marker substring occurs in the whole file name.
    """
    tokens = filename.split('_')
    model = tokens[1]

    # Markers are checked in this fixed order; the first match wins.
    if "categorical-only" in filename:
        return model, "non-textual", "None"
    if "text-only" in filename:
        return model, "textual", tokens[3]
    if "mixed" in filename:
        return model, "mixed", tokens[3]
    return model, "unknown", "unknown"

# Accumulators for the rows of each output table
cv_log_data = []
cv_orig_data = []
test_log_data = []
test_orig_data = []
params_data = []

# One entry per metric section of a metrics CSV:
# (row 'type', value column, interval column or None, scale, destination list)
metric_sections = [
    ('cv_log', 'mean', 'conf_interval', 'log', cv_log_data),
    ('cv_orig', 'mean', 'conf_interval', 'orig', cv_orig_data),
    ('test_log', 'value', None, 'log', test_log_data),
    ('test_orig', 'value', None, 'orig', test_orig_data),
]

# Walk every model folder and read each metrics_*.csv found in it
for model in models:
    model_path = os.path.join(base_path, model)
    if not os.path.exists(model_path):
        continue

    for filename in os.listdir(model_path):
        if not (filename.startswith("metrics_") and filename.endswith(".csv")):
            continue

        # Run identity comes from the file name, the numbers from the CSV
        model_name, data_type, embedding = parse_filename(filename)
        df = pd.read_csv(os.path.join(model_path, filename))
        run_id = {'model': model_name, 'data_type': data_type, 'embedding': embedding}

        # Best hyper-parameters: name is stored in 'metric', setting in 'value'
        params = df[df['type'] == 'best_param'][['metric', 'value']]
        for _, row in params.iterrows():
            params_data.append({**run_id, 'param': row['metric'], 'value': row['value']})

        # CV and test metrics on both scales, formatted for display
        for row_type, value_col, ci_col, scale, sink in metric_sections:
            wanted_cols = ['metric', value_col] + ([ci_col] if ci_col else [])
            section = df[df['type'] == row_type][wanted_cols]
            for _, row in section.iterrows():
                formatted_value = format_metric(
                    row['metric'],
                    row[value_col],
                    row[ci_col] if ci_col else None,
                    scale=scale
                )
                sink.append({**run_id, 'metric': row['metric'], 'value': formatted_value})

# One long-format DataFrame per table
df_cv_log = pd.DataFrame(cv_log_data)
df_cv_orig = pd.DataFrame(cv_orig_data)
df_test_log = pd.DataFrame(test_log_data)
df_test_orig = pd.DataFrame(test_orig_data)
df_params = pd.DataFrame(params_data)

# Reshape a long metrics table into a wide one (one column per metric)
def create_pivot_table(df, value_col='value'):
    """Pivot ``df`` so each distinct 'metric' becomes its own column.

    Rows are keyed by (model, data_type, embedding); when a key/metric pair
    occurs more than once the first value is kept. The keys are returned as
    ordinary columns.
    """
    run_keys = ['model', 'data_type', 'embedding']
    wide = df.pivot_table(
        index=run_keys,
        columns='metric',
        values=value_col,
        aggfunc='first',
    )
    return wide.reset_index()

# Build the final wide tables, one per (split, scale) combination
final_cv_log, final_cv_orig, final_test_log, final_test_orig = (
    create_pivot_table(long_table)
    for long_table in (df_cv_log, df_cv_orig, df_test_log, df_test_orig)
)

# Parameters keep a long layout: one row per (run, parameter)
param_keys = ['model', 'data_type', 'embedding', 'param']
final_params = df_params.pivot_table(
    index=param_keys,
    values='value',
    aggfunc='first',
).reset_index()

# Human-readable column headers for the metric columns
metric_rename = {
    'r2': 'R²',
    'rmse': 'RMSE',
    'smape': 'SMAPE'
}

for df in (final_cv_log, final_cv_orig, final_test_log, final_test_orig):
    df.rename(columns=metric_rename, inplace=True)

# Write every table to a CSV file in the current directory (optional)
csv_outputs = (
    (final_cv_log, 'cross_validation_log_scale.csv'),
    (final_cv_orig, 'cross_validation_original_scale.csv'),
    (final_test_log, 'test_log_scale.csv'),
    (final_test_orig, 'test_original_scale.csv'),
    (final_params, 'best_params.csv'),
)
for table, csv_name in csv_outputs:
    table.to_csv(csv_name, index=False)

# # Вывод таблиц для проверки
# print("Cross Validation (Log Scale):")
# print(final_cv_log)
# print("\nCross Validation (Original Scale):")
# print(final_cv_orig)
# print("\nTest Metrics (Log Scale):")
# print(final_test_log)
# print("\nTest Metrics (Original Scale):")
# print(final_test_orig)
# print("\nBest Parameters:")
# print(final_params)

In [None]:
# Keep only the last '/'-separated component of each parameter name.
final_params['param'] = final_params['param'].str.split('/').str[-1]

# Save to CSV without index (overwrites best_params.csv)
final_params.to_csv('best_params.csv', index=False)

In [None]:
# !pip install scikit_posthocs

In [None]:
import os

base_path = '/content/drive/MyDrive/price_prediction_data/Results_pca/'

# Recursively collect every file that sits directly inside a directory named
# 'metrics' and whose name contains 'predictions' (case-insensitive).
prediction_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(base_path)
    if os.path.basename(folder) == 'metrics'
    for name in names
    if 'predictions' in name.lower()
]

# Print the results
for file_path in prediction_files:
    print(file_path)

In [None]:
import os

base_path = '/content/drive/MyDrive/price_prediction_data/Results_pca/'

# Recursively collect every file whose name contains 'FINAL' (case-sensitive),
# skipping files that sit directly inside a directory named 'metrics'.
prediction_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(base_path)
    if os.path.basename(folder) != 'metrics'
    for name in names
    if 'FINAL' in name
]

# Print the results
for file_path in prediction_files:
    print(file_path)

In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare, wilcoxon
import glob
from tqdm import tqdm

# ============ [0] Normalisation dictionaries and basic helpers ============
# Raw model names -> short display names.
MODEL_MAP = {
    "ANNRegressor": "ANN",
    "RandomForestRegressor": "RFR",
    "LinearSVR": "LinearSVR",
    "XGBRegressor": "XGBoost"
}
# Sort rank of each display model name (lower sorts first).
MODEL_ORDER = {"XGBoost": 0, "RFR": 1, "ANN": 2, "LinearSVR": 3}
# Raw dataset names -> display names.
DATASET_MAP = {
    "categorical-only": "non-textual",
    "mixed": "mixed",
    "text-only": "text-only"
}
# Sort rank of each display dataset name.
DATASET_ORDER = {"non-textual": 0, "mixed": 1, "text-only": 2}
# Raw embedding names -> display names (currently an identity mapping).
EMBEDDING_MAP = {
    "rubert": "rubert",
    "w2v": "w2v",
    "tfidf": "tfidf",
    "none": "none"
}
# Sort rank of each embedding name.
EMBEDDING_ORDER = {"none": 0, "rubert": 1, "w2v": 2, "tfidf": 3}

def map_name(raw, mapping):
    """Return ``mapping[raw]`` when ``raw`` is a key, otherwise ``raw`` itself."""
    if raw in mapping:
        return mapping[raw]
    return raw

def format_score(x, rmse_millions=False):
    """Render a score for display.

    Missing values become ``""``. Floats are shown in millions with 3
    decimals when ``rmse_millions`` is set, as space-grouped whole numbers
    when their magnitude exceeds 10000, and with 4 decimals otherwise.
    Python ints are space-grouped. Anything else is returned unchanged.
    """
    if pd.isnull(x):
        return ""

    if not isinstance(x, (float, np.floating)):
        # Non-float input: only plain ints get thousands grouping.
        return format(x, ',').replace(',', ' ') if isinstance(x, int) else x

    if rmse_millions:
        return format(x / 1e6, '.3f')
    if abs(x) > 10000:
        return format(x, ',.0f').replace(',', ' ')
    return format(x, '.4f')

def pretty_ci(mean, low, high, rmse_millions=False, precision=3):
    """Format ``mean`` with the half-width of ``[low, high]`` as "m ± d".

    With ``rmse_millions`` both numbers are divided by 1e6 and printed with
    ``precision`` decimals. Otherwise, if either magnitude exceeds 10000 they
    are truncated to ints and space-grouped; else printed with 4 decimals.
    """
    center = float(mean)
    half_width = (float(high) - float(low)) / 2
    pair = (center, half_width)

    if rmse_millions:
        spec = f'.{precision}f'
        return ' ± '.join(format(v / 1e6, spec) for v in pair)

    if abs(center) > 10000 or abs(half_width) > 10000:
        # int() truncates toward zero; commas are swapped for spaces.
        grouped = ' ± '.join(format(int(v), ',') for v in pair)
        return grouped.replace(',', ' ')

    return ' ± '.join(format(v, '.4f') for v in pair)

def sort_df(df):
    """Return a copy of ``df`` sorted by model, dataset and embedding rank.

    Ranks come from MODEL_ORDER, DATASET_ORDER and EMBEDDING_ORDER. The
    dataset column may be named either 'data_type' or 'dataset'
    ('data_type' wins when both exist). Values without a rank map to NaN.
    Only the columns that are present take part in the sort.

    The input frame is left untouched. The earlier implementation wrote the
    temporary ``*_order`` columns into the caller's DataFrame, which leaked
    helper columns and could trigger SettingWithCopyWarning on slices.
    """
    order_cols = ["model_order", "dataset_order", "embedding_order"]
    out = df.copy()

    if "model" in out.columns:
        out['model_order'] = out['model'].map(MODEL_ORDER)
    if "data_type" in out.columns:
        out['dataset_order'] = out['data_type'].map(DATASET_ORDER)
    elif "dataset" in out.columns:
        out['dataset_order'] = out['dataset'].map(DATASET_ORDER)
    if "embedding" in out.columns:
        out['embedding_order'] = out['embedding'].map(EMBEDDING_ORDER)

    present = [col for col in order_cols if col in out.columns]
    if present:
        out = out.sort_values(present)
    # The helper columns exist only for sorting and are removed again.
    return out.drop(columns=present)

# ============ [1] Final summary table of CV metrics for 1.1 ============
def format_metric(metric, value, conf_interval=None, scale='log'):
    """Format a metric value, optionally followed by "± conf_interval".

    Args:
        metric: Metric name; only ``'rmse'`` is treated specially.
        value: Metric value. Numeric strings are converted with ``float``.
        conf_interval: Optional ± term. Numeric strings are converted;
            unparsable strings and NaN/NA are treated as "no interval".
        scale: ``'log'`` rounds everything to 3 decimals. Any other value
            rounds ``'rmse'`` to an integer and other metrics to 3 decimals.

    Returns:
        ``"NA"`` when ``value`` is missing or not parseable; the rounded
        number when there is no interval; otherwise the string
        ``"<value> ± <conf_interval>"``.
    """
    if isinstance(value, str):
        try:
            value = float(value)
        except ValueError:
            return "NA"
    if isinstance(conf_interval, str):
        try:
            conf_interval = float(conf_interval)
        except ValueError:
            conf_interval = None
    if pd.isna(value):
        return "NA"
    # An empty CSV cell arrives as NaN. Without this guard the result would be
    # "x ± nan", or int(round(nan)) would raise on the integer RMSE branch.
    if conf_interval is not None and pd.isna(conf_interval):
        conf_interval = None

    if scale != 'log' and metric == 'rmse':
        def _round(number):
            return int(round(number))
    else:
        def _round(number):
            return round(number, 3)

    if conf_interval is None:
        return _round(value)
    return f"{_round(value)} ± {_round(conf_interval)}"

def parse_filename(filename):
    """Return ``(model, data_type, embedding)`` parsed from a file name.

    The name is split on ``'_'``: token 1 is the raw model name (normalised
    through MODEL_MAP) and, for text-based runs, token 3 is the embedding.
    The data type is decided by which marker substring occurs in the name.
    """
    tokens = filename.split('_')
    raw_model = tokens[1]

    # Markers are checked in this fixed order; the first match wins.
    if "categorical-only" in filename:
        data_type, embedding = "non-textual", "none"
    elif "text-only" in filename:
        data_type, embedding = "text-only", tokens[3]
    elif "mixed" in filename:
        data_type, embedding = "mixed", tokens[3]
    else:
        data_type = embedding = "unknown"

    return map_name(raw_model, MODEL_MAP), data_type, embedding

base_path = "/content/drive/MyDrive/price_prediction_data/Results_pca"
models = ["ANN", "LinearSVR", "RFR", "XGBR"]

# Collect the original-scale CV metrics from every metrics_*.csv per model
cv_orig_data = []
for model in models:
    model_path = os.path.join(base_path, model)
    if not os.path.exists(model_path):
        continue
    for filename in os.listdir(model_path):
        if not (filename.startswith("metrics_") and filename.endswith(".csv")):
            continue
        model_name, data_type, embedding = parse_filename(filename)
        df = pd.read_csv(os.path.join(model_path, filename))
        cv_orig = df[df['type'] == 'cv_orig'][['metric', 'mean', 'conf_interval']]
        cv_orig_data.extend(
            {
                'model': model_name,
                'data_type': data_type,
                'embedding': embedding,
                'metric': row['metric'],
                'value': format_metric(
                    row['metric'],
                    row['mean'],
                    row['conf_interval'],
                    scale='orig'
                ),
            }
            for _, row in cv_orig.iterrows()
        )
df_cv_orig = pd.DataFrame(cv_orig_data)


def create_pivot_table(df, value_col='value'):
    """Pivot a long metrics table so each 'metric' becomes its own column.

    Rows are keyed by (model, data_type, embedding); the first value is kept
    for duplicated key/metric pairs and the keys come back as plain columns.
    """
    wide = df.pivot_table(
        index=['model', 'data_type', 'embedding'],
        columns='metric',
        values=value_col,
        aggfunc='first',
    )
    return wide.reset_index()


# Human-readable column headers for the metric columns
metric_rename = {
    'r2': 'R²',
    'rmse': 'RMSE',
    'smape': 'SMAPE'
}
# Pivot, rename the metric columns, then put rows into display order
final_cv_orig = sort_df(
    create_pivot_table(df_cv_orig).rename(columns=metric_rename)
)

# ============ [2] Load test predictions ============
print("\n=== [2] Загрузка тестовых предсказаний ===")
pred_files = glob.glob('/content/drive/MyDrive/price_prediction_data/Results_pca/*/metrics/predictions_test_price_prediction_*.csv')
preds_list = []
for pf in tqdm(pred_files):
    try:
        pred_df = pd.read_csv(pf)
        # Tokens 4, 5 and 6 of the '_'-split file name are read as
        # model, dataset and embedding respectively.
        tokens = os.path.basename(pf).split('_')
        entry = {
            'model': map_name(tokens[4], MODEL_MAP),
            'dataset': map_name(tokens[5], DATASET_MAP),
            'embedding': map_name(tokens[6], EMBEDDING_MAP),
            'true_price': pred_df['true_price'].values,
            'predicted_price': pred_df['predicted_price'].values,
        }
        preds_list.append(entry)
        print(f"  Загружен: {entry['model']} | {entry['dataset']} | {entry['embedding']} | {len(pred_df)} точек")
    except Exception as e:
        # Best effort: a file that cannot be read or parsed is reported and skipped.
        print(f'Error reading {pf}: {e}')
print(f"Итого успешно загружено {len(preds_list)} файлов с предсказаниями.\n")

# ============ [3] Holdout summary ============
def bootstrap_point_metric(true, pred, n_bootstrap=1000):
    """Bootstrap R², RMSE and MedAPE (in %) of ``pred`` against ``true``.

    Draws ``n_bootstrap`` resamples with replacement (same size as the
    input) from NumPy's global random state. For each metric the result
    holds the mean over resamples plus the 2.5th and 97.5th percentiles
    under the ``_low`` / ``_high`` suffixed keys.

    ``true`` and ``pred`` must support integer-array indexing (e.g. NumPy
    arrays).
    """
    n = len(true)
    positions = np.arange(n)
    draws = []
    for _ in range(n_bootstrap):
        idx = np.random.choice(positions, size=n, replace=True)
        t, p = true[idx], pred[idx]
        err = t - p
        r2 = 1 - np.sum(err**2) / np.sum((t - np.mean(t))**2)
        rmse = np.sqrt(np.mean(err**2))
        medape = np.median(np.abs(err / t)) * 100
        draws.append((r2, rmse, medape))
    draws = np.array(draws)

    # Column order of `draws` matches the order of the names below.
    res = {}
    for col, name in enumerate(('R²holdout', 'RMSEholdout', 'MedAPEholdout')):
        sample = draws[:, col]
        res[name] = np.mean(sample)
        res[f'{name}_low'] = np.percentile(sample, 2.5)
        res[f'{name}_high'] = np.percentile(sample, 97.5)
    return res

def holdout_metrics_table(preds_list, metrics_names=None):
    """Build a display table of bootstrapped holdout metrics per run.

    Args:
        preds_list: Iterable of dicts with keys 'model', 'dataset',
            'embedding', 'true_price' and 'predicted_price'.
        metrics_names: Accepted for backward compatibility; it is not used
            when building the table.

    Returns:
        DataFrame with columns model, dataset, embedding and one
        "value ± half-width" string column per metric (R², RMSE in
        millions, MedAPE in %), ordered by ``sort_df``. An empty
        ``preds_list`` yields an empty frame with those columns instead of
        raising ``KeyError``.
    """
    display_cols = ['model', 'dataset', 'embedding',
                    'R²holdout ±CI', 'RMSEholdout ±CI, млн', 'MedAPEholdout ±CI, %']
    if len(preds_list) == 0:
        # No predictions (e.g. no files matched): return a well-formed empty table.
        return pd.DataFrame(columns=display_cols)

    out = []
    for d in preds_list:
        stats = bootstrap_point_metric(np.array(d['true_price']), np.array(d['predicted_price']))
        out.append({
            'model': d['model'],
            'dataset': d['dataset'],
            'embedding': d['embedding'],
            **stats
        })
    df = pd.DataFrame(out)
    df['R²holdout ±CI'] = df.apply(lambda r: pretty_ci(r['R²holdout'], r['R²holdout_low'], r['R²holdout_high']), axis=1)
    df['RMSEholdout ±CI, млн'] = df.apply(lambda r: pretty_ci(r['RMSEholdout'], r['RMSEholdout_low'], r['RMSEholdout_high'], rmse_millions=True), axis=1)
    df['MedAPEholdout ±CI, %'] = df.apply(lambda r: pretty_ci(r['MedAPEholdout'], r['MedAPEholdout_low'], r['MedAPEholdout_high']), axis=1)
    return sort_df(df[display_cols])

# Holdout metrics table over the full test predictions of every loaded run.
test_metrics_df_vis = holdout_metrics_table(preds_list)

# ============ [4] Bootstrap and segments ============
def get_segments(y, q=0.85):
    """Split positions of ``y`` at its ``q``-quantile.

    Args:
        y: 1-D array-like of values.
        q: Quantile used as the cut point (default 0.85, the value that was
            previously hard-coded).

    Returns:
        ``(idx_low, idx_high)``: integer index arrays of the elements
        ``<= quantile`` and ``> quantile`` respectively.
    """
    y = np.asarray(y)
    quant = np.quantile(y, q)
    idx_low = np.where(y <= quant)[0]
    idx_high = np.where(y > quant)[0]
    return idx_low, idx_high

def build_segmented_predslists(preds_list):
    """Split every prediction entry into a 'low' and a 'high' price segment.

    The split indices come from ``get_segments`` applied to the true prices.
    Each output entry copies the input dict and replaces 'true_price' and
    'predicted_price' with the segment's values.

    Returns:
        ``{'low': [...], 'high': [...]}`` with one entry per input run in
        each list, in input order.
    """
    low_entries, high_entries = [], []
    for entry in preds_list:
        actual = np.array(entry['true_price'])
        predicted = np.array(entry['predicted_price'])
        low_idx, high_idx = get_segments(actual)
        for bucket, idx in ((low_entries, low_idx), (high_entries, high_idx)):
            bucket.append({**entry, 'true_price': actual[idx], 'predicted_price': predicted[idx]})
    return {'low': low_entries, 'high': high_entries}

# Split each run's predictions into low/high price segments, then build one
# holdout metrics table per segment.
seg_preds = build_segmented_predslists(preds_list)
test_metrics_df_vis_low = holdout_metrics_table(seg_preds['low'])
test_metrics_df_vis_high = holdout_metrics_table(seg_preds['high'])

# ============ [5] Remaining CV tables ============
def friedman_posthoc_report(data, metric, group_vars=None):
    """Pick the best embedding per group and test whether embeddings differ.

    For every group of ``group_vars`` in the rows of ``data`` where
    ``data['metric'] == metric``, the per-embedding mean over the columns
    ``fold_0`` .. ``fold_4`` is computed. The best embedding is the one with
    the highest mean for ``'r2'`` and the lowest mean for any other metric.
    Two embeddings are compared with a Wilcoxon signed-rank test, three or
    more with a Friedman test; a single embedding is not tested.

    Args:
        data: Long-format DataFrame with columns 'metric', 'embedding',
            ``fold_0`` .. ``fold_4`` and the grouping columns.
        metric: Metric name to analyse.
        group_vars: Grouping column name(s); defaults to
            ``['model', 'dataset']``.

    Returns:
        DataFrame with one row per successfully processed group. Groups
        that raise are reported with ``print`` and skipped.
    """
    # Avoid a mutable default argument and accept a single column name too.
    if group_vars is None:
        group_vars = ['model', 'dataset']
    elif isinstance(group_vars, str):
        group_vars = [group_vars]
    else:
        group_vars = list(group_vars)

    fold_cols = [f'fold_{i}' for i in range(5)]
    higher_is_better = metric == 'r2'

    best_by_group = []
    for key, subdf in data[data['metric'] == metric].groupby(group_vars):
        # Depending on the pandas version a one-column grouping may yield a
        # scalar key; normalise so zip(group_vars, key) pairs names with values.
        if not isinstance(key, tuple):
            key = (key,)
        try:
            piv = subdf.set_index('embedding')[fold_cols]
            means = piv.mean(axis=1)
            n_var = len(piv)
            best = means.idxmax() if higher_is_better else means.idxmin()
            best_score = means[best]
            second, second_score = (None, None)
            p_value = None
            stat_significant = None
            comment = ""
            if n_var > 1:
                rest = means.drop(best)
                second = rest.idxmax() if higher_is_better else rest.idxmin()
                second_score = means[second]
            if n_var == 1:
                comment = "Only one embedding."
            else:
                if n_var == 2:
                    # NOTE(review): with only 5 paired folds the exact two-sided
                    # Wilcoxon p-value cannot go below 0.0625, so this branch
                    # presumably never reports significance at 0.05 — confirm.
                    v1, v2 = piv.index
                    _, p_value = wilcoxon(piv.loc[v1], piv.loc[v2])
                else:
                    _, p_value = friedmanchisquare(*[row.values for _, row in piv.iterrows()])
                stat_significant = p_value < 0.05
                comment = (f"Best embedding: {best}, score={format_score(best_score)}" +
                           (f" (statistically significant, p={p_value:.4f})" if stat_significant else
                            f"; second best: {second}, score={format_score(second_score)} (NOT statistically significant, p={p_value:.4f})"))
            best_by_group.append({
                **dict(zip(group_vars, key)),
                'metric': metric,
                'best_embedding': best,
                'best_score': format_score(best_score),
                'second_embedding': second,
                'second_score': format_score(second_score) if second_score is not None else None,
                'stat_significant': stat_significant,
                'p_value': format_score(p_value) if p_value is not None else None,
                'comment': comment
            })
        except Exception as e:
            # Best effort: a failing group (e.g. a test that cannot be computed)
            # is reported and left out of the result.
            print(f'Error for {key}: {e}')
    return pd.DataFrame(best_by_group)

def best_dataset_by_model_report(best_embeds, df_metrics):
    """Pick the best dataset for every model and metric from CV fold scores.

    For each metric in ('r2', 'rmse', 'medape') and each model, the fold scores
    of every dataset are looked up in ``df_metrics`` using the embedding listed
    as ``best_embedding`` in ``best_embeds``. The dataset with the highest mean
    wins for 'r2', the lowest mean wins for the other metrics. Two datasets are
    compared with a Wilcoxon signed-rank test, three or more with a Friedman
    test; p < 0.05 is reported as significant.

    Args:
        best_embeds: DataFrame with 'model', 'dataset', 'metric' and
            'best_embedding' columns.
        df_metrics: DataFrame with 'model', 'dataset', 'embedding', 'metric'
            and 'fold_0'..'fold_4' columns.

    Returns:
        The result of ``sort_df`` applied to a DataFrame with one row per
        (model, metric) that had at least one dataset with fold scores.
    """
    fold_cols = [f'fold_{i}' for i in range(5)]
    out = []
    for metric in ['r2', 'rmse', 'medape']:
        higher_is_better = metric == 'r2'
        for model, sdf in best_embeds[best_embeds['metric'] == metric].groupby('model'):
            # dataset -> flattened fold scores of its best embedding
            fold_scores = {}
            for _, row in sdf.iterrows():
                ds = row['dataset']
                embed = row['best_embedding']
                v = df_metrics[(df_metrics['model'] == model) &
                               (df_metrics['dataset'] == ds) &
                               (df_metrics['embedding'] == embed) &
                               (df_metrics['metric'] == metric)]
                if not v.empty:
                    fold_scores[ds] = v[fold_cols].values.flatten()
            if not fold_scores:
                # Nothing to compare: idxmax/idxmin on an empty frame would raise.
                print(f'No CV fold scores for model={model}, metric={metric}; skipped')
                continue

            piv = pd.DataFrame(fold_scores).T
            means = piv.mean(axis=1)
            n_var = len(piv)
            best = means.idxmax() if higher_is_better else means.idxmin()
            best_score = means[best]
            second, second_score = (None, None)
            p_value = None
            stat_significant = None
            if n_var == 1:
                comment = "Only one dataset."
            else:
                remaining = means.drop(best)
                second = remaining.idxmax() if higher_is_better else remaining.idxmin()
                second_score = means[second]
                if n_var == 2:
                    v1, v2 = piv.index
                    _, p_value = wilcoxon(piv.loc[v1], piv.loc[v2])
                else:
                    _, p_value = friedmanchisquare(*[r.values for _, r in piv.iterrows()])
                stat_significant = p_value < 0.05
                comment = (f"Best dataset: {best}, score={format_score(best_score)}" +
                           (f" (statistically significant, p={p_value:.4f})" if stat_significant else
                            f"; second best: {second}, score={format_score(second_score)} (NOT statistically significant, p={p_value:.4f})"))
            out.append({'model': model, 'metric': metric,
                        'best_dataset': best, 'best_score': format_score(best_score),
                        'second_dataset': second, 'second_score': format_score(second_score) if second_score is not None else None,
                        'stat_significant': stat_significant, 'p_value': format_score(p_value) if p_value is not None else None,
                        'comment': comment})
    df = pd.DataFrame(out)
    return sort_df(df)

def best_model_by_metric_report(best_datasets, best_embeds, df_metrics):
    """Pick the best model for every metric from CV fold scores.

    Each model is represented by its best dataset (from ``best_datasets``)
    combined with the best embedding for that dataset (from ``best_embeds``).
    The model with the highest mean fold score wins for 'r2', the lowest mean
    wins for 'rmse' and 'medape'. Two models are compared with a Wilcoxon
    signed-rank test, three or more with a Friedman test; p < 0.05 is reported
    as significant.

    Args:
        best_datasets: DataFrame with 'model', 'metric' and 'best_dataset'.
        best_embeds: DataFrame with 'model', 'dataset', 'metric' and
            'best_embedding'.
        df_metrics: DataFrame with 'model', 'dataset', 'embedding', 'metric'
            and 'fold_0'..'fold_4' columns.

    Returns:
        The result of ``sort_df`` applied to a DataFrame with one row per
        metric that had at least one model with fold scores.
    """
    fold_cols = [f'fold_{i}' for i in range(5)]
    out = []
    for metric in ['r2', 'rmse', 'medape']:
        higher_is_better = metric == 'r2'
        # model -> flattened fold scores of its best configuration
        fold_scores = {}
        for _, row in best_datasets[best_datasets['metric'] == metric].iterrows():
            model = row['model']
            ds = row['best_dataset']
            embed_match = best_embeds[(best_embeds['model'] == model) &
                                      (best_embeds['dataset'] == ds) &
                                      (best_embeds['metric'] == metric)]['best_embedding']
            if embed_match.empty:
                # Without this guard `.values[0]` raises IndexError.
                print(f'No best embedding for model={model}, dataset={ds}, metric={metric}; skipped')
                continue
            embed = embed_match.values[0]
            v = df_metrics[(df_metrics['model'] == model) &
                           (df_metrics['dataset'] == ds) &
                           (df_metrics['embedding'] == embed) &
                           (df_metrics['metric'] == metric)]
            if not v.empty:
                fold_scores[model] = v[fold_cols].values.flatten()
        if not fold_scores:
            # Nothing to compare: idxmax/idxmin on an empty frame would raise.
            print(f'No CV fold scores for metric={metric}; skipped')
            continue

        piv = pd.DataFrame(fold_scores).T
        means = piv.mean(axis=1)
        n_var = len(piv)
        best = means.idxmax() if higher_is_better else means.idxmin()
        best_score = means[best]
        second, second_score = (None, None)
        p_value = None
        stat_significant = None
        if n_var == 1:
            comment = "Only one model."
        else:
            remaining = means.drop(best)
            second = remaining.idxmax() if higher_is_better else remaining.idxmin()
            second_score = means[second]
            if n_var == 2:
                v1, v2 = piv.index
                _, p_value = wilcoxon(piv.loc[v1], piv.loc[v2])
            else:
                _, p_value = friedmanchisquare(*[r.values for _, r in piv.iterrows()])
            stat_significant = p_value < 0.05
            comment = (f"Best model: {best}, score={format_score(best_score)}" +
                       (f" (statistically significant, p={p_value:.4f})" if stat_significant else
                        f"; second best: {second}, score={format_score(second_score)} (NOT statistically significant, p={p_value:.4f})"))
        out.append({'metric': metric,
                    'best_model': best, 'best_score': format_score(best_score),
                    'second_model': second, 'second_score': format_score(second_score) if second_score is not None else None,
                    'stat_significant': stat_significant, 'p_value': format_score(p_value) if p_value is not None else None,
                    'comment': comment})
    return sort_df(pd.DataFrame(out))

# (df_metrics нужен для CV-таблиц, предположим что вы загружаете его где-то выше — не трогаем эту логику)

# ============ [6] Bootstrap по всему holdout и сегментам ============
def calc_metrics(true, pred):
    """Return R2, RMSE and MedAPE (in percent) of ``pred`` against ``true``.

    Both arguments must support element-wise arithmetic (e.g. NumPy arrays).
    The result is a dict with the keys 'r2', 'rmse' and 'medape'.
    """
    residuals = true - pred
    squared_errors = residuals**2
    # Total sum of squares around the mean of the true values.
    total_ss = np.sum((true - np.mean(true))**2)
    return dict(
        r2=1 - np.sum(squared_errors) / total_ss,
        rmse=np.sqrt(np.mean(squared_errors)),
        medape=np.median(np.abs(residuals / true)) * 100,
    )

def bootstrap_compare_preds(arr1, arr2, metric, n_bootstrap=1000, random_state=42):
    """Bootstrap the metric difference between two sets of predictions.

    ``arr1`` and ``arr2`` are mappings with 'true' and 'pred' arrays. Indices
    are resampled with replacement from the first ``n`` positions, where ``n``
    is the shorter of the two 'true' arrays. The difference is oriented so
    that a positive value favours ``arr1``: ``m1 - m2`` for 'r2' and
    ``m2 - m1`` for every other metric.

    Seeds NumPy's global random generator with ``random_state``.

    Returns:
        Tuple ``(mean difference, 2.5th percentile, 97.5th percentile)``.
    """
    np.random.seed(random_state)
    n = min(len(arr1['true']), len(arr2['true']))
    positions = np.arange(n)
    higher_is_better = metric == 'r2'

    def _resampled_difference():
        # One bootstrap replicate: same resampled indices for both candidates.
        sample = np.random.choice(positions, size=n, replace=True)
        first = calc_metrics(arr1['true'][sample], arr1['pred'][sample])[metric]
        other = calc_metrics(arr2['true'][sample], arr2['pred'][sample])[metric]
        if higher_is_better:
            return first - other
        return other - first

    diffs = np.array([_resampled_difference() for _ in range(n_bootstrap)])
    lower, upper = np.percentile(diffs, [2.5, 97.5])
    return np.mean(diffs), lower, upper

def bootstrap_embeddings(preds_list, metric, segment_name=None):
    """Compare the two best embeddings per (model, dataset) via bootstrap.

    ``preds_list`` is turned into a DataFrame with 'model', 'dataset',
    'embedding', 'true_price' and 'predicted_price' columns. For every
    model/dataset pair with at least two embeddings, the embeddings are scored
    with ``calc_metrics`` and the top two are compared with
    ``bootstrap_compare_preds``. When ``segment_name`` is truthy it is stored
    in a 'segment' column.

    Returns:
        The result of ``sort_df`` applied to the collected rows.
    """
    df = pd.DataFrame(preds_list)
    in_millions = (metric == 'rmse')
    collected = []
    for model in sorted(df['model'].unique(), key=lambda m: MODEL_ORDER.get(m, 100)):
        for dataset in sorted(df['dataset'].unique(), key=lambda d: DATASET_ORDER.get(d, 100)):
            subset = df[(df['model'] == model) & (df['dataset'] == dataset)]
            if subset.shape[0] < 2:
                continue
            # embedding -> its true/predicted arrays (a later row overrides an earlier one)
            by_embedding = {
                rec['embedding']: {'true': rec['true_price'], 'pred': rec['predicted_price']}
                for _, rec in subset.iterrows()
            }
            if len(by_embedding) < 2:
                continue
            scores = {
                emb: calc_metrics(arrays['true'], arrays['pred'])[metric]
                for emb, arrays in by_embedding.items()
            }
            # Best first: descending for r2, ascending for error metrics.
            ranking = sorted(scores, key=lambda name: scores[name], reverse=(metric == 'r2'))
            best, second = ranking[0], ranking[1]
            mean_diff, ci_low, ci_high = bootstrap_compare_preds(by_embedding[best], by_embedding[second], metric)
            entry = {
                'model': model, 'dataset': dataset, 'metric': metric,
                'best_embedding': best, 'best_score': format_score(scores[best], rmse_millions=in_millions),
                'second_embedding': second, 'second_score': format_score(scores[second], rmse_millions=in_millions),
                'diff': pretty_ci(mean_diff, ci_low, ci_high, rmse_millions=in_millions),
            }
            if segment_name:
                entry['segment'] = segment_name
            collected.append(entry)
    return sort_df(pd.DataFrame(collected))

def bootstrap_datasets(preds_list, metric, segment_name=None):
    """Compare the two best datasets per model via bootstrap.

    Every dataset of a model is represented by its best-scoring embedding
    (first row per embedding, scored with ``calc_metrics``; ties keep the
    earlier embedding). Models with at least two datasets get their top two
    datasets compared with ``bootstrap_compare_preds``. When ``segment_name``
    is truthy it is stored in a 'segment' column.

    Returns:
        The result of ``sort_df`` applied to the collected rows.
    """
    df = pd.DataFrame(preds_list)
    maximize = (metric == 'r2')
    in_millions = (metric == 'rmse')
    collected = []
    for model in sorted(df['model'].unique(), key=lambda m: MODEL_ORDER.get(m, 100)):
        model_rows = df[df['model'] == model]
        ds_map = {}
        for dataset in sorted(df['dataset'].unique(), key=lambda d: DATASET_ORDER.get(d, 100)):
            cur = model_rows[model_rows['dataset'] == dataset]
            if cur.empty:
                continue
            top_value = None
            top_row = None
            for emb in cur['embedding'].unique():
                candidate = cur[cur['embedding'] == emb].iloc[0]
                value = calc_metrics(candidate['true_price'], candidate['predicted_price'])[metric]
                if top_value is None:
                    improved = True
                elif maximize:
                    improved = value > top_value
                else:
                    improved = value < top_value
                if improved:
                    top_value = value
                    top_row = candidate
            if top_row is not None:
                ds_map[dataset] = {'true': top_row['true_price'], 'pred': top_row['predicted_price']}
        if len(ds_map) < 2:
            continue
        scores = {ds: calc_metrics(arrays['true'], arrays['pred'])[metric] for ds, arrays in ds_map.items()}
        # Best first: descending for r2, ascending for error metrics.
        ranking = sorted(scores, key=lambda name: scores[name], reverse=maximize)
        best, second = ranking[0], ranking[1]
        mean_diff, ci_low, ci_high = bootstrap_compare_preds(ds_map[best], ds_map[second], metric)
        entry = {
            'model': model, 'metric': metric,
            'best_dataset': best, 'best_score': format_score(scores[best], rmse_millions=in_millions),
            'second_dataset': second, 'second_score': format_score(scores[second], rmse_millions=in_millions),
            'diff': pretty_ci(mean_diff, ci_low, ci_high, rmse_millions=in_millions),
        }
        if segment_name:
            entry['segment'] = segment_name
        collected.append(entry)
    return sort_df(pd.DataFrame(collected))

def bootstrap_models(preds_list, metric, segment_name=None):
    """Compare the two best models via bootstrap.

    Every model is represented by its best-scoring row (scored with
    ``calc_metrics``; ties keep the earlier row). The two best models are then
    compared with ``bootstrap_compare_preds``. When ``segment_name`` is truthy
    it is stored in a 'segment' column.

    Returns:
        An empty DataFrame when fewer than two models are available,
        otherwise the result of ``sort_df`` applied to a one-row DataFrame.
    """
    df = pd.DataFrame(preds_list)
    maximize = (metric == 'r2')
    in_millions = (metric == 'rmse')

    champions = []
    for model in sorted(df['model'].unique(), key=lambda m: MODEL_ORDER.get(m, 100)):
        top_value, top_row = None, None
        for _, candidate in df[df['model'] == model].iterrows():
            value = calc_metrics(candidate['true_price'], candidate['predicted_price'])[metric]
            if top_value is None:
                improved = True
            elif maximize:
                improved = value > top_value
            else:
                improved = value < top_value
            if improved:
                top_value, top_row = value, candidate
        if top_row is not None:
            champions.append(top_row)

    if len(champions) < 2:
        return pd.DataFrame()

    mod_map = {}
    for champion in champions:
        mod_map[champion['model']] = {'true': champion['true_price'], 'pred': champion['predicted_price']}
    scores = {name: calc_metrics(arrays['true'], arrays['pred'])[metric] for name, arrays in mod_map.items()}
    # Best first: descending for r2, ascending for error metrics.
    ranking = sorted(scores, key=lambda name: scores[name], reverse=maximize)
    best, second = ranking[0], ranking[1]
    mean_diff, ci_low, ci_high = bootstrap_compare_preds(mod_map[best], mod_map[second], metric)
    entry = {
        'metric': metric,
        'best_model': best, 'best_score': format_score(scores[best], rmse_millions=in_millions),
        'second_model': second, 'second_score': format_score(scores[second], rmse_millions=in_millions),
        'diff': pretty_ci(mean_diff, ci_low, ci_high, rmse_millions=in_millions),
    }
    if segment_name:
        entry['segment'] = segment_name
    return sort_df(pd.DataFrame([entry]))

# Run every bootstrap comparison (embeddings, datasets, models) on the full
# holdout predictions, once per metric, and stack the per-metric tables.
all_bootstrap_emb, all_bootstrap_ds, all_bootstrap_mod = [], [], []
for metric in ['r2', 'rmse', 'medape']:
    all_bootstrap_emb.append(bootstrap_embeddings(preds_list, metric))
    all_bootstrap_ds.append(bootstrap_datasets(preds_list, metric))
    all_bootstrap_mod.append(bootstrap_models(preds_list, metric))
bootstrap_emb = pd.concat(all_bootstrap_emb, ignore_index=True)
bootstrap_ds = pd.concat(all_bootstrap_ds, ignore_index=True)
bootstrap_mod = pd.concat(all_bootstrap_mod, ignore_index=True)

# Repeat the same comparisons per price segment; seg_preds is indexed by the
# segment keys 'high' and 'low' and each result row is tagged with the segment.
segment_results = {}
for segment in ['high', 'low']:
    emb_list, ds_list, mod_list = [], [], []
    for metric in ['r2', 'rmse', 'medape']:
        emb_list.append(bootstrap_embeddings(seg_preds[segment], metric, segment_name=segment))
        ds_list.append(bootstrap_datasets(seg_preds[segment], metric, segment_name=segment))
        mod_list.append(bootstrap_models(seg_preds[segment], metric, segment_name=segment))
    # Per-segment bundle: the three stacked bootstrap tables plus the overview
    # table produced by holdout_metrics_table for the same predictions.
    segment_results[segment] = {
        'bootstrap_emb': pd.concat(emb_list, ignore_index=True),
        'bootstrap_ds': pd.concat(ds_list, ignore_index=True),
        'bootstrap_mod': pd.concat(mod_list, ignore_index=True),
        'metrics_overview': holdout_metrics_table(seg_preds[segment])
    }

# ============ [7] HTML report ============
# The report contains Cyrillic text, so the encoding is declared in the markup
# and the file is written explicitly as UTF-8 (the platform default encoding
# is not UTF-8 everywhere).
style = """
<meta charset="utf-8">
<style>
body { font-family: 'Times New Roman', Times, serif; font-size: 12pt; }
table { margin-left: auto; margin-right: auto; border-collapse: collapse; }
th, td { text-align: center; padding: 6px; }
h1, h2, h3 { text-align: center; }
</style>
"""
html_report = [style]
html_report.append('<h1>Model Comparison Report</h1>')

# ---- 1. Cross-validation ----
html_report.append('<h2>1. Cross validation</h2>')
html_report.append('<h3>1.1. Overview of cross-validation metrics (mean ±CI, original scale)</h3>')
html_report.append(final_cv_orig.to_html(index=False, justify='center'))
html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
html_report.append('<h3>1.2. Best embeddings per model and dataset (CV)</h3>')
html_report.append(best_embeds_vis[['model','dataset','metric','best_embedding','best_score','second_embedding','second_score','stat_significant','p_value','comment']].to_html(index=False, justify='center'))
html_report.append('<h3>1.3. Best datasets per model (CV)</h3>')
html_report.append(best_datasets[['model','metric','best_dataset','best_score','second_dataset','second_score','stat_significant','p_value','comment']].to_html(index=False, justify='center'))
html_report.append('<h3>1.4. Best models per metric (CV)</h3>')
html_report.append(best_models[['metric','best_model','best_score','second_model','second_score','stat_significant','p_value','comment']].to_html(index=False, justify='center'))

# ---- 2. Holdout bootstrap ----
html_report.append('<h2>2. Holdout bootstrap evaluation</h2>')
html_report.append('<h3>2.1. Overview of holdout metrics (bootstrap mean ±CI)</h3>')
html_report.append(test_metrics_df_vis.to_html(index=False, justify='center'))
html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
html_report.append('<h3>2.2. Bootstrap per embedding (test, per model+dataset)</h3>')
html_report.append(bootstrap_emb.to_html(index=False, justify='center'))
html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
html_report.append('<h3>2.3. Bootstrap per dataset (test, per model)</h3>')
html_report.append(bootstrap_ds.to_html(index=False, justify='center'))
html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
html_report.append('<h3>2.4. Bootstrap per model (test, all models)</h3>')
html_report.append(bootstrap_mod.to_html(index=False, justify='center'))
html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')

# ---- 3. Segmented Holdout bootstrap ----
# The same four tables are emitted once per price segment.
html_report.append('<h2>3. Segmented Holdout bootstrap evaluation</h2>')
for seg, seg_name in [('high', 'High-price segment (top 15%)'), ('low', 'Low-price segment (bottom 85%)')]:
    html_report.append(f'<h3>3.1. Overview of holdout metrics ({seg_name}) (bootstrap mean ±CI)</h3>')
    html_report.append(segment_results[seg]['metrics_overview'].to_html(index=False, justify='center'))
    html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
    html_report.append(f'<h3>3.2. Test set bootstrap per embedding: {seg_name}</h3>')
    html_report.append(segment_results[seg]['bootstrap_emb'].to_html(index=False, justify='center'))
    html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
    html_report.append(f'<h3>3.3. Test set bootstrap per dataset: {seg_name}</h3>')
    html_report.append(segment_results[seg]['bootstrap_ds'].to_html(index=False, justify='center'))
    html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')
    html_report.append(f'<h3>3.4. Test set bootstrap per model: {seg_name}</h3>')
    html_report.append(segment_results[seg]['bootstrap_mod'].to_html(index=False, justify='center'))
    html_report.append('<div><em>RMSE приведен в миллионах рублей.</em></div><br>')

with open('model_comparison_report.html', 'w', encoding='utf-8') as f:
    f.write('\n'.join(html_report))

print('HTML report saved as model_comparison_report.html')


In [None]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import friedmanchisquare, rankdata
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from jinja2 import Template
import warnings
warnings.filterwarnings('ignore')

# Функции для статистического анализа
def friedman_test(data):
    """Run the Friedman test on an array of shape (n_samples, n_methods).

    Each column of ``data`` is passed to ``friedmanchisquare`` as one related
    sample.

    Returns:
        Tuple ``(statistic, p_value)``, or ``(nan, nan)`` when the test cannot
        be computed (for example with fewer than three methods).
    """
    try:
        stat, p = friedmanchisquare(*data.T)
        return stat, p
    except Exception:
        # Best effort: report "no result" instead of failing the whole report,
        # but let KeyboardInterrupt / SystemExit propagate.
        return np.nan, np.nan

def nemenyi_posthoc(ranks, n_methods, n_samples):
    """Return the Nemenyi critical difference (CD) of average ranks, alpha=0.05.

    CD = q_alpha * sqrt(k * (k + 1) / (6 * N)) with k = ``n_methods`` and
    N = ``n_samples``. Two methods differ significantly when their average
    ranks differ by more than CD.

    The critical value q_alpha depends on the number of compared methods; the
    previously hard-coded 2.569 is only valid for exactly four methods.

    Args:
        ranks: Unused; kept for backward compatibility with existing callers.
        n_methods: Number of compared methods (k).
        n_samples: Number of samples the ranks were averaged over (N).

    Returns:
        The critical difference as a float.
    """
    # Two-tailed Nemenyi critical values for alpha=0.05 (studentized range
    # statistic divided by sqrt(2)) as tabulated by Demsar (2006).
    q_table = {
        2: 1.960, 3: 2.343, 4: 2.569, 5: 2.728, 6: 2.850,
        7: 2.949, 8: 3.031, 9: 3.102, 10: 3.164,
    }
    q_alpha = q_table.get(n_methods)
    if q_alpha is None:
        # Outside the table: derive the value from the studentized range
        # distribution with infinite degrees of freedom.
        # NOTE(review): yields nan for n_methods < 2, where no pairwise
        # comparison exists anyway.
        from scipy.stats import studentized_range
        q_alpha = studentized_range.ppf(0.95, n_methods, np.inf) / np.sqrt(2)
    cd = q_alpha * np.sqrt(n_methods * (n_methods + 1) / (6 * n_samples))
    return cd

def wilcoxon_test(data1, data2):
    """Run the paired Wilcoxon signed-rank test with SciPy's default settings.

    Returns:
        Tuple ``(statistic, p_value)``, or ``(nan, nan)`` when the test cannot
        be computed (for example for samples of different length).
    """
    try:
        stat, p = stats.wilcoxon(data1, data2)
        return stat, p
    except Exception:
        # Best effort: report "no result" instead of failing the whole report,
        # but let KeyboardInterrupt / SystemExit propagate.
        return np.nan, np.nan

def calculate_ranks(data):
    """Rank the values of every row of ``data`` (n_samples, n_methods).

    Ranks come from ``rankdata``: the smallest value in a row gets rank 1 and
    tied values share their average rank.
    """
    per_row_ranks = list(map(rankdata, data))
    return np.array(per_row_ranks)

def load_cv_metrics(files):
    """Load the cross-validation metrics from every metrics CSV in ``files``.

    The model name, dataset type and embedding are parsed from the file name;
    only rows whose 'type' column equals 'cv_orig' are kept. Files that cannot
    be classified or processed are reported on stdout and skipped.

    Returns:
        DataFrame with the columns 'model', 'dataset_type', 'embedding',
        'metric', 'fold_0'..'fold_4', 'mean' and 'std'.
    """
    dataset_kinds = ('categorical-only', 'text-only', 'mixed')
    known_embeddings = ['w2v', 'rubert', 'tfidf']
    fold_keys = [f'fold_{i}' for i in range(5)]
    metrics = []

    for file in files:
        try:
            # Model and configuration are encoded in the file name.
            filename = os.path.basename(file)
            parts = filename.replace('metrics_', '').replace('_FINAL.csv', '').split('_')
            model = parts[0]

            # The first dataset kind found in the name wins.
            dataset_type = next((kind for kind in dataset_kinds if kind in filename), None)
            if dataset_type is None:
                print(f"Не удалось определить тип датасета для файла: {filename}")
                continue
            if dataset_type == 'categorical-only':
                embedding = 'none'
            elif parts[-1] in known_embeddings:
                embedding = parts[-1]
            else:
                embedding = 'unknown'

            df = pd.read_csv(file)
            cv_metrics = df[df['type'] == 'cv_orig']
            if cv_metrics.empty:
                print(f"Не найдены метрики cv_orig в файле: {filename}")
                continue

            for _, row in cv_metrics.iterrows():
                record = {
                    'model': model,
                    'dataset_type': dataset_type,
                    'embedding': embedding,
                    'metric': row['metric'],
                }
                record.update((key, row[key]) for key in fold_keys)
                record['mean'] = row['mean']
                record['std'] = row['std']
                metrics.append(record)

        except Exception as e:
            print(f"Ошибка при обработке файла {file}: {str(e)}")
            continue

    return pd.DataFrame(metrics)

def load_test_predictions(files):
    """Load test-set prediction CSVs and compute quality metrics per file.

    Model, dataset type, embedding and the PCA / gate flags are parsed from
    the file name. Each CSV must contain 'true_price' and 'predicted_price'
    columns, from which R2, RMSE, SMAPE and MedAPE are computed. Files that
    cannot be classified or processed are reported on stdout and skipped.

    Returns:
        DataFrame with the columns 'model', 'dataset_type', 'embedding',
        'pca', 'gate', 'r2', 'rmse', 'smape', 'medape' and 'file'.
    """
    # dataset kind -> positions of (embedding, pca flag, gate flag) among the
    # underscore-separated name parts; categorical-only has no embedding part.
    layouts = {
        'categorical-only': (None, 5, 7),
        'text-only': (2, 4, 6),
        'mixed': (2, 4, 6),
    }
    predictions = []

    for file in files:
        try:
            filename = os.path.basename(file)
            parts = filename.replace('predictions_test_price_prediction_', '').replace('.csv', '').split('_')
            model = parts[0]

            # The first dataset kind found in the name wins.
            dataset_type = next((kind for kind in layouts if kind in filename), None)
            if dataset_type is None:
                print(f"Не удалось определить тип датасета для файла: {filename}")
                continue
            emb_pos, pca_pos, gate_pos = layouts[dataset_type]
            embedding = 'none' if emb_pos is None else parts[emb_pos]
            pca = parts[pca_pos] == 'True'
            gate = parts[gate_pos] == 'True'

            df = pd.read_csv(file)
            if not ('true_price' in df.columns and 'predicted_price' in df.columns):
                print(f"Файл {filename} не содержит колонок true_price или predicted_price")
                continue

            true = df['true_price'].values
            pred = df['predicted_price'].values
            errors = true - pred
            squared_errors = errors**2

            predictions.append({
                'model': model,
                'dataset_type': dataset_type,
                'embedding': embedding,
                'pca': pca,
                'gate': gate,
                'r2': 1 - np.sum(squared_errors) / np.sum((true - np.mean(true))**2),
                'rmse': np.sqrt(np.mean(squared_errors)),
                'smape': 100 * np.mean(2 * np.abs(errors) / (np.abs(pred) + np.abs(true))),
                'medape': 100 * np.median(np.abs(errors / true)),
                'file': filename
            })

        except Exception as e:
            print(f"Ошибка при обработке файла {file}: {str(e)}")
            continue

    return pd.DataFrame(predictions)

def compare_embeddings(cv_metrics_df):
    """Compare embedding techniques for every (model, dataset type, metric).

    For each group the five fold scores of every embedding form one column of
    a (5, n_embeddings) matrix. The matrix is ranked per fold with
    ``calculate_ranks`` (rank 1 = smallest score), tested with
    ``friedman_test`` and given a Nemenyi critical difference.

    An embedding that does not have exactly one row in its group is reported
    and left out of the comparison entirely, so it can no longer enter the
    ranking as a column of zeros. Groups with fewer than two usable embeddings
    are skipped.

    Args:
        cv_metrics_df: DataFrame with 'model', 'dataset_type', 'metric',
            'embedding' and 'fold_0'..'fold_4' columns.

    Returns:
        DataFrame with one row per compared embedding: 'model',
        'dataset_type', 'metric', 'embedding', 'mean_score', 'std_score',
        'rank', 'friedman_stat', 'friedman_p' and 'nemenyi_cd'.
    """
    fold_cols = [f'fold_{i}' for i in range(5)]
    results = []

    groups = cv_metrics_df.groupby(['model', 'dataset_type', 'metric'])

    for (model, dataset_type, metric), group in groups:
        if len(group['embedding'].unique()) < 2:
            print(f"Недостаточно техник эмбеддинга для сравнения: model={model}, dataset={dataset_type}, metric={metric}")
            continue

        # Keep only embeddings with exactly one row of fold scores.
        embeddings = []
        columns = []
        for emb in group['embedding'].unique():
            emb_data = group[group['embedding'] == emb]
            if len(emb_data) != 1:
                print(f"Ожидалась 1 строка, получено {len(emb_data)} для model={model}, dataset={dataset_type}, metric={metric}, embedding={emb}")
                continue
            embeddings.append(emb)
            columns.append(emb_data.iloc[0][fold_cols].to_numpy(dtype=float))

        n_embeddings = len(embeddings)
        if n_embeddings < 2:
            print(f"Недостаточно техник эмбеддинга для сравнения: model={model}, dataset={dataset_type}, metric={metric}")
            continue

        # Rows are folds, columns are embeddings.
        data = np.column_stack(columns)

        ranks = calculate_ranks(data)
        avg_ranks = np.mean(ranks, axis=0)

        friedman_stat, friedman_p = friedman_test(data)

        # The critical difference is only meaningful when the test produced a result.
        cd = nemenyi_posthoc(avg_ranks, n_embeddings, 5) if not np.isnan(friedman_stat) else np.nan

        for i, emb in enumerate(embeddings):
            results.append({
                'model': model,
                'dataset_type': dataset_type,
                'metric': metric,
                'embedding': emb,
                'mean_score': np.mean(data[:, i]),
                'std_score': np.std(data[:, i]),
                'rank': avg_ranks[i],
                'friedman_stat': friedman_stat,
                'friedman_p': friedman_p,
                'nemenyi_cd': cd
            })

    return pd.DataFrame(results)

def find_best_embeddings(embedding_comparison):
    """Select the best embedding per (model, dataset type, metric).

    Selection rules per group:

    * Friedman p-value is NaN (test could not be computed): the embedding with
      the best mean score is chosen.
    * Friedman p-value >= 0.05: the embedding with the best mean score is
      chosen.
    * Otherwise: among the embeddings whose average rank lies within the
      Nemenyi critical difference of the best rank, the one with the best mean
      score is chosen.

    "Best mean" is the highest mean for 'r2' and the lowest for any other
    metric. Ranks are ascending (rank 1 = smallest score), so the best rank is
    the largest one for 'r2' and the smallest one for error metrics.

    Args:
        embedding_comparison: DataFrame with 'model', 'dataset_type',
            'metric', 'embedding', 'mean_score', 'rank', 'friedman_p' and
            'nemenyi_cd' columns.

    Returns:
        DataFrame with 'model', 'dataset_type', 'metric', 'best_embedding',
        'reason', 'mean_score' and 'friedman_p'.
    """
    best_embeddings = []

    groups = embedding_comparison.groupby(['model', 'dataset_type', 'metric'])

    for (model, dataset_type, metric), group in groups:
        higher_is_better = metric == 'r2'
        friedman_p = group['friedman_p'].iloc[0]

        if pd.isna(friedman_p):
            # NaN compares False with everything; without this branch the
            # significance path would run with a NaN critical difference.
            candidates = group
            reason = 'best_mean (Friedman test unavailable)'
        elif friedman_p >= 0.05:
            candidates = group
            reason = 'best_mean (no significant difference)'
        else:
            cd = group['nemenyi_cd'].iloc[0]
            ranks = group['rank']
            # Distance of every embedding from the best-ranked one.
            if higher_is_better:
                distance = ranks.max() - ranks
            else:
                distance = ranks - ranks.min()
            candidates = group[distance <= cd]
            reason = f'statistically best (p={friedman_p:.3f}, CD={cd:.2f})'

        scores = candidates['mean_score']
        best = candidates.loc[scores.idxmax() if higher_is_better else scores.idxmin()]

        best_embeddings.append({
            'model': model,
            'dataset_type': dataset_type,
            'metric': metric,
            'best_embedding': best['embedding'],
            'reason': reason,
            'mean_score': best['mean_score'],
            'friedman_p': best['friedman_p']
        })

    return pd.DataFrame(best_embeddings)

def compare_datasets(cv_metrics_df, best_embeddings):
    """Compare dataset types per (model, metric) using the best embeddings.

    For every row of ``best_embeddings`` the matching CV metrics row (first
    match) is collected. Per (model, metric) the five fold scores of every
    dataset type form one column of a (5, n_datasets) matrix, which is ranked
    per fold with ``calculate_ranks`` (rank 1 = smallest score), tested with
    ``friedman_test`` and given a Nemenyi critical difference.

    A dataset type that does not have exactly one row in its group is reported
    and left out of the comparison, so it cannot enter the ranking as a column
    of zeros. Groups with fewer than two usable dataset types are skipped, and
    an empty DataFrame is returned when no metrics could be matched at all.

    Args:
        cv_metrics_df: DataFrame with 'model', 'dataset_type', 'metric',
            'embedding' and 'fold_0'..'fold_4' columns.
        best_embeddings: DataFrame with 'model', 'dataset_type', 'metric' and
            'best_embedding' columns.

    Returns:
        DataFrame with one row per compared dataset type: 'model', 'metric',
        'dataset_type', 'mean_score', 'std_score', 'rank', 'friedman_stat',
        'friedman_p' and 'nemenyi_cd'.
    """
    fold_cols = [f'fold_{i}' for i in range(5)]
    results = []

    # Keep only the metrics rows that belong to the best embedding.
    filtered_metrics = []

    for _, row in best_embeddings.iterrows():
        mask = (
            (cv_metrics_df['model'] == row['model']) &
            (cv_metrics_df['dataset_type'] == row['dataset_type']) &
            (cv_metrics_df['metric'] == row['metric']) &
            (cv_metrics_df['embedding'] == row['best_embedding'])
        )

        filtered = cv_metrics_df[mask]
        if not filtered.empty:
            filtered_metrics.append(filtered.iloc[0])
        else:
            print(f"Не найдены метрики для model={row['model']}, dataset={row['dataset_type']}, metric={row['metric']}, embedding={row['best_embedding']}")

    if not filtered_metrics:
        # An empty frame has no 'model'/'metric' columns to group by.
        return pd.DataFrame(results)

    filtered_metrics_df = pd.DataFrame(filtered_metrics)

    groups = filtered_metrics_df.groupby(['model', 'metric'])

    for (model, metric), group in groups:
        if len(group['dataset_type'].unique()) < 2:
            print(f"Недостаточно типов датасетов для сравнения: model={model}, metric={metric}")
            continue

        # Keep only dataset types with exactly one row of fold scores.
        datasets = []
        columns = []
        for ds in group['dataset_type'].unique():
            ds_data = group[group['dataset_type'] == ds]
            if len(ds_data) != 1:
                print(f"Ожидалась 1 строка, получено {len(ds_data)} для model={model}, metric={metric}, dataset={ds}")
                continue
            datasets.append(ds)
            columns.append(ds_data.iloc[0][fold_cols].to_numpy(dtype=float))

        n_datasets = len(datasets)
        if n_datasets < 2:
            print(f"Недостаточно типов датасетов для сравнения: model={model}, metric={metric}")
            continue

        # Rows are folds, columns are dataset types.
        data = np.column_stack(columns)

        ranks = calculate_ranks(data)
        avg_ranks = np.mean(ranks, axis=0)

        friedman_stat, friedman_p = friedman_test(data)

        # The critical difference is only meaningful when the test produced a result.
        cd = nemenyi_posthoc(avg_ranks, n_datasets, 5) if not np.isnan(friedman_stat) else np.nan

        for i, ds in enumerate(datasets):
            results.append({
                'model': model,
                'metric': metric,
                'dataset_type': ds,
                'mean_score': np.mean(data[:, i]),
                'std_score': np.std(data[:, i]),
                'rank': avg_ranks[i],
                'friedman_stat': friedman_stat,
                'friedman_p': friedman_p,
                'nemenyi_cd': cd
            })

    return pd.DataFrame(results)

def find_best_datasets(dataset_comparison):
    """Select the best dataset type per (model, metric).

    Selection rules per group:

    * Friedman p-value is NaN (test could not be computed): the dataset type
      with the best mean score is chosen.
    * Friedman p-value >= 0.05: the dataset type with the best mean score is
      chosen.
    * Otherwise: among the dataset types whose average rank lies within the
      Nemenyi critical difference of the best rank, the one with the best mean
      score is chosen.

    "Best mean" is the highest mean for 'r2' and the lowest for any other
    metric. Ranks are ascending (rank 1 = smallest score), so the best rank is
    the largest one for 'r2' and the smallest one for error metrics.

    Args:
        dataset_comparison: DataFrame with 'model', 'metric', 'dataset_type',
            'mean_score', 'rank', 'friedman_p' and 'nemenyi_cd' columns.

    Returns:
        DataFrame with 'model', 'metric', 'best_dataset', 'reason',
        'mean_score' and 'friedman_p'.
    """
    best_datasets = []

    groups = dataset_comparison.groupby(['model', 'metric'])

    for (model, metric), group in groups:
        higher_is_better = metric == 'r2'
        friedman_p = group['friedman_p'].iloc[0]

        if pd.isna(friedman_p):
            # NaN compares False with everything; without this branch the
            # significance path would run with a NaN critical difference.
            candidates = group
            reason = 'best_mean (Friedman test unavailable)'
        elif friedman_p >= 0.05:
            candidates = group
            reason = 'best_mean (no significant difference)'
        else:
            cd = group['nemenyi_cd'].iloc[0]
            ranks = group['rank']
            # Distance of every dataset type from the best-ranked one.
            if higher_is_better:
                distance = ranks.max() - ranks
            else:
                distance = ranks - ranks.min()
            candidates = group[distance <= cd]
            reason = f'statistically best (p={friedman_p:.3f}, CD={cd:.2f})'

        scores = candidates['mean_score']
        best = candidates.loc[scores.idxmax() if higher_is_better else scores.idxmin()]

        best_datasets.append({
            'model': model,
            'metric': metric,
            'best_dataset': best['dataset_type'],
            'reason': reason,
            'mean_score': best['mean_score'],
            'friedman_p': best['friedman_p']
        })

    return pd.DataFrame(best_datasets)

def compare_models(cv_metrics_df, best_embeddings, best_datasets):
    """Compare models against each other using their best configurations.

    For every row of ``best_datasets`` (one best dataset type per model and
    metric) the matching best embedding is looked up in ``best_embeddings``
    and the corresponding cross-validation row is taken from
    ``cv_metrics_df``. The selected rows are then grouped by metric and the
    models are compared fold by fold with the Friedman test and the Nemenyi
    critical difference.

    Args:
        cv_metrics_df: per-configuration CV scores with the columns ``model``,
            ``dataset_type``, ``metric``, ``embedding`` and ``fold_0`` ..
            ``fold_4``.
        best_embeddings: DataFrame with ``model``, ``dataset_type``,
            ``metric`` and ``best_embedding``.
        best_datasets: DataFrame with ``model``, ``metric`` and
            ``best_dataset``.

    Returns:
        DataFrame with one row per (metric, model): ``mean_score``,
        ``std_score``, ``rank``, ``friedman_stat``, ``friedman_p`` and
        ``nemenyi_cd``. Empty when no metric has at least two comparable
        models.
    """
    n_folds = 5
    fold_columns = [f'fold_{k}' for k in range(n_folds)]
    results = []

    # Keep only the metrics of the best configuration (embedding + dataset).
    filtered_metrics = []

    for _, row in best_datasets.iterrows():
        # Best embedding for this model / dataset / metric.
        best_emb = best_embeddings[
            (best_embeddings['model'] == row['model']) &
            (best_embeddings['dataset_type'] == row['best_dataset']) &
            (best_embeddings['metric'] == row['metric'])
        ]

        if best_emb.empty:
            print(f"Не найден лучший эмбеддинг для model={row['model']}, dataset={row['best_dataset']}, metric={row['metric']}")
            continue

        best_emb = best_emb.iloc[0]

        # Matching cross-validation metrics.
        mask = (
            (cv_metrics_df['model'] == row['model']) &
            (cv_metrics_df['dataset_type'] == row['best_dataset']) &
            (cv_metrics_df['metric'] == row['metric']) &
            (cv_metrics_df['embedding'] == best_emb['best_embedding'])
        )

        filtered = cv_metrics_df[mask]
        if not filtered.empty:
            filtered_metrics.append(filtered.iloc[0])
        else:
            print(f"Не найдены метрики для model={row['model']}, dataset={row['best_dataset']}, metric={row['metric']}, embedding={best_emb['best_embedding']}")

    # Nothing was selected: a DataFrame built from an empty list has no
    # 'metric' column, so grouping it would raise KeyError.
    if not filtered_metrics:
        return pd.DataFrame(results)

    filtered_metrics_df = pd.DataFrame(filtered_metrics)

    for metric, group in filtered_metrics_df.groupby('metric'):
        # Skip metrics that do not have several models to compare.
        if len(group['model'].unique()) < 2:
            print(f"Недостаточно моделей для сравнения по метрике: {metric}")
            continue

        # Collect one column of fold scores per model. Models without exactly
        # one row are left out entirely instead of contributing an all-zero
        # column to the test.
        models = []
        columns = []
        for model in group['model'].unique():
            model_data = group[group['model'] == model]
            if len(model_data) != 1:
                print(f"Ожидалась 1 строка, получено {len(model_data)} для metric={metric}, model={model}")
                continue

            models.append(model)
            columns.append(model_data.iloc[0][fold_columns].to_numpy(dtype=float))

        if len(models) < 2:
            print(f"Недостаточно моделей для сравнения по метрике: {metric}")
            continue

        n_models = len(models)
        data = np.column_stack(columns)  # shape: (n_folds, n_models)

        # Per-fold ranks and their average per model.
        ranks = calculate_ranks(data)
        avg_ranks = np.mean(ranks, axis=0)

        # Friedman test.
        friedman_stat, friedman_p = friedman_test(data)

        # Nemenyi critical difference (undefined when the Friedman test failed).
        cd = nemenyi_posthoc(avg_ranks, n_models, n_folds) if not np.isnan(friedman_stat) else np.nan

        for i, model in enumerate(models):
            results.append({
                'metric': metric,
                'model': model,
                'mean_score': np.mean(data[:, i]),
                'std_score': np.std(data[:, i]),
                'rank': avg_ranks[i],
                'friedman_stat': friedman_stat,
                'friedman_p': friedman_p,
                'nemenyi_cd': cd
            })

    return pd.DataFrame(results)

def find_best_models(model_comparison):
    """Pick the best model for every metric.

    Args:
        model_comparison: DataFrame with one row per (metric, model) holding
            the columns ``metric``, ``model``, ``mean_score``, ``rank``,
            ``friedman_p`` and ``nemenyi_cd``.

    Returns:
        DataFrame with the columns ``metric``, ``best_model``, ``reason``,
        ``mean_score`` and ``friedman_p`` (one row per metric). An empty input
        yields an empty DataFrame.

    Selection rule:
        * Friedman test not significant (p >= 0.05), or the p-value / critical
          difference is NaN (the test could not be computed): the model with
          the best mean score wins.
        * Friedman test significant: only models whose average rank lies
          within the Nemenyi critical difference of the lowest rank are
          considered, and the best mean score among them wins.

    "Best" means the highest mean for ``r2`` and the lowest mean for every
    other metric.
    """
    best_models = []

    # An empty comparison has no 'metric' column to group by.
    if model_comparison.empty:
        return pd.DataFrame(best_models)

    for metric, group in model_comparison.groupby('metric'):
        friedman_p = group['friedman_p'].iloc[0]
        cd = group['nemenyi_cd'].iloc[0]

        # NaN compares False against everything, so it must be handled
        # explicitly; otherwise it would be treated as "significant" and the
        # rank filter below would produce an empty candidate set.
        significant = pd.notna(friedman_p) and pd.notna(cd) and friedman_p < 0.05

        if significant:
            # All models whose rank is within CD of the minimum rank.
            candidates = group[group['rank'] - group['rank'].min() <= cd]
        else:
            candidates = group

        scores = candidates['mean_score']
        best = candidates.loc[scores.idxmax() if metric == 'r2' else scores.idxmin()]

        if significant:
            reason = f'statistically best (p={best["friedman_p"]:.3f}, CD={cd:.2f})'
        else:
            reason = 'best_mean (no significant difference)'

        best_models.append({
            'metric': metric,
            'best_model': best['model'],
            'reason': reason,
            'mean_score': best['mean_score'],
            'friedman_p': best['friedman_p']
        })

    return pd.DataFrame(best_models)

def compare_test_predictions(test_predictions_df, best_embeddings, best_datasets):
    """Compare test-set predictions of the best configurations, model vs model.

    For every row of ``best_datasets`` the best embedding is looked up and the
    first matching row of ``test_predictions_df`` is selected (rows that only
    differ in other settings such as PCA or gate are not distinguished). Every
    unordered pair of models is then compared for each metric with a paired
    t-test (``stats.ttest_rel``) on per-sample losses, used here as a
    Diebold-Mariano style test.

    The configuration used for a metric is the one selected for that metric in
    ``best_datasets``; if a model has none for that metric, its first selected
    configuration is used instead.

    Per-sample losses:
        * ``r2``: squared error
        * ``rmse``: absolute error
        * ``smape`` / ``medape``: absolute percentage error ``|e / true|``

    Args:
        test_predictions_df: DataFrame with ``model``, ``dataset_type``,
            ``embedding``, ``file`` (CSV with ``true_price`` and
            ``predicted_price`` columns) and one score column per metric.
        best_embeddings: DataFrame with ``model``, ``dataset_type``,
            ``metric`` and ``best_embedding``.
        best_datasets: DataFrame with ``model``, ``metric`` and
            ``best_dataset``.

    Returns:
        DataFrame with ``metric``, ``model1``, ``model2``, ``model1_score``,
        ``model2_score``, ``dm_stat``, ``dm_p`` and ``significant``; empty when
        nothing could be compared. Failures for a single pair are printed and
        skipped (best effort).
    """
    from itertools import combinations

    results = []

    # (model, metric) -> selected prediction row, plus the first selected row
    # per model as a fallback. Dict order preserves first appearance of models.
    selected = {}
    default_rows = {}

    for _, row in best_datasets.iterrows():
        # Best embedding for this model / dataset / metric.
        best_emb = best_embeddings[
            (best_embeddings['model'] == row['model']) &
            (best_embeddings['dataset_type'] == row['best_dataset']) &
            (best_embeddings['metric'] == row['metric'])
        ]

        if best_emb.empty:
            print(f"Не найден лучший эмбеддинг для model={row['model']}, dataset={row['best_dataset']}, metric={row['metric']}")
            continue

        best_emb = best_emb.iloc[0]

        # Matching test-set predictions.
        mask = (
            (test_predictions_df['model'] == row['model']) &
            (test_predictions_df['dataset_type'] == row['best_dataset']) &
            (test_predictions_df['embedding'] == best_emb['best_embedding'])
        )

        filtered = test_predictions_df[mask]
        if filtered.empty:
            print(f"Не найдены предсказания для model={row['model']}, dataset={row['best_dataset']}, embedding={best_emb['best_embedding']}")
            continue

        # Take the first matching configuration (PCA and gate may differ).
        chosen = filtered.iloc[0]
        selected.setdefault((row['model'], row['metric']), chosen)
        default_rows.setdefault(row['model'], chosen)

    # Nothing selected: there are no models to pair up.
    if not default_rows:
        return pd.DataFrame(results)

    metrics = ['r2', 'rmse', 'smape', 'medape']
    model_pairs = list(combinations(default_rows, 2))

    # Each prediction file is read once and reused across metrics and pairs.
    loaded = {}

    def _load_predictions(path):
        """Return (true, predicted) arrays for a prediction CSV, cached by path."""
        if path not in loaded:
            frame = pd.read_csv(path)
            loaded[path] = (frame['true_price'].values, frame['predicted_price'].values)
        return loaded[path]

    for metric in metrics:
        for model1, model2 in model_pairs:
            pred1 = selected.get((model1, metric), default_rows[model1])
            pred2 = selected.get((model2, metric), default_rows[model2])

            try:
                # Ground truth is taken from the first model's file.
                # NOTE(review): both files are assumed to list the same test
                # rows in the same order; only the length is verified here.
                true, pred1_vals = _load_predictions(pred1['file'])
                _, pred2_vals = _load_predictions(pred2['file'])
                if len(pred2_vals) != len(true):
                    raise ValueError(
                        f"prediction files differ in length: {len(true)} vs {len(pred2_vals)}"
                    )

                # Per-sample losses for the test.
                if metric == 'r2':
                    loss1 = (true - pred1_vals)**2
                    loss2 = (true - pred2_vals)**2
                elif metric == 'rmse':
                    loss1 = np.abs(true - pred1_vals)
                    loss2 = np.abs(true - pred2_vals)
                elif metric in ['smape', 'medape']:
                    loss1 = np.abs((true - pred1_vals) / true)
                    loss2 = np.abs((true - pred2_vals) / true)

                dm_stat, dm_p = stats.ttest_rel(loss1, loss2)

                results.append({
                    'metric': metric,
                    'model1': model1,
                    'model2': model2,
                    'model1_score': pred1[metric],
                    'model2_score': pred2[metric],
                    'dm_stat': dm_stat,
                    'dm_p': dm_p,
                    'significant': dm_p < 0.05
                })

            except Exception as e:
                # Best effort: report the failing pair and keep going.
                print(f"Ошибка при сравнении {model1} vs {model2} по метрике {metric}: {str(e)}")
                continue

    return pd.DataFrame(results)

def generate_html_report(best_embeddings, best_datasets, best_models,
                        embedding_comparison, dataset_comparison, model_comparison,
                        test_comparison, cv_metrics_df, test_predictions_df):
    """Генерирует HTML отчет с результатами анализа"""

    # Создаем графики
    plots = {}

    # 1. Графики сравнения эмбеддингов
    for (model, dataset_type), group in embedding_comparison.groupby(['model', 'dataset_type']):
        for metric in group['metric'].unique():
            subset = group[group['metric'] == metric]
            plt.figure(figsize=(10, 6))
            sns.barplot(x='embedding', y='mean_score', data=subset)
            plt.title(f'{model} - {dataset_type} - {metric}\nFriedman p={subset["friedman_p"].iloc[0]:.3f}')
            plt.tight_layout()

            plot_name = f'embedding_{model}_{dataset_type}_{metric}'.replace('-', '_')
            plots[plot_name] = plt.gcf()
            plt.close()

    # 2. Графики сравнения датасетов
    for model, group in dataset_comparison.groupby('model'):
        for metric in group['metric'].unique():
            subset = group[group['metric'] == metric]
            plt.figure(figsize=(10, 6))
            sns.barplot(x='dataset_type', y='mean_score', data=subset)
            plt.title(f'{model} - {metric}\nFriedman p={subset["friedman_p"].iloc[0]:.3f}')
            plt.tight_layout()

            plot_name = f'dataset_{model}_{metric}'.replace('-', '_')
            plots[plot_name] = plt.gcf()
            plt.close()

    # 3. Графики сравнения моделей
    for metric in model_comparison['metric'].unique():
        subset = model_comparison[model_comparison['metric'] == metric]
        plt.figure(figsize=(10, 6))
        sns.barplot(x='model', y='mean_score', data=subset)
        plt.title(f'Model comparison - {metric}\nFriedman p={subset["friedman_p"].iloc[0]:.3f}')
        plt.tight_layout()

        plot_name = f'model_{metric}'
        plots[plot_name] = plt.gcf()
        plt.close()

    # Сохраняем графики в файлы
    plot_files = {}
    for name, fig in plots.items():
        filename = f'{name}.png'
        fig.savefig(filename)
        plot_files[name] = filename

    # Подготавливаем данные для отчета
    report_data = {
        'best_embeddings': best_embeddings.to_dict('records'),
        'best_datasets': best_datasets.to_dict('records'),
        'best_models': best_models.to_dict('records'),
        'test_comparison': test_comparison.to_dict('records'),
        'plot_files': plot_files
    }

    # HTML шаблон
    html_template = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Model Comparison Report</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            h1 { color: #2c3e50; }
            h2 { color: #3498db; margin-top: 30px; }
            h3 { color: #16a085; }
            table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }
            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
            th { background-color: #f2f2f2; }
            tr:nth-child(even) { background-color: #f9f9f9; }
            .plot { margin: 20px 0; text-align: center; }
            .plot img { max-width: 80%; border: 1px solid #ddd; }
            .significant { color: #e74c3c; font-weight: bold; }
        </style>
    </head>
    <body>
        <h1>Statistical Model Comparison Report</h1>

        <h2>1. Best Embedding Techniques</h2>
        <table>
            <tr>
                <th>Model</th>
                <th>Dataset Type</th>
                <th>Metric</th>
                <th>Best Embedding</th>
                <th>Mean Score</th>
                <th>Friedman p-value</th>
                <th>Selection Reason</th>
            </tr>
            {% for item in best_embeddings %}
            <tr>
                <td>{{ item.model }}</td>
                <td>{{ item.dataset_type }}</td>
                <td>{{ item.metric }}</td>
                <td>{{ item.best_embedding }}</td>
                <td>{{ "%.4f"|format(item.mean_score) }}</td>
                <td>{{ "%.3f"|format(item.friedman_p) }}</td>
                <td>{{ item.reason }}</td>
            </tr>
            {% endfor %}
        </table>

        {% for plot in plot_files if plot.startswith('embedding_') %}
        <div class="plot">
            <img src="{{ plot_files[plot] }}" alt="{{ plot }}">
        </div>
        {% endfor %}

        <h2>2. Best Dataset Types</h2>
        <table>
            <tr>
                <th>Model</th>
                <th>Metric</th>
                <th>Best Dataset</th>
                <th>Mean Score</th>
                <th>Friedman p-value</th>
                <th>Selection Reason</th>
            </tr>
            {% for item in best_datasets %}
            <tr>
                <td>{{ item.model }}</td>
                <td>{{ item.metric }}</td>
                <td>{{ item.best_dataset }}</td>
                <td>{{ "%.4f"|format(item.mean_score) }}</td>
                <td>{{ "%.3f"|format(item.friedman_p) }}</td>
                <td>{{ item.reason }}</td>
            </tr>
            {% endfor %}
        </table>

        {% for plot in plot_files if plot.startswith('dataset_') %}
        <div class="plot">
            <img src="{{ plot_files[plot] }}" alt="{{ plot }}">
        </div>
        {% endfor %}

        <h2>3. Best Models</h2>
        <table>
            <tr>
                <th>Metric</th>
                <th>Best Model</th>
                <th>Mean Score</th>
                <th>Friedman p-value</th>
                <th>Selection Reason</th>
            </tr>
            {% for item in best_models %}
            <tr>
                <td>{{ item.metric }}</td>
                <td>{{ item.best_model }}</td>
                <td>{{ "%.4f"|format(item.mean_score) }}</td>
                <td>{{ "%.3f"|format(item.friedman_p) }}</td>
                <td>{{ item.reason }}</td>
            </tr>
            {% endfor %}
        </table>

        {% for plot in plot_files if plot.startswith('model_') %}
        <div class="plot">
            <img src="{{ plot_files[plot] }}" alt="{{ plot }}">
        </div>
        {% endfor %}

        <h2>4. Test Set Comparisons</h2>
        <table>
            <tr>
                <th>Metric</th>
                <th>Model 1</th>
                <th>Model 2</th>
                <th>Model 1 Score</th>
                <th>Model 2 Score</th>
                <th>DM p-value</th>
                <th>Significant</th>
            </tr>
            {% for item in test_comparison %}
            <tr>
                <td>{{ item.metric }}</td>
                <td>{{ item.model1 }}</td>
                <td>{{ item.model2 }}</td>


# Tokens

In [None]:
import os
import json
import torch
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, AdamW
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional
import warnings

warnings.filterwarnings('ignore')

class RuBertTiny2Embedder(BaseEstimator, TransformerMixin):
    """Text embedder that fine-tunes a BERT-style encoder on a price target.

    ``fit`` loads the pretrained encoder and, when targets are given, trains
    it together with a linear regression head using MSE loss. ``transform``
    returns the pooled sentence vectors (the input of the regression head).

    Supported pooling types:
        * ``"cls"``: hidden state of the first token
        * ``"mean"``: attention-mask-weighted mean of the hidden states
        * ``"max"``: element-wise maximum over non-padding tokens
        * ``"weighted"``: learned attention pooling (also enables
          ``get_token_importance``)
    """

    DEFAULT_MODEL_NAME = "cointegrated/rubert-tiny2"  # default pretrained checkpoint

    # The embedder's own settings are stored separately from the encoder's
    # ``config.json``, which ``save_pretrained`` writes into the same folder.
    EMBEDDER_CONFIG_FILE = 'embedder_config.json'
    # Older checkpoints stored the embedder settings in ``config.json``.
    LEGACY_CONFIG_FILE = 'config.json'
    HEAD_WEIGHTS_FILE = 'head_weights.pt'

    def __init__(
        self,
        max_length: int = 512,
        batch_size: int = 128,
        learning_rate: float = 2e-5,
        num_epochs: int = 5,
        device: Optional[str] = None,
        use_cv: bool = False,
        pooling_type: str = "cls",  # "cls", "mean", "max", "weighted"
        model_name: Optional[str] = None  # optional override of the checkpoint
    ):
        """Store hyperparameters; no model is loaded until ``fit`` or ``load``.

        Args:
            max_length: tokenizer truncation / padding length.
            batch_size: batch size for training and inference.
            learning_rate: optimizer learning rate.
            num_epochs: number of fine-tuning epochs.
            device: torch device string; defaults to CUDA when available.
            use_cv: stored and saved with the config; not read by this class.
            pooling_type: one of ``cls``, ``mean``, ``max``, ``weighted``.
            model_name: checkpoint name or path; defaults to
                ``DEFAULT_MODEL_NAME``.
        """
        assert pooling_type in ["cls", "mean", "max", "weighted"], \
            f"pooling_type must be one of: cls, mean, max, weighted. Got {pooling_type}"

        self.max_length = max_length
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.use_cv = use_cv
        self.pooling_type = pooling_type
        self.model_name = model_name or self.DEFAULT_MODEL_NAME
        self.model = None
        self.tokenizer = None

    class _PricePredictionModel(nn.Module):
        """Encoder + pooling + single-output linear regression head."""

        def __init__(self, model_name: str, pooling_type: str):
            super().__init__()
            self.bert = AutoModel.from_pretrained(model_name)
            self.pooling_type = pooling_type
            self.hidden_size = self.bert.config.hidden_size
            self.regressor = nn.Linear(self.hidden_size, 1)

            if self.pooling_type == "weighted":
                # Scores every token; softmax over the sequence gives weights.
                self.attention = nn.Sequential(
                    nn.Linear(self.hidden_size, self.hidden_size),
                    nn.Tanh(),
                    nn.Linear(self.hidden_size, 1)
                )

        def forward(self, input_ids, attention_mask):
            """Return ``(prediction, pooled_embedding)`` for a batch."""
            outputs = self.bert(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_attentions=False,
                output_hidden_states=False
            )
            last_hidden = outputs.last_hidden_state

            if self.pooling_type == "cls":
                pooled = last_hidden[:, 0, :]

            elif self.pooling_type == "mean":
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
                sum_embeddings = torch.sum(last_hidden * input_mask_expanded, dim=1)
                sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                pooled = sum_embeddings / sum_mask

            elif self.pooling_type == "max":
                # Out-of-place masking: padding positions become -inf so they
                # never win the max, without mutating the encoder output.
                padding = attention_mask.unsqueeze(-1) == 0
                pooled = last_hidden.masked_fill(padding, -torch.inf).max(dim=1)[0]

            elif self.pooling_type == "weighted":
                weights = self.attention(last_hidden).squeeze(-1)
                weights = weights.masked_fill(attention_mask == 0, -torch.inf)
                weights = torch.softmax(weights, dim=1)
                pooled = torch.sum(last_hidden * weights.unsqueeze(-1), dim=1)

            return self.regressor(pooled), pooled

    class _TextPriceDataset(Dataset):
        """Tokenizes texts on access and pairs them with float prices."""

        def __init__(self, texts, prices, tokenizer, max_length):
            self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
            self.prices = prices.tolist() if hasattr(prices, 'tolist') else list(prices)
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            encoding = self.tokenizer(
                str(self.texts[idx]),
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'price': torch.tensor(float(self.prices[idx]), dtype=torch.float32)
            }

    def fit(self, X, y=None):
        """Load the pretrained encoder and fine-tune it when ``y`` is given.

        Args:
            X: texts (list, array, pandas Series, or first DataFrame column).
            y: regression targets; when ``None`` the pretrained weights are
                used without training.

        Returns:
            self
        """
        X = self._convert_to_list(X)
        y = self._convert_to_list(y) if y is not None else None

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = self._PricePredictionModel(
            model_name=self.model_name,
            pooling_type=self.pooling_type
        ).to(self.device)

        if y is not None:
            self._train_model(X, y)

        return self

    def get_token_importance(self, texts, top_n=20):
        """Return attention-pooling weights of the most important tokens.

        Only available for ``pooling_type="weighted"``.

        Args:
            texts: iterable of texts.
            top_n: number of highest-weighted tokens taken per text before
                filtering.

        Returns:
            One list per text with ``(token, weight)`` pairs ordered by
            descending weight. The ``##`` subword marker is removed and tokens
            shorter than 4 characters are dropped after the top-``top_n`` cut,
            so a list may hold fewer than ``top_n`` items.

        Raises:
            ValueError: if the model is not fitted or pooling is not weighted.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not trained yet. Call fit() first.")
        if self.pooling_type != "weighted":
            raise ValueError("Token importance доступна только для weighted pooling")

        self.model.eval()
        all_importances = []

        for text in texts:
            # Same str() coercion as the training dataset applies.
            enc = self.tokenizer(
                str(text),
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).to(self.device)

            with torch.no_grad():
                last_hidden = self.model.bert(
                    input_ids=enc['input_ids'],
                    attention_mask=enc['attention_mask']
                ).last_hidden_state  # (1, seq_len, hidden_size)

                # Raw attention scores -> softmax weights over real tokens.
                scores = self.model.attention(last_hidden)         # (1, seq_len, 1)
                scores = scores.squeeze(-1)                        # (1, seq_len)
                scores = scores.masked_fill(enc['attention_mask'] == 0, -1e9)
                weights = torch.softmax(scores, dim=1).cpu().numpy()[0]  # (seq_len,)

            tokens = self.tokenizer.convert_ids_to_tokens(enc['input_ids'][0])
            token_scores = list(zip(tokens, weights))

            top = sorted(token_scores, key=lambda x: x[1], reverse=True)[:top_n]
            top = [(t.replace('##', ''), float(w)) for t, w in top if len(t.replace('##', '')) >= 4]
            all_importances.append(top)

        return all_importances

    def _convert_to_list(self, data):
        """Convert pandas / numpy / iterable input to a plain list.

        For a 2-D pandas object only the first column is used.
        """
        if data is None:
            return None
        if hasattr(data, 'iloc'):  # pandas DataFrame/Series
            return data.iloc[:, 0].tolist() if data.ndim > 1 else data.tolist()
        if hasattr(data, 'tolist'):
            return data.tolist()
        return list(data)

    def _train_model(self, X, y):
        """Fine-tune encoder and head with MSE loss; prints mean loss per epoch."""
        dataset = self._TextPriceDataset(X, y, self.tokenizer, self.max_length)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        loss_fn = nn.MSELoss()

        self.model.train()
        for epoch in range(self.num_epochs):
            total_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()
                inputs = {k: v.to(self.device) for k, v in batch.items()}
                predictions, _ = self.model(inputs['input_ids'], inputs['attention_mask'])
                # squeeze(-1) keeps the batch dimension even for a final batch
                # of size 1, so predictions and targets always share a shape.
                loss = loss_fn(predictions.squeeze(-1), inputs['price'])
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            print(f"Epoch {epoch+1}/{self.num_epochs} - Loss: {total_loss/len(dataloader):.4f}")

    def transform(self, X):
        """Return pooled embeddings for the input texts.

        Args:
            X: texts (list, array, pandas Series, or first DataFrame column).

        Returns:
            ``numpy.ndarray`` of shape ``(n_texts, hidden_size)``; an empty
            input gives an array with zero rows.

        Raises:
            RuntimeError: if called before ``fit`` or ``load``.
        """
        X = self._convert_to_list(X)

        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model not trained. Call fit() first.")

        if not X:
            # np.concatenate cannot handle an empty list of batches.
            return np.empty((0, self.model.hidden_size), dtype=np.float32)

        # Dummy prices: the dataset requires a target but it is unused here.
        dataset = self._TextPriceDataset(X, [0]*len(X), self.tokenizer, self.max_length)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        self.model.eval()
        embeddings = []
        with torch.no_grad():
            for batch in dataloader:
                inputs = {k: v.to(self.device) for k, v in batch.items()}
                _, emb = self.model(inputs['input_ids'], inputs['attention_mask'])
                embeddings.append(emb.cpu().numpy())

        return np.concatenate(embeddings, axis=0)

    def save(self, path: str):
        """Save encoder, tokenizer, embedder settings and head weights to ``path``.

        Files written: the encoder and tokenizer files produced by
        ``save_pretrained``, ``embedder_config.json`` with this object's
        hyperparameters, and ``head_weights.pt`` with the regression head
        (plus the attention layer for weighted pooling).

        Raises:
            RuntimeError: if called before ``fit`` or ``load``.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model not trained. Call fit() first.")

        os.makedirs(path, exist_ok=True)

        self.model.bert.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

        config = {
            'max_length': self.max_length,
            'batch_size': self.batch_size,
            'learning_rate': self.learning_rate,
            'num_epochs': self.num_epochs,
            'device': str(self.device),
            'use_cv': self.use_cv,
            'pooling_type': self.pooling_type,
            'model_name': self.model_name
        }
        # Must not be named config.json: that file belongs to the encoder and
        # is needed by AutoModel.from_pretrained(path) when loading.
        with open(os.path.join(path, self.EMBEDDER_CONFIG_FILE), 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

        state_dict = {
            'regressor_state_dict': self.model.regressor.state_dict()
        }
        if self.pooling_type == "weighted":
            state_dict['attention_state_dict'] = self.model.attention.state_dict()

        torch.save(state_dict, os.path.join(path, self.HEAD_WEIGHTS_FILE))

    @classmethod
    def load(cls, path: str):
        """Restore an embedder previously written by ``save``.

        Settings are read from ``embedder_config.json``; if that file is
        missing, ``config.json`` is tried for checkpoints written by an older
        version of ``save``. A saved CUDA device falls back to CPU when CUDA is
        not available on the current machine.

        Args:
            path: directory passed to ``save``.

        Returns:
            A ready-to-use ``RuBertTiny2Embedder`` instance.
        """
        config_path = os.path.join(path, cls.EMBEDDER_CONFIG_FILE)
        if not os.path.exists(config_path):
            config_path = os.path.join(path, cls.LEGACY_CONFIG_FILE)
        with open(config_path, encoding='utf-8') as f:
            config = json.load(f)

        device = config['device']
        if str(device).startswith('cuda') and not torch.cuda.is_available():
            device = 'cpu'

        instance = cls(
            max_length=config['max_length'],
            batch_size=config['batch_size'],
            learning_rate=config['learning_rate'],
            num_epochs=config['num_epochs'],
            device=device,
            use_cv=config.get('use_cv', False),
            pooling_type=config['pooling_type'],
            model_name=config['model_name']  # keep the original checkpoint name
        )

        # Tokenizer and fine-tuned encoder come from the local folder.
        instance.tokenizer = AutoTokenizer.from_pretrained(path)
        instance.model = instance._PricePredictionModel(
            model_name=path,
            pooling_type=config['pooling_type']
        ).to(instance.device)

        # NOTE: torch.load unpickles the file; only load checkpoints from a
        # trusted source.
        head_weights = torch.load(
            os.path.join(path, cls.HEAD_WEIGHTS_FILE),
            map_location=instance.device
        )
        instance.model.regressor.load_state_dict(head_weights['regressor_state_dict'])
        if config['pooling_type'] == "weighted":
            instance.model.attention.load_state_dict(head_weights['attention_state_dict'])

        return instance

In [None]:
path = '/content/drive/MyDrive/price_prediction_data/raw_data_parsed_desc_preprocessed_full.csv'

df = pd.read_csv(path)

# Base preprocessing of the full dataset before splitting.
process = DataProcessingPipeline(
    df,
    log_needed=True,
    norm_needed=True,
    one_hot_only=True,
    train=True,
    use_hex_features=True,
    hex_resolution=10
)
df = process.preprocess_base()

# Price deciles are used only to stratify the split.
df['price_category'] = pd.qcut(df['price'], q=10, labels=False)

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['price_category']
)

# Fit the training pipeline on the training split only. Passing the full
# `df` here would let the scalers, outlier bounds and hex statistics see the
# test rows (data leakage).
train_process = DataProcessingPipeline(
    train_df,
    log_needed=True,
    norm_needed=True,
    one_hot_only=True,
    train=True,
    use_hex_features=True,
    hex_resolution=10
)
train_result = train_process.prepare_for_model()
train_processed = train_result['processed_df']

print(list(train_processed.columns))

# Process the test split with everything that was fitted on the training split.
test_process = DataProcessingPipeline(
    test_df,
    log_needed=True,
    norm_needed=True,
    one_hot_only=True,
    train=False,
    outlier_bounds=train_result['outlier_bounds'],
    scaler=train_result['scaler'],
    lat_long_scaler=train_result['lat_long_scaler'],
    use_hex_features=True,
    hex_resolution=10,
    hex_stats=train_process.hex_stats,
    global_median_ppsm=train_process.global_median_ppsm,
    global_median_ppland=train_process.global_median_ppland,
    global_median_price=train_process.global_median_price
)
# NOTE(review): unlike train_processed, this keeps the whole return value of
# prepare_for_model() rather than its 'processed_df' entry - confirm intended.
test_processed = test_process.prepare_for_model()

In [None]:
# Raw listing descriptions are the model input, the processed price column
# is the regression target.
X_train, y_train = train_processed['description_raw'], train_processed['price']

# Weighted (attention) pooling is required later for get_token_importance().
embedder_params = dict(
    max_length=512,
    batch_size=8,
    pooling_type='weighted',
    learning_rate=2.196386920803346e-05,
    num_epochs=5,
)
bert_embedder = RuBertTiny2Embedder(**embedder_params)

# Fine-tune on the training data.
bert_embedder.fit(X_train, y_train)



In [None]:
# bert_embedder = RuBertTiny2Embedder(
#     max_length=512,
#     batch_size=8,
#     pooling_type='weighted',
#     learning_rate=2.2e-5,
#     num_epochs=5      # just 1 epoch for testing
# )
# bert_embedder.fit(X_train, y_train)

In [None]:
def filter_tokens(token_weights):
    """Keep only informative tokens from a list of ``(token, weight)`` pairs.

    Dropped: the special tokens ``[CLS]``, ``[SEP]``, ``[PAD]``, ``[UNK]`` and
    ``[MASK]`` (exact, case-sensitive match), tokens shorter than 4 characters
    once the ``##`` marker is removed, and tokens made of digits only.

    Returns:
        List of ``(token, weight)`` pairs with the token lowercased and
        stripped of ``##``; weights are passed through unchanged.
    """
    service_tokens = ('[CLS]', '[SEP]', '[PAD]', '[UNK]', '[MASK]')
    kept = []
    for token, weight in token_weights:
        if token in service_tokens:
            continue
        cleaned = token.lower().replace('##', '')
        if len(cleaned) < 4 or cleaned.isdigit():
            continue
        kept.append((cleaned, weight))
    return kept

# Aggregate attention weights per token over the first 1000 training texts.
sample_texts = X_train.head(1000).tolist()
token_stats = {}

for text in tqdm(sample_texts, desc="Анализ важности токенов"):
    scored_tokens = bert_embedder.get_token_importance([text])[0]
    for token, weight in filter_tokens(scored_tokens):
        key = token.lower()
        if key not in token_stats:
            token_stats[key] = {'count': 0, 'total_weight': 0.0}
        token_stats[key]['count'] += 1
        token_stats[key]['total_weight'] += weight

print(f"Collected stats for {len(token_stats)} tokens.")
if len(token_stats) == 0:
    raise RuntimeError(
        "token_stats is empty — your get_token_importance() returned nothing!"
    )

# One row per token; avg_weight is the mean weight per occurrence.
df_tokens = pd.DataFrame.from_dict(token_stats, orient='index')
df_tokens['avg_weight'] = df_tokens['total_weight'].div(df_tokens['count'])

In [None]:
# Rebuild the token table from the aggregated statistics and add the mean
# attention weight per occurrence.
df_tokens = pd.DataFrame.from_dict(token_stats, orient='index')
df_tokens['avg_weight'] = df_tokens['total_weight'].div(df_tokens['count'])

# Show the 30 tokens with the highest average weight.
df_tokens.sort_values(by='avg_weight', ascending=False).head(30)

In [None]:
# Full token table ordered by descending average weight.
df_tokens.sort_values(by='avg_weight', ascending=False)

In [None]:

# 6. Visualization
# The title and the table caption promise the top 30 tokens, so restrict the
# data to the 30 highest average weights instead of plotting every token.
top_tokens = df_tokens.sort_values('avg_weight', ascending=False).head(30)

plt.figure(figsize=(15, 10))
sns.barplot(x='avg_weight', y=top_tokens.index, data=top_tokens, palette='viridis')
plt.title('Топ-30 важных токенов (длина ≥4, без служебных)')
plt.xlabel('Средний вес внимания')
plt.ylabel('Токен')
plt.tight_layout()
plt.savefig('top_tokens.png', dpi=300)
plt.show()

# 7. Table output
print("Таблица топ-30 токенов:")
display(top_tokens[['count', 'avg_weight']].style.background_gradient(cmap='Blues'))

In [None]:
from collections import defaultdict
from tqdm import tqdm  # Импортируем tqdm для прогресс-бара
import re

def filter_tokens(token_weights):
    """Keep informative (token, weight) pairs and normalise the token text.

    A pair is dropped when the raw token is one of the listed service tokens,
    when the lower-cased token without '##' is shorter than two characters,
    or when that cleaned token consists of digits only.

    Args:
        token_weights: Iterable of ``(token, weight)`` pairs.

    Returns:
        List of ``(cleaned_token, weight)`` pairs in the original order, where
        ``cleaned_token`` is lower-cased and has every '##' removed.
    """
    service_tokens = ('[CLS]', '[SEP]', '[PAD]', '[UNK]', '[MASK]', '[sep]')
    kept = []
    for raw_token, score in token_weights:
        piece = raw_token.lower().replace('##', '')
        # The service-token check is done on the raw (not lower-cased) token.
        if raw_token in service_tokens:
            continue
        if len(piece) < 2:
            continue
        # Only purely numeric pieces are rejected; mixed ones such as 'м2' stay.
        if re.fullmatch(r'\d+', piece):
            continue
        kept.append((piece, score))
    return kept


# Per-word accumulator: number of occurrences and summed weight of each merged word.
token_stats = defaultdict(lambda: {'count': 0, 'total_weight': 0.0})
# Every row of X_train is processed in this cell.
sample_texts = X_train.tolist()

# tqdm is used to show progress over the texts
for text in tqdm(sample_texts, desc="Анализ токенов", unit="текст"):
    tw = bert_embedder.get_token_importance([text])[0]
    # Pieces and summed weight of the word currently being assembled.
    current_word = []
    current_weight = 0
    for tok, w in tw:
        if tok.startswith('##'):
            # A '##' prefix marks a continuation piece: glue it onto the current word
            # and add its weight to the word's running total.
            current_word.append(tok[2:])
            current_weight += w
        else:
            if current_word:  # Store the previous word
                full_word = ''.join(current_word)
                if len(full_word) >= 3:  # The length check applies to the whole word
                    entry = token_stats[full_word.lower()]
                    entry['count'] += 1
                    entry['total_weight'] += current_weight
            # NOTE(review): bracketed service tokens are not excluded here and
            # filter_tokens is not called in this loop — confirm this is intended.
            current_word = [tok.lower()]
            current_weight = w
    # Do not forget to store the last word of the text
    if current_word and len(''.join(current_word)) >= 3:
        full_word = ''.join(current_word)
        entry = token_stats[full_word.lower()]
        entry['count'] += 1
        entry['total_weight'] += current_weight

In [None]:
# Rebuild the table from the current token_stats (one row per key).
df_tokens = pd.DataFrame.from_dict(token_stats, orient='index')


# Spot-check: show the row whose index equals this single word.
df_tokens[df_tokens.index == 'торг']

In [None]:
import seaborn as sns  # Добавляем импорт seaborn с псевдонимом sns

# Build the results DataFrame
df_tokens = pd.DataFrame.from_dict(token_stats, orient='index')
# Drop the '[sep]' row before ranking.
df_tokens = df_tokens[df_tokens.index!='[sep]']
df_tokens['avg_weight'] = df_tokens['total_weight'] / df_tokens['count']
df_tokens = df_tokens.sort_values('avg_weight', ascending=False)

# Filter out rare tokens
df_tokens = df_tokens[df_tokens['count'] > 20]  # Keep only tokens seen more than 20 times

# Top-30 tokens
top_tokens = df_tokens.head(30)
print("Топ-30 важных токенов:")
print(top_tokens[['count', 'avg_weight']].to_markdown())

# Visualisation
plt.figure(figsize=(12, 8))
sns.set_style("whitegrid")

# Token importance bar chart
ax = sns.barplot(x='avg_weight', y=top_tokens.index, data=top_tokens, palette="viridis")
plt.title('Top-30 most important tokens by their average weight', fontsize=14)
plt.xlabel('Average weight', fontsize=12)
plt.ylabel('Token', fontsize=12)
plt.tight_layout()

# Annotate each bar with its value, slightly to the right of the bar end
for i, (_, row) in enumerate(top_tokens.iterrows()):
    ax.text(row['avg_weight'] + 0.005, i, f"{row['avg_weight']:.3f}",
            va='center', fontsize=9)

plt.show()

# Additional visualisation: average weight as a function of frequency
plt.figure(figsize=(12, 8))  # Increased size for better readability
sns.scatterplot(x='count', y='avg_weight', size='count', sizes=(20, 200),
                data=top_tokens, hue=top_tokens.index, legend=False)
plt.title('Token Importance vs. Frequency', fontsize=14)
plt.xlabel('Token Frequency (count)', fontsize=12)
plt.ylabel('Average Weight', fontsize=12)

# Add labels for all points with count >= 20
# (top_tokens was already filtered to count > 20 above, so every point gets a label)
for line in range(top_tokens.shape[0]):
    if top_tokens['count'].iloc[line] >= 20:
        plt.text(top_tokens['count'].iloc[line] + 3,  # X-offset for readability
                top_tokens['avg_weight'].iloc[line],
                top_tokens.index[line],
                horizontalalignment='left',
                verticalalignment='center',
                size=9,
                color='black',
                bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', boxstyle='round,pad=0.2'))

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
from tqdm import tqdm

def calculate_avg_tokens(texts, tokenizer, max_length=512):
    """Tokenize every text, print length statistics and return the mean length.

    Args:
        texts: Iterable of texts; each one is passed to ``tokenizer.encode_plus``.
        tokenizer: Object exposing ``encode_plus``; the ``'length'`` entry of its
            result is collected for every text.
        max_length: Truncation limit forwarded to ``encode_plus``.

    Returns:
        ``np.mean`` of the collected ``'length'`` values.
    """
    # No padding is requested, so each length reflects the (truncated) text itself
    # plus the special tokens added by the tokenizer.
    lengths = [
        tokenizer.encode_plus(
            sample,
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=False,
            return_length=True
        )['length']
        for sample in tqdm(texts, desc="Токенизация текстов", unit="текст")
    ]

    mean_length = np.mean(lengths)
    print(f"\nРезультаты токенизации:")
    print(f"Среднее количество токенов на текст: {mean_length:.1f} (±{np.std(lengths):.1f})")
    print(f"Минимум: {np.min(lengths)}, Максимум: {np.max(lengths)}")
    print(f"25-й перцентиль: {np.percentile(lengths, 25):.1f}")
    print(f"Медиана: {np.median(lengths):.1f}")
    print(f"75-й перцентиль: {np.percentile(lengths, 75):.1f}")
    return mean_length

# Example usage:
avg_tokens = calculate_avg_tokens(
    texts=X_train,  # Your text data
    tokenizer=bert_embedder.tokenizer,  # Tokenizer taken from your RuBertTiny2Embedder
    max_length=bert_embedder.max_length  # Maximum length taken from the embedder config
)

# MTF (manual text features)

In [None]:
import pandas as pd
import numpy as np
import json
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Data loading helpers
def load_summary_metrics(filepath):
    """Read the summary CSV and keep the rows whose ``type`` contains 'orig'.

    Returns a dict keyed by the row's ``metric`` value; each entry holds that
    row's ``mean``, ``std``, ``conf_interval`` and ``value``. When a metric
    appears in several matching rows, the last one wins.
    """
    wanted_fields = ('mean', 'std', 'conf_interval', 'value')
    table = pd.read_csv(filepath)
    collected = {}
    for _, record in table.iterrows():
        if 'orig' not in record['type']:
            continue
        collected[record['metric']] = {field: record[field] for field in wanted_fields}
    return collected

def load_cv_metrics(filepath):
    """Load a cross-validation metrics JSON file and return its 'orig_metrics' entry.

    The file is opened as UTF-8 explicitly: without an encoding argument
    ``open`` uses the platform locale, which can mis-decode non-ASCII JSON.

    Raises:
        KeyError: if the JSON document has no 'orig_metrics' key.
    """
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    return data['orig_metrics']

def load_predictions(filepath):
    """Read the predictions CSV at ``filepath`` and return it as a DataFrame."""
    predictions = pd.read_csv(filepath)
    return predictions

import os
from pathlib import Path

# Set the base path to your data
base_path = "/content/drive/MyDrive/price_prediction_data/"

# File paths for each compared model: summary CSV, CV metrics JSON and test predictions CSV
models = {
    'no_text': {
        'summary': os.path.join(base_path, "Results_pca/XGBR/metrics/summary_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False.csv"),
        'cv': os.path.join(base_path, "Results_pca/XGBR/metrics/cv_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False.json"),
        'predictions': os.path.join(base_path, "Results_pca/XGBR/metrics/predictions_test_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False.csv")
    },
    'all_text': {
        'summary': os.path.join(base_path, "manual_text_features/XGBR/metrics/summary_metrics_price_prediction_mannual_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv"),
        'cv': os.path.join(base_path, "manual_text_features/XGBR/metrics/cv_metrics_price_prediction_mannual_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.json"),
        'predictions': os.path.join(base_path, "manual_text_features/XGBR/metrics/predictions_test_price_prediction_mannual_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv")
    },
    'neg_text': {
        'summary': os.path.join(base_path, "manual_text_features_negative/XGBR/metrics/summary_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv"),
        'cv': os.path.join(base_path, "manual_text_features_negative/XGBR/metrics/cv_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.json"),
        'predictions': os.path.join(base_path, "manual_text_features_negative/XGBR/metrics/predictions_test_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv")
    }
}

# Check that the files exist before loading
for model_name in models:
    for file_type in ['summary', 'cv', 'predictions']:
        file_path = models[model_name][file_type]
        if not os.path.exists(file_path):
            print(f"Файл не найден: {file_path}")
            # One could either skip this file or stop execution here
            # In this example only a warning is printed, so the loading loop
            # below still runs for every model


# Load the data
# Note: the 'predictions' entry (a path) is replaced by the loaded DataFrame.
for model_name in models:
    models[model_name]['summary_metrics'] = load_summary_metrics(models[model_name]['summary'])
    models[model_name]['cv_metrics'] = load_cv_metrics(models[model_name]['cv'])
    models[model_name]['predictions'] = load_predictions(models[model_name]['predictions'])

# Bootstrap helper
def bootstrap_metric(y_true, y_pred, metric_func, n_bootstrap=1000, ci=95):
    """Estimate a metric and its percentile interval by resampling with replacement.

    Args:
        y_true: Array of targets; indexed with an integer index array.
        y_pred: Array of predictions aligned with ``y_true``.
        metric_func: Callable ``metric_func(y_true, y_pred)`` returning a score.
        n_bootstrap: Number of resamples to draw.
        ci: Width of the percentile interval, in percent.

    Returns:
        ``(mean_score, (ci_lower, ci_upper))`` over the resamples that were kept.
    """
    sampler = np.random.RandomState(42)  # fixed seed: results are reproducible
    n_samples = len(y_true)
    scores = []
    for _ in range(n_bootstrap):
        picked = sampler.randint(0, n_samples, n_samples)
        resampled_true = y_true[picked]
        # Resamples with fewer than two distinct targets are skipped.
        if len(np.unique(resampled_true)) >= 2:
            scores.append(metric_func(resampled_true, y_pred[picked]))

    mean_score = np.mean(scores)
    tail = (100 - ci) / 2
    lower_bound = np.percentile(scores, tail)
    upper_bound = np.percentile(scores, ci + tail)
    return mean_score, (lower_bound, upper_bound)

# Metric functions
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent."""
    abs_error = np.abs(y_pred - y_true)
    magnitude = np.abs(y_true) + np.abs(y_pred)
    return 100 * np.mean(2 * abs_error / magnitude)

def medape(y_true, y_pred):
    """Median absolute percentage error relative to ``y_true``, in percent."""
    relative_error = np.abs(y_pred - y_true) / np.abs(y_true)
    return 100 * np.median(relative_error)

# Build the comparison tables
def create_comparison_table(models, metric_names, test_data=False):
    """Build a metric-by-model table of ``"value ± margin"`` strings.

    Args:
        models: Mapping of model name to a dict. With ``test_data=False`` the
            dict must provide ``['cv_metrics'][metric]`` with ``'mean'`` and
            ``'conf_interval'``. With ``test_data=True`` it must provide a
            ``'predictions'`` DataFrame with ``'true_price'`` and
            ``'predicted_price'`` columns.
        metric_names: Metric identifiers. The test path supports 'r2', 'rmse',
            'smape' and 'medape'.
        test_data: When True the values come from ``bootstrap_metric`` on the
            test predictions; otherwise from the stored CV metrics.

    Returns:
        DataFrame with a ``'metric'`` column and one column per model; each
        cell is formatted with four decimals.

    Raises:
        ValueError: if ``test_data`` is True and a metric name is unsupported.
            Previously such a name either failed with an unbound local or
            silently reused the function of the preceding metric.
    """
    def resolve_metric(metric):
        """Return the scoring callable for ``metric`` on the test path."""
        if metric == 'r2':
            return lambda y_true, y_pred: 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)
        if metric == 'rmse':
            return lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred)**2))
        if metric == 'smape':
            return smape
        if metric == 'medape':
            return medape
        raise ValueError(f"Unsupported metric for test data: {metric!r}")

    comparison = []

    for metric in metric_names:
        row = {'metric': metric}
        # Resolved once per metric; the CV path reads stored values and needs no callable.
        metric_func = resolve_metric(metric) if test_data else None

        for model_name in models:
            if test_data:
                # Test data: bootstrap the metric over the stored predictions
                y_true = models[model_name]['predictions']['true_price'].values
                y_pred = models[model_name]['predictions']['predicted_price'].values

                mean_val, (ci_lower, _ci_upper) = bootstrap_metric(y_true, y_pred, metric_func)
                # The reported margin is the distance from the mean to the lower bound.
                conf_interval = mean_val - ci_lower
            else:
                # CV: take the values from the loaded metrics
                cv_entry = models[model_name]['cv_metrics'][metric]
                mean_val = cv_entry['mean']
                conf_interval = cv_entry['conf_interval']

            row[model_name] = f"{mean_val:.4f} ± {conf_interval:.4f}"

        comparison.append(row)

    return pd.DataFrame(comparison)

# Statistical significance check
def add_statistical_significance(df, models, metric_names, test_data=False):
    """Append significance stars to the 'all_text' and 'neg_text' columns of ``df``.

    Each text-feature model is compared with the 'no_text' model, metric by
    metric. '***', '**' or '*' is appended for p < 0.001, p < 0.01 and
    p < 0.05 respectively; nothing is appended otherwise.

    Args:
        df: Table with a ``'metric'`` column and string columns 'all_text' and
            'neg_text'; it is modified in place.
        models: Mapping with the keys 'no_text', 'all_text' and 'neg_text'.
            With ``test_data=False`` each entry needs
            ``['cv_metrics'][metric]['values']``; with ``test_data=True`` each
            entry needs a ``'predictions'`` DataFrame with ``'true_price'`` and
            ``'predicted_price'`` columns.
        metric_names: Metric identifiers; the test path supports 'r2', 'rmse',
            'smape' and 'medape'.
        test_data: True for a permutation test on test predictions, False for
            a paired t-test on the per-fold CV values.

    Returns:
        The same ``df`` object with the markers appended.

    Raises:
        ValueError: if ``test_data`` is True and a metric name is unsupported.

    Notes:
        The permutation test swaps the two models' predictions sample by
        sample, so every prediction stays paired with its own target.
        Shuffling the pooled predictions across samples (the previous
        behaviour) compares metrics of predictions assigned to unrelated
        targets and does not yield a valid null distribution. The pooled
        shuffle is kept only as a fallback, with a warning, when the two
        prediction files do not share the same targets. The generator is
        seeded so that repeated runs give the same p-values.
    """
    import warnings  # local import: only needed for the fallback warning

    n_permutations = 1000

    def resolve_metric(metric):
        """Return the scoring callable for ``metric`` on the test path."""
        if metric == 'r2':
            return lambda y_true, y_pred: 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)
        if metric == 'rmse':
            return lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred)**2))
        if metric == 'smape':
            return smape
        if metric == 'medape':
            return medape
        raise ValueError(f"Unsupported metric for test data: {metric!r}")

    def permutation_p_value(metric_func, y_true_ref, y_pred_ref, y_true_alt, y_pred_alt):
        """Two-sided permutation p-value for the difference in ``metric_func``."""
        observed = metric_func(y_true_alt, y_pred_alt) - metric_func(y_true_ref, y_pred_ref)
        rng = np.random.RandomState(42)
        aligned = (
            y_true_ref.shape == y_true_alt.shape
            and y_pred_ref.shape == y_pred_alt.shape
            and np.allclose(y_true_ref, y_true_alt, equal_nan=True)
        )
        perm_diffs = np.empty(n_permutations)
        if aligned:
            for i in range(n_permutations):
                # Under H0 the two models are exchangeable for every sample.
                swap = rng.randint(0, 2, size=len(y_pred_ref)).astype(bool)
                swapped_ref = np.where(swap, y_pred_alt, y_pred_ref)
                swapped_alt = np.where(swap, y_pred_ref, y_pred_alt)
                perm_diffs[i] = metric_func(y_true_alt, swapped_alt) - metric_func(y_true_ref, swapped_ref)
        else:
            warnings.warn(
                "Prediction sets are not aligned on the same targets; falling back "
                "to a pooled shuffle, whose p-value is not a valid paired test."
            )
            pooled = np.concatenate([y_pred_ref, y_pred_alt])
            split = len(y_pred_ref)
            for i in range(n_permutations):
                rng.shuffle(pooled)
                perm_diffs[i] = metric_func(y_true_alt, pooled[split:]) - metric_func(y_true_ref, pooled[:split])
        return (np.abs(perm_diffs) >= np.abs(observed)).mean()

    def stars(p_value):
        """Map a p-value to its significance marker."""
        if p_value < 0.001:
            return '***'
        if p_value < 0.01:
            return '**'
        if p_value < 0.05:
            return '*'
        return ''

    for metric in metric_names:
        metric_func = resolve_metric(metric) if test_data else None

        # Compare each text-feature model with the model without text features
        for compare_model in ['all_text', 'neg_text']:
            if test_data:
                ref_predictions = models['no_text']['predictions']
                alt_predictions = models[compare_model]['predictions']
                p_value = permutation_p_value(
                    metric_func,
                    ref_predictions['true_price'].values,
                    ref_predictions['predicted_price'].values,
                    alt_predictions['true_price'].values,
                    alt_predictions['predicted_price'].values,
                )
            else:
                # CV: paired t-test on the per-fold values
                no_text_values = models['no_text']['cv_metrics'][metric]['values']
                comp_values = models[compare_model]['cv_metrics'][metric]['values']
                _, p_value = stats.ttest_rel(no_text_values, comp_values)

            # Append the significance marker to the table cell
            mask = df['metric'] == metric
            df.loc[mask, compare_model] = df.loc[mask, compare_model].values[0] + stars(p_value)

    return df

# Metrics to compare
metric_names = ['r2', 'rmse', 'smape', 'medape']

# Build the table for the CV metrics and append significance markers
cv_comparison = create_comparison_table(models, metric_names, test_data=False)
cv_comparison = add_statistical_significance(cv_comparison, models, metric_names, test_data=False)

# Build the table for the test-set metrics and append significance markers
test_comparison = create_comparison_table(models, metric_names, test_data=True)
test_comparison = add_statistical_significance(test_comparison, models, metric_names, test_data=True)

# Print the results as markdown tables
print("Сравнение метрик кросс-валидации (оригинальная шкала):")
print(cv_comparison.to_markdown(index=False))

print("\nСравнение метрик тестовой выборки (оригинальная шкала):")
print(test_comparison.to_markdown(index=False))

# Result visualisation
def plot_metric_comparison(df, title):
    """Draw one bar-chart panel per metric with error bars for each model.

    Args:
        df: Table with a ``'metric'`` column and the string columns 'no_text',
            'all_text' and 'neg_text' holding ``"value ± margin"`` cells,
            optionally followed by '*' markers.
        title: Figure-level title.
    """
    model_order = ('no_text', 'all_text', 'neg_text')
    plt.figure(figsize=(12, 6))

    # Panels are laid out on a fixed 2x2 grid, one per distinct metric.
    for panel, metric_name in enumerate(df['metric'].unique(), start=1):
        plt.subplot(2, 2, panel)
        metric_rows = df[df['metric'] == metric_name]

        for model_key in model_order:
            # Strip the significance markers before parsing the numbers
            cells = metric_rows[model_key].str.replace(r'[*]+', '', regex=True)
            parts = cells.str.split('±')
            heights = parts.str[0].astype(float)
            margins = parts.str[1].str.strip().astype(float)  # drop the leading space

            plt.bar(model_key, heights, yerr=margins, capsize=5, label=model_key)

        plt.title(metric_name)
        plt.ylabel('Value')

    plt.suptitle(title)
    plt.tight_layout()
    plt.legend()
    plt.show()

# Plot the CV table first, then the test-set table.
plot_metric_comparison(cv_comparison, "CV Metrics Comparison")
plot_metric_comparison(test_comparison, "Test Metrics Comparison")

In [None]:
import pandas as pd
import numpy as np
import json
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
from IPython.display import HTML, display
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import io
import base64

# ======================
# DATA LOADING FUNCTIONS
# ======================

def load_summary_metrics(filepath):
    """Load the summary CSV and index its 'orig' rows by metric name.

    Only rows whose ``type`` value contains 'orig' are kept. Each kept row
    contributes a dict with its ``mean``, ``std``, ``conf_interval`` and
    ``value``; a later row for the same metric overwrites an earlier one.
    """
    stat_columns = ('mean', 'std', 'conf_interval', 'value')
    summary = pd.read_csv(filepath)
    by_metric = {}
    for _, entry in summary.iterrows():
        if 'orig' not in entry['type']:
            continue
        by_metric[entry['metric']] = {column: entry[column] for column in stat_columns}
    return by_metric

def load_cv_metrics(filepath):
    """Load cross-validation metrics from a JSON file.

    Returns the value stored under the top-level 'orig_metrics' key. The file
    is opened as UTF-8 explicitly so decoding does not depend on the platform
    locale.

    Raises:
        KeyError: if the JSON document has no 'orig_metrics' key.
    """
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    return data['orig_metrics']

def load_predictions(filepath):
    """Read the prediction results CSV at ``filepath`` into a DataFrame."""
    frame = pd.read_csv(filepath)
    return frame

# =================
# CONFIGURATION
# =================

# Base path configuration
base_path = "/content/drive/MyDrive/price_prediction_data/"

# Model configuration with proper paths
# Each entry maps an artifact kind ('summary' CSV, 'cv' JSON, 'predictions' CSV)
# to its file path under base_path. The keys are the display names used as
# column headers and bar labels by the functions defined below.
models = {
    'Baseline': {
        'summary': os.path.join(base_path, "Results_pca/XGBR/metrics/summary_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False.csv"),
        'cv': os.path.join(base_path, "Results_pca/XGBR/metrics/cv_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False.json"),
        'predictions': os.path.join(base_path, "Results_pca/XGBR/metrics/predictions_test_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False.csv")
    },
    'All Text Features': {
        'summary': os.path.join(base_path, "manual_text_features/XGBR/metrics/summary_metrics_price_prediction_mannual_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv"),
        'cv': os.path.join(base_path, "manual_text_features/XGBR/metrics/cv_metrics_price_prediction_mannual_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.json"),
        'predictions': os.path.join(base_path, "manual_text_features/XGBR/metrics/predictions_test_price_prediction_mannual_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv")
    },
    'Negative Text Only': {
        'summary': os.path.join(base_path, "manual_text_features_negative/XGBR/metrics/summary_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv"),
        'cv': os.path.join(base_path, "manual_text_features_negative/XGBR/metrics/cv_metrics_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.json"),
        'predictions': os.path.join(base_path, "manual_text_features_negative/XGBR/metrics/predictions_test_price_prediction_XGBRegressor_categorical-only_none_pca_False_gate_False_mtf_True_.csv")
    }
}

# =====================
# METRIC CALCULATIONS
# =====================

def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent."""
    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)
    return 100 * np.mean(numerator / denominator)

def medape(y_true, y_pred):
    """Median Absolute Percentage Error relative to ``y_true``, in percent."""
    ape = np.abs(y_pred - y_true) / np.abs(y_true)
    return 100 * np.median(ape)

def bootstrap_metric(y_true, y_pred, metric_func, n_bootstrap=1000, ci=95):
    """Calculate a bootstrap mean and percentile confidence interval.

    Draws ``n_bootstrap`` resamples (with replacement, fixed seed 42) of the
    paired arrays, skips resamples whose targets have fewer than two distinct
    values, and scores the rest with ``metric_func(y_true, y_pred)``.

    Returns:
        ``(mean_score, (ci_lower, ci_upper))`` where the bounds are the
        ``(100 - ci) / 2`` and ``ci + (100 - ci) / 2`` percentiles.
    """
    generator = np.random.RandomState(42)
    size = len(y_true)
    collected = []
    for _ in range(n_bootstrap):
        draw = generator.randint(0, size, size)
        true_draw = y_true[draw]
        if len(np.unique(true_draw)) >= 2:
            collected.append(metric_func(true_draw, y_pred[draw]))

    mean_score = np.mean(collected)
    half_tail = (100 - ci) / 2
    interval = (np.percentile(collected, half_tail), np.percentile(collected, ci + half_tail))
    return mean_score, interval

# =====================
# DATA PROCESSING
# =====================

def create_comparison_table(models, metric_names, test_data=False):
    """Create a comparison table with confidence margins.

    Args:
        models: Mapping of model name to a dict. With ``test_data=False`` the
            dict must provide ``['cv_metrics'][metric]`` with ``'mean'`` and
            ``'conf_interval'``. With ``test_data=True`` it must provide a
            ``'predictions'`` DataFrame with ``'true_price'`` and
            ``'predicted_price'`` columns.
        metric_names: Metric identifiers. The test path supports 'r2', 'rmse',
            'smape' and 'medape'.
        test_data: When True the values come from ``bootstrap_metric`` on the
            test predictions; otherwise from the stored CV metrics.

    Returns:
        DataFrame with a ``'Metric'`` column holding the display label and one
        column per model with ``"value ± margin"`` strings (two decimals).
        RMSE values are divided by 1e6 on both paths.

    Raises:
        ValueError: if ``test_data`` is True and a metric name is unsupported.
            Previously such a name either failed with an unbound local or
            silently reused the function of the preceding metric.
    """
    metric_labels = {
        'r2': 'R² Score',
        'rmse': 'RMSE (million RUB)',  # RMSE is reported in millions of rubles
        'smape': 'SMAPE (%)',
        'medape': 'MedAPE (%)'
    }

    def resolve_metric(metric):
        """Return the scoring callable for ``metric`` on the test path."""
        if metric == 'r2':
            return lambda y_true, y_pred: 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)
        if metric == 'rmse':
            return lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred)**2)) / 1e6  # Convert to millions
        if metric == 'smape':
            return smape
        if metric == 'medape':
            return medape
        raise ValueError(f"Unsupported metric for test data: {metric!r}")

    comparison = []

    for metric in metric_names:
        row = {'Metric': metric_labels.get(metric, metric)}
        # Resolved once per metric; the CV path reads stored values and needs no callable.
        metric_func = resolve_metric(metric) if test_data else None

        for model_name in models:
            if test_data:
                y_true = models[model_name]['predictions']['true_price'].values
                y_pred = models[model_name]['predictions']['predicted_price'].values

                mean_val, (ci_lower, _ci_upper) = bootstrap_metric(y_true, y_pred, metric_func)
                # The reported margin is the distance from the mean to the lower bound.
                conf_interval = mean_val - ci_lower
            else:
                cv_entry = models[model_name]['cv_metrics'][metric]
                mean_val = cv_entry['mean']
                conf_interval = cv_entry['conf_interval']
                if metric == 'rmse':
                    # Convert CV RMSE values to millions
                    mean_val = mean_val / 1e6
                    conf_interval = conf_interval / 1e6

            row[model_name] = f"{mean_val:.2f} ± {conf_interval:.2f}"

        comparison.append(row)

    return pd.DataFrame(comparison)

def add_statistical_significance(df, models, metric_names, test_data=False):
    """Add statistical significance markers to results and collect the p-values.

    'All Text Features' and 'Negative Text Only' are each compared with
    'Baseline', metric by metric. '***', '**' or '*' is appended to the
    matching cell of ``df`` for p < 0.001, p < 0.01 and p < 0.05.

    Args:
        df: Table with a ``'Metric'`` column of display labels and string
            columns for the two compared models; it is modified in place.
        models: Mapping with the keys 'Baseline', 'All Text Features' and
            'Negative Text Only'. With ``test_data=False`` each entry needs
            ``['cv_metrics'][metric]['values']``; with ``test_data=True`` each
            entry needs a ``'predictions'`` DataFrame with ``'true_price'`` and
            ``'predicted_price'`` columns.
        metric_names: Metric identifiers; the test path supports 'r2', 'rmse',
            'smape' and 'medape'.
        test_data: True for a permutation test on test predictions, False for
            a paired t-test on the per-fold CV values.

    Returns:
        ``(df, p_values_df)``; ``p_values_df`` has one row per metric with the
        columns 'Metric', 'All Text Features' and 'Negative Text Only'. It is
        empty when ``metric_names`` is empty.

    Raises:
        ValueError: if ``test_data`` is True and a metric name is unsupported.

    Notes:
        The permutation test swaps the two models' predictions sample by
        sample, so every prediction stays paired with its own target.
        Shuffling the pooled predictions across samples (the previous
        behaviour) compares metrics of predictions assigned to unrelated
        targets and does not yield a valid null distribution. The pooled
        shuffle is kept only as a fallback, with a warning, when the two
        prediction files do not share the same targets. The generator is
        seeded so that repeated runs give the same p-values.
    """
    import warnings  # local import: only needed for the fallback warning

    compared_models = ['All Text Features', 'Negative Text Only']
    n_permutations = 1000
    metric_labels = {
        'r2': 'R² Score',
        'rmse': 'RMSE (million RUB)',
        'smape': 'SMAPE (%)',
        'medape': 'MedAPE (%)'
    }

    def resolve_metric(metric):
        """Return the scoring callable for ``metric`` on the test path."""
        if metric == 'r2':
            return lambda y_true, y_pred: 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)
        if metric == 'rmse':
            # Convert to millions for comparison
            return lambda y_true, y_pred: np.sqrt(np.mean((y_true - y_pred)**2)) / 1e6
        if metric == 'smape':
            return smape
        if metric == 'medape':
            return medape
        raise ValueError(f"Unsupported metric for test data: {metric!r}")

    def permutation_p_value(metric_func, y_true_base, y_pred_base, y_true_comp, y_pred_comp, desc):
        """Two-sided permutation p-value for the difference in ``metric_func``."""
        observed = metric_func(y_true_comp, y_pred_comp) - metric_func(y_true_base, y_pred_base)
        rng = np.random.RandomState(42)
        aligned = (
            y_true_base.shape == y_true_comp.shape
            and y_pred_base.shape == y_pred_comp.shape
            and np.allclose(y_true_base, y_true_comp, equal_nan=True)
        )
        if not aligned:
            warnings.warn(
                "Prediction sets are not aligned on the same targets; falling back "
                "to a pooled shuffle, whose p-value is not a valid paired test."
            )
            pooled = np.concatenate([y_pred_base, y_pred_comp])
            split = len(y_pred_base)

        perm_diffs = []
        for _ in tqdm(range(n_permutations), desc=desc):
            if aligned:
                # Under H0 the two models are exchangeable for every sample.
                swap = rng.randint(0, 2, size=len(y_pred_base)).astype(bool)
                perm_pred_base = np.where(swap, y_pred_comp, y_pred_base)
                perm_pred_comp = np.where(swap, y_pred_base, y_pred_comp)
            else:
                rng.shuffle(pooled)
                perm_pred_base = pooled[:split]
                perm_pred_comp = pooled[split:]
            perm_diffs.append(
                metric_func(y_true_comp, perm_pred_comp) - metric_func(y_true_base, perm_pred_base)
            )
        return (np.abs(np.array(perm_diffs)) >= np.abs(observed)).mean()

    def stars(p_value):
        """Map a p-value to its significance marker."""
        if p_value < 0.001:
            return '***'
        if p_value < 0.01:
            return '**'
        if p_value < 0.05:
            return '*'
        return ''

    p_value_rows = []

    for metric in metric_names:
        current_metric_label = metric_labels.get(metric, metric)
        p_values = {'Metric': current_metric_label}
        metric_func = resolve_metric(metric) if test_data else None

        for compare_model in compared_models:
            if test_data:
                base_predictions = models['Baseline']['predictions']
                comp_predictions = models[compare_model]['predictions']
                p_value = permutation_p_value(
                    metric_func,
                    base_predictions['true_price'].values,
                    base_predictions['predicted_price'].values,
                    comp_predictions['true_price'].values,
                    comp_predictions['predicted_price'].values,
                    desc=f"{metric} ({compare_model})",
                )
            else:
                base_values = models['Baseline']['cv_metrics'][metric]['values']
                comp_values = models[compare_model]['cv_metrics'][metric]['values']

                if metric == 'rmse':
                    # Convert to millions for CV comparison
                    base_values = [x / 1e6 for x in base_values]
                    comp_values = [x / 1e6 for x in comp_values]

                _, p_value = stats.ttest_rel(base_values, comp_values)

            p_values[compare_model] = p_value

            mask = df['Metric'] == current_metric_label
            df.loc[mask, compare_model] = df.loc[mask, compare_model].values[0] + stars(p_value)

        p_value_rows.append(p_values)

    # Build the p-value table once from the collected rows.
    p_values_df = pd.DataFrame(p_value_rows, columns=['Metric'] + compared_models)
    return df, p_values_df

# =====================
# VISUALIZATION
# =====================

def plot_metric_comparison(df, p_values_df, title, figure_num):
    """Render the per-metric bar charts and return them as a base64 PNG string.

    Args:
        df: Table with a ``'Metric'`` column and one string column per model
            holding ``"value ± margin"`` cells, optionally followed by '*'.
        p_values_df: Table with a ``'Metric'`` column and numeric p-value
            columns for 'All Text Features' and 'Negative Text Only'.
        title: Text placed after the figure number in the figure title.
        figure_num: Number shown in the "Figure N:" prefix.

    Returns:
        The PNG image of the figure, base64-encoded and decoded to ``str``.
    """
    palette = {
        'Baseline': '#3498db',
        'All Text Features': '#e74c3c',
        'Negative Text Only': '#27ae60'
    }
    annotated_models = ('All Text Features', 'Negative Text Only')

    plt.figure(figsize=(14, 8))

    # Panels are laid out on a fixed 2x2 grid, one per distinct metric label.
    for panel, metric in enumerate(df['Metric'].unique(), start=1):
        plt.subplot(2, 2, panel)
        metric_rows = df[df['Metric'] == metric]

        for model in ('Baseline',) + annotated_models:
            # Strip the significance stars, then split "value ± margin".
            parts = metric_rows[model].str.replace(r'[*]+', '', regex=True).str.split('±')
            vals = parts.str[0].astype(float)
            errs = parts.str[1].str.strip().astype(float)

            plt.bar(model, vals, yerr=errs, capsize=5, label=model,
                    color=palette[model], alpha=0.7, width=0.6)

            if model not in annotated_models:
                continue
            p_val = p_values_df[p_values_df['Metric'] == metric][model].values[0]
            if p_val < 0.05:
                # Place the label just above the top of the error bar.
                top = vals.values[0]
                height = top + errs.values[0] + 0.05 * top
                plt.text(model, height, f'p={p_val:.3f}', ha='center',
                         va='bottom', fontsize=9, bbox=dict(facecolor='white', alpha=0.8))

        plt.title(f"{metric}", fontsize=12)

        # Add million RUB label for RMSE
        if 'RMSE' in metric:
            plt.ylabel('Value (million RUB)', fontsize=10)
        else:
            plt.ylabel('Value', fontsize=10)

        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.xticks(rotation=15)

    plt.suptitle(f"Figure {figure_num}: {title}", fontsize=14, y=1.02)
    plt.tight_layout()
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Render into memory for the HTML report, then close the figure.
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', dpi=120, bbox_inches='tight')
    plt.close()
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode('utf-8')

# =====================
# REPORT GENERATION
# =====================

def create_html_report(cv_table, test_table, cv_p_values, test_p_values, fig1_base64, fig2_base64):
    """Generate the model-comparison report as a standalone HTML document.

    Args:
        cv_table: DataFrame of cross-validation metrics. Rendered with
            ``to_html(index=False, escape=False)``, so cell text is inserted
            into the page without HTML escaping.
        test_table: DataFrame of test-set metrics, rendered like ``cv_table``.
        cv_p_values: DataFrame of cross-validation p-values; floats are
            formatted with ``"%.4f"``.
        test_p_values: DataFrame of test-set p-values, formatted like
            ``cv_p_values``.
        fig1_base64: Base64-encoded PNG embedded as Figure 1 (data URI).
        fig2_base64: Base64-encoded PNG embedded as Figure 2 (data URI).

    Returns:
        str: The complete HTML page.
    """
    # Render the four tables up front so the template below stays readable.
    cv_table_html = cv_table.to_html(index=False, escape=False, classes="metric-table")
    cv_p_values_html = cv_p_values.to_html(index=False, float_format="%.4f", classes="p-value-table")
    test_table_html = test_table.to_html(index=False, escape=False, classes="metric-table")
    test_p_values_html = test_p_values.to_html(index=False, float_format="%.4f", classes="p-value-table")

    # The page contains non-ASCII characters (e.g. "₽", "²"), so it declares
    # its encoding explicitly; without <meta charset> a browser opening the
    # saved file may guess a legacy encoding and show garbled symbols.
    # The doctype keeps browsers in standards mode.
    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Model Comparison Report: Text Features Analysis</title>
        <style>
            body {{
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                margin: 20px;
                line-height: 1.6;
                color: #333;
            }}
            h1 {{
                color: #2c3e50;
                border-bottom: 2px solid #3498db;
                padding-bottom: 10px;
                text-align: center;
            }}
            h2 {{
                color: #2980b9;
                margin-top: 40px;
                border-left: 4px solid #3498db;
                padding-left: 10px;
            }}
            h3 {{
                color: #16a085;
                margin-top: 25px;
            }}
            .table-container {{
                margin: 20px 0;
                overflow-x: auto;
            }}
            table {{
                border-collapse: collapse;
                width: 100%;
                margin-bottom: 25px;
                box-shadow: 0 1px 3px rgba(0,0,0,0.1);
            }}
            th, td {{
                border: 1px solid #ddd;
                padding: 12px;
                text-align: left;
            }}
            th {{
                background-color: #3498db;
                color: white;
                font-weight: bold;
            }}
            tr:nth-child(even) {{
                background-color: #f8f9fa;
            }}
            tr:hover {{
                background-color: #f1f1f1;
            }}
            .significance {{
                color: #e74c3c;
                font-weight: bold;
            }}
            .section {{
                margin-bottom: 40px;
                background-color: #f9f9f9;
                padding: 20px;
                border-radius: 5px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.05);
            }}
            .metric-title {{
                font-weight: bold;
                color: #2c3e50;
            }}
            .model-name {{
                font-weight: bold;
            }}
            .baseline {{
                color: #3498db;
            }}
            .all-text {{
                color: #e74c3c;
            }}
            .neg-text {{
                color: #27ae60;
            }}
            .table-caption {{
                font-weight: bold;
                margin-bottom: 10px;
                text-align: left;
                color: #2c3e50;
            }}
            .figure-container {{
                text-align: center;
                margin: 30px 0;
            }}
            .figure-caption {{
                font-weight: bold;
                margin-top: 10px;
                text-align: center;
                color: #2c3e50;
            }}
            .interpretation {{
                background-color: #e8f4fc;
                padding: 15px;
                border-radius: 5px;
                margin: 15px 0;
            }}
            .note {{
                font-style: italic;
                color: #7f8c8d;
                font-size: 0.9em;
            }}
        </style>
    </head>
    <body>
        <h1>Model Comparison Report: Impact of Text Features</h1>

        <div class="section">
            <h2>1. Cross-Validation Results</h2>

            <div class="table-container">
                <div class="table-caption">Table 1: Cross-Validation Metrics</div>
                <p class="note">Note: RMSE values are shown in millions of RUB (₽)</p>
                {cv_table_html}
            </div>

            <div class="table-container">
                <div class="table-caption">Table 2: Statistical Significance (p-values)</div>
                {cv_p_values_html}
            </div>

            <div class="figure-container">
                <img src="data:image/png;base64,{fig1_base64}" width="800" alt="Figure 1: Cross-Validation Metrics Comparison">
                <div class="figure-caption">Figure 1: Cross-Validation Metrics Comparison</div>
                <p class="note">Note: RMSE values are shown in millions of RUB (₽)</p>
            </div>
        </div>

        <div class="section">
            <h2>2. Test Set Results</h2>

            <div class="table-container">
                <div class="table-caption">Table 3: Test Set Metrics</div>
                <p class="note">Note: RMSE values are shown in millions of RUB (₽)</p>
                {test_table_html}
            </div>

            <div class="table-container">
                <div class="table-caption">Table 4: Statistical Significance (p-values)</div>
                {test_p_values_html}
            </div>

            <div class="figure-container">
                <img src="data:image/png;base64,{fig2_base64}" width="800" alt="Figure 2: Test Set Metrics Comparison">
                <div class="figure-caption">Figure 2: Test Set Metrics Comparison</div>
                <p class="note">Note: RMSE values are shown in millions of RUB (₽)</p>
            </div>
        </div>

        <div class="section">
            <h2>3. Interpretation Guide</h2>

            <div class="interpretation">
                <h3>Statistical Significance Indicators:</h3>
                <ul>
                    <li><span class="significance">*</span> p-value &lt; 0.05 (significant)</li>
                    <li><span class="significance">**</span> p-value &lt; 0.01 (very significant)</li>
                    <li><span class="significance">***</span> p-value &lt; 0.001 (highly significant)</li>
                </ul>
            </div>

            <h3>Model Descriptions:</h3>
            <ul>
                <li><span class="model-name baseline">Baseline</span>: Model without any text features</li>
                <li><span class="model-name all-text">All Text Features</span>: Model incorporating all available text features</li>
                <li><span class="model-name neg-text">Negative Text Only</span>: Model using only negative sentiment text features</li>
            </ul>

            <h3>Performance Metrics Explained:</h3>
            <ul>
                <li><span class="metric-title">R² Score</span>: Measures the proportion of variance explained (0-1, higher is better)</li>
                <li><span class="metric-title">RMSE (million RUB)</span>: Root Mean Square Error - absolute measure of prediction errors in millions of rubles (lower is better)</li>
                <li><span class="metric-title">SMAPE (%)</span>: Symmetric Mean Absolute Percentage Error - percentage accuracy measure (lower is better)</li>
                <li><span class="metric-title">MedAPE (%)</span>: Median Absolute Percentage Error - robust percentage accuracy measure (lower is better)</li>
            </ul>

            <div class="interpretation">
                <h3>Key Findings:</h3>
                <p>The analysis compares the impact of different text feature configurations on model performance.
                Statistical significance markers indicate whether the differences from the baseline model are meaningful.</p>
                <p>All monetary values (RMSE) are presented in millions of Russian rubles (₽) for better readability.</p>
            </div>
        </div>
    </body>
    </html>
    """
    return html

# =====================
# MAIN EXECUTION
# =====================

# Load and verify data
print("Loading data...")
for model_name in models:
    print(f"\nProcessing {model_name} model:")
    # Report which of the expected input files exist before trying to load them.
    for file_type in ['summary', 'cv', 'predictions']:
        file_path = models[model_name][file_type]
        if not os.path.exists(file_path):
            print(f"Warning: File not found - {file_path}")
        else:
            print(f"Found {file_type} data")

    # Best-effort loading: a failure is reported and the loop moves on to the
    # next model instead of aborting the whole run.
    # NOTE(review): the 'predictions' entry is overwritten with whatever
    # load_predictions returns, so on a second run of this section the
    # existence check above would receive that object instead of a path.
    # Re-create `models` before re-running.
    try:
        models[model_name]['summary_metrics'] = load_summary_metrics(models[model_name]['summary'])
        models[model_name]['cv_metrics'] = load_cv_metrics(models[model_name]['cv'])
        models[model_name]['predictions'] = load_predictions(models[model_name]['predictions'])
        print("Data loaded successfully")
    except Exception as e:
        print(f"Error loading data: {e}")

# Metric configuration
metric_names = ['r2', 'rmse', 'smape', 'medape']

print("\nRunning analysis...")
# Cross-validation analysis
cv_comparison, cv_p_values = add_statistical_significance(
    create_comparison_table(models, metric_names, test_data=False),
    models, metric_names, test_data=False
)

# Test set analysis
test_comparison, test_p_values = add_statistical_significance(
    create_comparison_table(models, metric_names, test_data=True),
    models, metric_names, test_data=True
)

print("\nGenerating visualizations...")
# Generate visualizations and get base64 strings
fig1_base64 = plot_metric_comparison(cv_comparison, cv_p_values, "Cross-Validation Metrics Comparison", 1)
fig2_base64 = plot_metric_comparison(test_comparison, test_p_values, "Test Set Metrics Comparison", 2)

print("Creating HTML report...")
# Create and display HTML report
html_report = create_html_report(cv_comparison, test_comparison, cv_p_values, test_p_values, fig1_base64, fig2_base64)
# The report text contains non-ASCII characters (e.g. "₽", "²"), so write it
# explicitly as UTF-8 instead of relying on the platform's default encoding,
# which may be unable to encode them.
with open('model_comparison_report.html', 'w', encoding='utf-8') as f:
    f.write(html_report)

print("\nAnalysis complete! Displaying results...")
display(HTML(html_report))

#