In [None]:

import sys
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, precision_score,
    roc_auc_score, classification_report, confusion_matrix
)
from sklearn.model_selection import GridSearchCV

try:
    from pymystem3 import Mystem
    from nltk.corpus import stopwords
    PREPROCESSING_AVAILABLE = True
    print("Библиотеки для препроцессинга")
except ImportError:
    PREPROCESSING_AVAILABLE = False
    print("Pymystem3 или NLTK недоступны, используем простую обработку")

Библиотеки для препроцессинга


In [None]:
# Download the NLTK stop-word corpus; preprocess_texts reads it through
# stopwords.words("russian").
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

"""
Оптимизированная ensemble модель для превышения всех существующих результатов
Архитектура: TF-IDF + Feature Engineering + Multiple Algorithms + Stacking
"""

import sys
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score, precision_score,
    roc_auc_score, classification_report, confusion_matrix
)
from sklearn.preprocessing import StandardScaler
try:
    from pymystem3 import Mystem
    from nltk.corpus import stopwords
    PREPROCESSING_AVAILABLE = True
    print("Библиотеки для препроцессинга загружены")
except ImportError:
    PREPROCESSING_AVAILABLE = False
    print("Pymystem3 или NLTK недоступны, используем простую обработку")

def load_data():
    """Load the feature table and the target series from the first usable CSV pair.

    Several candidate directories are probed in a fixed order. Both files are
    read with the ``date`` column as index; the target is the first remaining
    column of the ``y`` file.

    Returns:
        tuple: ``(X, y)`` as (DataFrame, Series), or ``(None, None)`` when no
        candidate pair could be read.
    """
    # Directory prefixes, tried in this order; file names are always x.csv / y.csv.
    prefixes = ("../data/", "./data/", "", "./")

    features, target = None, None
    for prefix in prefixes:
        x_path = prefix + "x.csv"
        y_path = prefix + "y.csv"
        try:
            frame = pd.read_csv(x_path, index_col="date")
            series = pd.read_csv(y_path, index_col="date").iloc[:, 0]
        except FileNotFoundError:
            # Missing files are expected for most candidates: move on quietly.
            continue
        except Exception as e:
            print(f"Ошибка загрузки {x_path}: {e}")
            continue
        features, target = frame, series
        print(f"Данные загружены из: {x_path}, {y_path}")
        break

    if features is None or target is None:
        print("ОШИБКА: Файлы данных не найдены!")
        return None, None

    print(f"Загружено {len(features)} образцов")
    class_counts = dict(target.value_counts().sort_index())
    print(f"Распределение классов: {class_counts}")
    return features, target

def preprocess_texts(texts):
    """Normalise raw texts for vectorisation.

    When ``PREPROCESSING_AVAILABLE`` is true, each text is lower-cased, reduced
    to Cyrillic letters and whitespace, lemmatised with Mystem, and filtered of
    Russian stop words and one-character tokens. If that path is disabled or
    raises, a regex-only clean-up is used instead (punctuation and digits
    replaced by spaces, whitespace collapsed).

    Args:
        texts: pandas Series of raw texts; missing values become ``""``.

    Returns:
        pandas Series of cleaned strings, produced with ``texts.apply``.
    """
    def _basic_clean(raw):
        if pd.isna(raw):
            return ""
        cleaned = str(raw).lower()
        # Punctuation first, then digit runs, each replaced by a single space.
        for pattern in (r'[^\w\s]', r'\d+'):
            cleaned = re.sub(pattern, ' ', cleaned)
        return re.sub(r'\s+', ' ', cleaned).strip()

    if PREPROCESSING_AVAILABLE:
        try:
            lemmatizer = Mystem()
            russian_stops = set(stopwords.words("russian"))

            def _lemmatize_and_filter(raw):
                if pd.isna(raw):
                    return ""
                only_cyrillic = re.sub(r'[^а-яё\s]', ' ', str(raw).lower())
                tokens = ' '.join(lemmatizer.lemmatize(only_cyrillic)).split()
                kept = [tok for tok in tokens
                        if len(tok) >= 2 and tok not in russian_stops]
                return ' '.join(kept)

            print("Продвинутый препроцессинг (Mystem + стоп-слова)")
            return texts.apply(_lemmatize_and_filter)
        except Exception as e:
            # Any failure of the advanced path falls through to the basic clean-up.
            print(f"Ошибка продвинутого препроцессинга: {e}")

    print("Простой препроцессинг")
    return texts.apply(_basic_clean)

class AdvancedFeatureExtractor:
    """Builds the feature sets consumed by the stacking ensemble.

    ``fit_transform`` / ``transform`` return a dict of dense arrays keyed by
    feature-set name: three TF-IDF variants, one count vectoriser, and a
    standardised block of hand-crafted "linguistic" features.

    Attributes:
        feature_extractors: name -> fitted sklearn vectoriser.
        scalers: name -> fitted scaler (only ``'linguistic'`` is used).
    """

    # Lexicons are matched as substrings of the lower-cased text, so a stem such
    # as 'рост' also matches longer words containing it.
    POSITIVE_WORDS = (
        'рост', 'увеличение', 'повышение', 'прибыль', 'доход', 'положительный',
        'улучшение', 'восстановление', 'стабилизация', 'укрепление', 'развитие',
    )
    NEGATIVE_WORDS = (
        'снижение', 'падение', 'уменьшение', 'убыток', 'кризис', 'негативный',
        'ухудшение', 'нестабильность', 'спад', 'сокращение', 'риск',
    )
    # Fix: the original list lacked a comma after 'обычный', which silently fused
    # it with 'сохранение' into one token that never matched anything.
    NEUTRAL_WORDS = (
        'стабильный', 'неизменный', 'нейтральный', 'текущий', 'обычный',
        'сохранение', 'поддержание', 'ожидание', 'прогноз',
    )
    FINANCIAL_TERMS = (
        'рубль', 'доллар', 'процент', 'ставка', 'инфляция', 'валюта', 'банк',
        'кредит', 'депозит', 'инвестиции', 'капитал', 'ликвидность', 'волатильность',
    )
    TIME_INDICATORS = (
        'квартал', 'месяц', 'год', 'период', 'срок', 'временно', 'краткосрочный',
        'долгосрочный', 'будущий', 'прошлый', 'текущий',
    )
    KEY_BIGRAMS = (
        'процентная ставка', 'денежная политика', 'инфляционные ожидания', 'экономический рост',
    )
    KEY_TRIGRAMS = (
        'ключевая процентная ставка', 'совет директоров банка', 'денежно кредитная политика',
    )

    # Column order of the array returned by extract_linguistic_features.
    LINGUISTIC_FEATURE_NAMES = (
        'text_length', 'word_count', 'sentence_count', 'avg_word_length',
        'unique_word_ratio', 'punct_ratio', 'upper_ratio', 'digit_ratio',
        'comma_count', 'period_count', 'exclamation_count', 'question_count',
        'positive_words_count', 'negative_words_count', 'neutral_words_count',
        'sentiment_ratio', 'sentiment_intensity',
        'financial_terms_count', 'financial_density', 'time_indicators_count',
        'complex_words_ratio', 'short_words_ratio',
        'key_bigrams_count', 'key_trigrams_count',
    )

    def __init__(self):
        self.feature_extractors = {}
        self.scalers = {}

    @staticmethod
    def _count_present(terms, text_lower):
        """Return how many of ``terms`` occur at least once as a substring of ``text_lower``."""
        return sum(1 for term in terms if term in text_lower)

    def extract_linguistic_features(self, texts):
        """Compute hand-crafted statistics for every text.

        Args:
            texts: iterable of texts; missing values are treated as ``""`` and
                other non-string values are converted with ``str``.

        Returns:
            numpy.ndarray of shape ``(n_texts, 24)`` with columns ordered as in
            ``LINGUISTIC_FEATURE_NAMES`` (shape ``(0, 24)`` for empty input).
        """
        rows = []
        for text in texts:
            text = "" if pd.isna(text) else str(text)
            text_lower = text.lower()
            words = text_lower.split()
            # max(..., 1) keeps every ratio defined for empty texts.
            n_chars = max(len(text), 1)
            n_words = max(len(words), 1)

            feats = {}
            # Size and surface statistics.
            feats['text_length'] = len(text)
            feats['word_count'] = len(words)
            feats['sentence_count'] = len([s for s in text.split('.') if s.strip()])
            feats['avg_word_length'] = np.mean([len(w) for w in words]) if words else 0
            feats['unique_word_ratio'] = len(set(words)) / n_words
            feats['punct_ratio'] = len(re.findall(r'[^\w\s]', text)) / n_chars
            feats['upper_ratio'] = sum(1 for c in text if c.isupper()) / n_chars
            feats['digit_ratio'] = sum(1 for c in text if c.isdigit()) / n_chars
            feats['comma_count'] = text.count(',')
            feats['period_count'] = text.count('.')
            feats['exclamation_count'] = text.count('!')
            feats['question_count'] = text.count('?')

            # Lexicon hits: number of distinct lexicon entries present in the text.
            positive = self._count_present(self.POSITIVE_WORDS, text_lower)
            negative = self._count_present(self.NEGATIVE_WORDS, text_lower)
            neutral = self._count_present(self.NEUTRAL_WORDS, text_lower)
            feats['positive_words_count'] = positive
            feats['negative_words_count'] = negative
            feats['neutral_words_count'] = neutral

            total_sentiment_words = positive + negative + neutral
            if total_sentiment_words > 0:
                feats['sentiment_ratio'] = (positive - negative) / total_sentiment_words
                feats['sentiment_intensity'] = total_sentiment_words / n_words
            else:
                feats['sentiment_ratio'] = 0
                feats['sentiment_intensity'] = 0

            feats['financial_terms_count'] = self._count_present(self.FINANCIAL_TERMS, text_lower)
            feats['financial_density'] = feats['financial_terms_count'] / n_words
            feats['time_indicators_count'] = self._count_present(self.TIME_INDICATORS, text_lower)

            # Word-length profile.
            feats['complex_words_ratio'] = sum(1 for w in words if len(w) > 6) / n_words
            feats['short_words_ratio'] = sum(1 for w in words if len(w) <= 3) / n_words

            # Key phrases are searched directly in the text; no n-gram lists are needed.
            feats['key_bigrams_count'] = self._count_present(self.KEY_BIGRAMS, text_lower)
            feats['key_trigrams_count'] = self._count_present(self.KEY_TRIGRAMS, text_lower)

            rows.append([feats[name] for name in self.LINGUISTIC_FEATURE_NAMES])

        if not rows:
            return np.empty((0, len(self.LINGUISTIC_FEATURE_NAMES)))
        return np.array(rows, dtype=float)

    def fit_transform(self, texts, verbose=False):
        """Fit all vectorisers and the scaler on ``texts`` and return the feature sets.

        Args:
            texts: iterable of preprocessed text strings.
            verbose: when true, print the width of every feature set.

        Returns:
            dict mapping feature-set name (``'tfidf_1_3'``, ``'tfidf_1_2'``,
            ``'tfidf_char'``, ``'count_vec'``, ``'linguistic'``) to a dense array
            with one row per text.
        """
        if verbose:
            print("Извлечение признаков...")
        # Word- and character-level vectorisers with different n-gram ranges
        # and document-frequency cut-offs.
        self.feature_extractors['tfidf_1_3'] = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=8000,
            min_df=2,
            max_df=0.8,
            sublinear_tf=True
        )
        self.feature_extractors['tfidf_1_2'] = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=5000,
            min_df=3,
            max_df=0.7,
            sublinear_tf=True
        )
        self.feature_extractors['tfidf_char'] = TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            max_features=3000,
            min_df=2
        )
        self.feature_extractors['count_vec'] = CountVectorizer(
            ngram_range=(1, 2),
            max_features=3000,
            min_df=3,
            max_df=0.8
        )
        feature_sets = {}
        for name, extractor in self.feature_extractors.items():
            feature_sets[name] = extractor.fit_transform(texts).toarray()
            if verbose:
                print(f"     {name}: {feature_sets[name].shape[1]} признаков")

        # Hand-crafted features, standardised so that scale-sensitive models can use them.
        linguistic_features = self.extract_linguistic_features(texts)
        if verbose:
            print(f"linguistic: {linguistic_features.shape[1]} признаков")
        self.scalers['linguistic'] = StandardScaler()
        feature_sets['linguistic'] = self.scalers['linguistic'].fit_transform(linguistic_features)
        return feature_sets

    def transform(self, texts):
        """Apply the already fitted vectorisers and scaler to new ``texts``.

        Must be called after ``fit_transform``. Returns a dict with the same
        keys as ``fit_transform``.
        """
        feature_sets = {}
        for name, extractor in self.feature_extractors.items():
            feature_sets[name] = extractor.transform(texts).toarray()
        linguistic_features = self.extract_linguistic_features(texts)
        feature_sets['linguistic'] = self.scalers['linguistic'].transform(linguistic_features)

        return feature_sets

class StackingEnsembleClassifier:
    """Two-level stacking ensemble for the three-class (-1 / 0 / 1) text task.

    Level 1 trains a set of heterogeneous base models, each on one feature set
    produced by ``AdvancedFeatureExtractor``. Level 2 trains three meta-models
    on the concatenated class probabilities of the base models; the final
    probability is the mean of the meta-model outputs.

    Attributes:
        base_models: name -> ``(fitted model, feature-set name)``.
        meta_models: name -> fitted meta-model.
        failed_models: name -> error message for base models whose fit raised.
        label_mapping / reverse_mapping: original label <-> internal index 0..2.
    """

    def __init__(self):
        self.base_models = {}
        self.meta_models = {}
        self.failed_models = {}
        self.feature_extractor = AdvancedFeatureExtractor()
        self.label_mapping = {-1: 0, 0: 1, 1: 2}
        self.reverse_mapping = {0: -1, 1: 0, 2: 1}

    def _create_base_models(self):
        """Return a dict of freshly constructed, unfitted base estimators."""
        models = {
            # Linear models
            'lr_balanced': LogisticRegression(
                max_iter=1000, C=1.0, class_weight='balanced', random_state=42
            ),
            'lr_l1': LogisticRegression(
                max_iter=1000, C=0.5, penalty='l1', solver='liblinear',
                class_weight='balanced', random_state=42
            ),
            'ridge': RidgeClassifier(
                alpha=1.0, class_weight='balanced', random_state=42
            ),
            # Tree ensembles
            'rf_balanced': RandomForestClassifier(
                n_estimators=200, max_depth=15, class_weight='balanced',
                random_state=42, n_jobs=-1
            ),
            'gb_classifier': GradientBoostingClassifier(
                n_estimators=150, learning_rate=0.1, max_depth=6, random_state=42
            ),
            # SVM
            'svm_rbf': SVC(
                kernel='rbf', C=1.0, probability=True, class_weight='balanced', random_state=42
            ),
            'svm_linear': SVC(
                kernel='linear', C=0.5, probability=True, class_weight='balanced', random_state=42
            ),
            # Naive Bayes
            'nb_multinomial': MultinomialNB(alpha=0.1),
            # KNN
            'knn': KNeighborsClassifier(n_neighbors=7, weights='distance'),
            # Neural network
            'mlp': MLPClassifier(
                hidden_layer_sizes=(256, 128), activation='relu',
                alpha=0.001, max_iter=500, random_state=42
            )
        }
        return models

    @staticmethod
    def _add_combined_features(feature_sets):
        """Add the ``'combined'`` set (main TF-IDF + linguistic columns) in place and return the dict."""
        feature_sets['combined'] = np.hstack(
            [feature_sets['tfidf_1_3'], feature_sets['linguistic']]
        )
        return feature_sets

    def _class_probabilities(self, model, X_features):
        """Return an ``(n_samples, n_classes)`` probability matrix for ``model``.

        Models whose ``predict_proba`` is missing or fails contribute a one-hot
        encoding of ``predict``. When a model was fitted on data lacking some
        class, its narrower output is scattered into the full-width matrix via
        ``model.classes_`` so that column ``k`` always means internal class ``k``.
        """
        n_classes = len(self.reverse_mapping)
        try:
            proba = np.asarray(model.predict_proba(X_features), dtype=float)
        except Exception:
            # Best-effort fallback kept from the original design.
            pred = np.asarray(model.predict(X_features), dtype=int)
            return np.eye(n_classes)[pred]

        classes = getattr(model, 'classes_', None)
        if classes is None or proba.shape[1] == n_classes:
            return proba
        aligned = np.zeros((proba.shape[0], n_classes))
        aligned[:, np.asarray(classes, dtype=int)] = proba
        return aligned

    def _collect_meta_features(self, feature_sets):
        """Return the list of per-base-model probability matrices, in ``base_models`` order."""
        return [
            self._class_probabilities(model, feature_sets[feature_name])
            for model, feature_name in self.base_models.values()
        ]

    def fit(self, texts, labels, verbose=False):
        """Fit feature extractors, base models and meta-models.

        Args:
            texts: iterable of preprocessed text strings.
            labels: iterable of labels from ``{-1, 0, 1}``; any other value
                raises ``KeyError``.
            verbose: when true, print feature-set sizes and base-model fit errors.

        Returns:
            self. Base models whose ``fit`` raises are skipped and recorded in
            ``failed_models``.
        """
        feature_sets = self._add_combined_features(
            self.feature_extractor.fit_transform(texts, verbose=verbose)
        )

        # Map original labels to the internal indices 0..2.
        y_mapped = [self.label_mapping[label] for label in labels]

        base_models = self._create_base_models()

        # (model name, feature-set name, estimator). Estimators built inline get a
        # fixed random_state as well, so repeated runs give the same predictions.
        model_configs = [
            # Word-level TF-IDF models
            ('lr_balanced_tfidf_1_3', 'tfidf_1_3', base_models['lr_balanced']),
            ('lr_l1_tfidf_1_2', 'tfidf_1_2', base_models['lr_l1']),
            ('svm_rbf_tfidf', 'tfidf_1_3', base_models['svm_rbf']),
            ('nb_tfidf', 'tfidf_1_2', base_models['nb_multinomial']),
            # Character-level models
            ('lr_char', 'tfidf_char', LogisticRegression(
                max_iter=1000, C=0.1, class_weight='balanced', random_state=42)),
            ('svm_char', 'tfidf_char', SVC(
                kernel='linear', C=0.1, probability=True, class_weight='balanced', random_state=42)),
            # Count-vectoriser models
            ('rf_count', 'count_vec', base_models['rf_balanced']),
            ('gb_count', 'count_vec', base_models['gb_classifier']),
            # Models on the linguistic features
            ('lr_linguistic', 'linguistic', LogisticRegression(
                max_iter=1000, C=0.1, class_weight='balanced', random_state=42)),
            ('rf_linguistic', 'linguistic', RandomForestClassifier(
                n_estimators=100, class_weight='balanced', random_state=42)),
            ('knn_linguistic', 'linguistic', base_models['knn']),
            # Models on the combined features
            ('mlp_combined', 'combined', base_models['mlp']),
            ('rf_combined', 'combined', RandomForestClassifier(
                n_estimators=150, max_depth=12, class_weight='balanced', random_state=42)),
        ]

        # Level 1: fit every base model on its own feature set.
        self.base_models = {}
        self.meta_models = {}
        self.failed_models = {}
        for model_name, feature_name, model in model_configs:
            try:
                model.fit(feature_sets[feature_name], y_mapped)
            except Exception as e:
                self.failed_models[model_name] = str(e)
                if verbose:
                    print(f"Ошибка обучения {model_name}: {e}")
                continue
            self.base_models[model_name] = (model, feature_name)

        # Level 2: meta-models on the base-model probabilities.
        # NOTE(review): these probabilities are computed on the very rows the base
        # models were fitted on, so the meta-models see optimistic inputs;
        # out-of-fold predictions would avoid that at the cost of extra fits.
        meta_features = self._collect_meta_features(feature_sets)
        if meta_features:
            meta_X = np.hstack(meta_features)
            self.meta_models = {
                'meta_lr': LogisticRegression(max_iter=1000, C=0.1, random_state=42),
                'meta_rf': RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42),
                'meta_gb': GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, random_state=42)
            }
            for model in self.meta_models.values():
                model.fit(meta_X, y_mapped)
        return self

    def predict_proba(self, texts):
        """Return class probabilities for ``texts``.

        Returns:
            numpy.ndarray of shape ``(len(texts), 3)``; column ``k`` is the
            probability of the label ``reverse_mapping[k]``. A uniform
            distribution is returned when no base or meta model is available.
        """
        n_classes = len(self.reverse_mapping)
        if not self.base_models or not self.meta_models:
            return np.full((len(texts), n_classes), 1 / n_classes)

        feature_sets = self._add_combined_features(self.feature_extractor.transform(texts))
        meta_X = np.hstack(self._collect_meta_features(feature_sets))

        meta_predictions = [
            self._class_probabilities(model, meta_X)
            for model in self.meta_models.values()
        ]
        return np.mean(meta_predictions, axis=0)

    def predict(self, texts):
        """Return the most probable original label (-1, 0 or 1) for every text."""
        probas = self.predict_proba(texts)
        predictions = np.argmax(probas, axis=1)
        return [self.reverse_mapping[pred] for pred in predictions]

def run_advanced_ensemble_experiment(X, y, min_train_size=30):
    """Evaluate the stacking ensemble with an expanding training window.

    For every position ``i`` from ``min_train_size`` to the end, a fresh
    ensemble is fitted on rows ``[:i]`` and asked to predict row ``i``.

    Args:
        X: DataFrame with a ``release`` column holding the raw texts.
        y: pandas Series of labels, positionally aligned with ``X``.
        min_train_size: number of leading rows used only for training
            (default 30, the value previously hard-coded).

    Returns:
        list of ``(predicted_label, probabilities)`` tuples, one per evaluated
        row. If an iteration raises, ``(0, [0.33, 0.34, 0.33])`` is recorded
        for it so the list stays aligned with the evaluated rows.
    """
    X_preprocessed = preprocess_texts(X["release"])
    print(f"Препроцессинг завершен для {len(X_preprocessed)} текстов")

    n_iterations = len(X_preprocessed) - min_train_size
    predictions = []
    # Verbose output only for the first two of (roughly) five progress steps.
    verbose_step = max(1, n_iterations // 5)
    print(f"\n Обучение на {n_iterations} итерациях...")
    for i in tqdm(range(min_train_size, len(X_preprocessed)), desc="Обучение модели"):
        try:
            # Training data: everything strictly before row i.
            train_texts = X_preprocessed.iloc[:i].tolist()
            train_labels = y.iloc[:i].tolist()

            ensemble = StackingEnsembleClassifier()
            step = i - min_train_size
            show_verbose = step % verbose_step == 0 and step < verbose_step * 2
            ensemble.fit(train_texts, train_labels, verbose=show_verbose)

            # Predict the single held-out row.
            test_text = [X_preprocessed.iloc[i]]
            y_pred_proba = ensemble.predict_proba(test_text)[0]
            y_pred = ensemble.predict(test_text)[0]
            predictions.append((y_pred, y_pred_proba))
        except Exception as e:
            print(f"\nОшибка на итерации {i}: {e}")
            predictions.append((0, np.array([0.33, 0.34, 0.33])))
    return predictions

def calculate_and_display_metrics(predictions, y_true):
    """Compute, print and return the evaluation metrics.

    Args:
        predictions: list of ``(predicted_label, probabilities)`` tuples, where
            ``probabilities`` has one entry per class in the order -1, 0, 1.
        y_true: sequence of true labels aligned with ``predictions``; the longer
            of the two is truncated to the common length.

    Returns:
        dict with ``accuracy``, macro ``f1`` / ``recall`` / ``precision``,
        ``roc_auc_ovr``, ``roc_auc_ovo`` and the original ``predictions``.
        Both ROC-AUC values fall back to 0.5 when they cannot be computed.
    """
    class_labels = [-1, 0, 1]
    class_names = ['Negative (-1)', 'Neutral (0)', 'Positive (1)']

    print("\n" + "="*80)
    print("РЕЗУЛЬТАТЫ ADVANCED STACKING ENSEMBLE МОДЕЛИ")
    print("="*80)

    # Split the tuples into hard predictions and probability rows.
    y_preds = [pred[0] for pred in predictions]
    y_preds_proba = np.array([pred[1] for pred in predictions])

    # Truncate to a common length.
    min_len = min(len(y_true), len(y_preds))
    y_true = y_true[:min_len]
    y_preds = y_preds[:min_len]
    y_preds_proba = y_preds_proba[:min_len]

    accuracy = accuracy_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds, average='macro')
    recall = recall_score(y_true, y_preds, average='macro')
    precision = precision_score(y_true, y_preds, average='macro')

    try:
        roc_auc_ovr = roc_auc_score(y_true, y_preds_proba, average='macro', multi_class='ovr')
        roc_auc_ovo = roc_auc_score(y_true, y_preds_proba, average='macro', multi_class='ovo')
    except Exception as e:
        # Keep the best-effort fallback, but say why the neutral 0.5 is reported.
        print(f"ROC-AUC не рассчитан: {e}")
        roc_auc_ovr = 0.5
        roc_auc_ovo = 0.5

    print(f"\nОСНОВНЫЕ МЕТРИКИ ADVANCED ENSEMBLE:")
    print(f"Accuracy:     {accuracy:.6f}")
    print(f"F1-score:     {f1:.6f}")
    print(f"Recall:       {recall:.6f}")
    print(f"Precision:    {precision:.6f}")
    print(f"ROC-AUC OvR:  {roc_auc_ovr:.6f}")
    print(f"ROC-AUC OvO:  {roc_auc_ovo:.6f}")

    # Per-class report. Explicit labels keep the three names valid even when a
    # class is absent from both y_true and y_preds.
    print(f"\nCLASSIFICATION REPORT:")
    print(classification_report(y_true, y_preds,
                                labels=class_labels,
                                target_names=class_names))
    return {
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall,
        'precision': precision,
        'roc_auc_ovr': roc_auc_ovr,
        'roc_auc_ovo': roc_auc_ovo,
        'predictions': predictions
    }

def main():
    """Run the whole experiment: load data, evaluate the ensemble, report metrics.

    Returns:
        The metrics dict produced by ``calculate_and_display_metrics``, or
        ``None`` when the data cannot be loaded or the experiment raises.
    """
    print("ЗАПУСК ADVANCED STACKING ENSEMBLE ЭКСПЕРИМЕНТА")
    print("="*80)

    # Step 1: data
    features, target = load_data()
    if features is None or target is None:
        print("Не удалось загрузить данные. Завершение работы.")
        return None

    try:
        # Step 2: expanding-window evaluation
        predictions = run_advanced_ensemble_experiment(features, target)
        n_predictions = len(predictions)
        print(f"Получено {n_predictions} предсказаний")

        # Step 3: metrics against the true labels of the evaluated rows
        # (the first 30 rows serve only as training data)
        first_eval = 30
        y_true = target.iloc[first_eval:first_eval + n_predictions].tolist()
        return calculate_and_display_metrics(predictions, y_true)
    except Exception as e:
        print(f"Ошибка во время эксперимента: {e}")
        import traceback
        traceback.print_exc()
        return None
# Run the experiment when this code is executed as the main module;
# otherwise only print a hint on how to start it manually.
if __name__ == "__main__":
    results = main()
else:
    print("Advanced Stacking Ensemble модель готова к запуску!")
    print("Для запуска выполните: results = main()")

Библиотеки для препроцессинга загружены
ЗАПУСК ADVANCED STACKING ENSEMBLE ЭКСПЕРИМЕНТА
Данные загружены из: x.csv, y.csv
📊 Загружено 100 образцов
📈 Распределение классов: {-1.0: np.int64(30), 0.0: np.int64(45), 1.0: np.int64(25)}


Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


Продвинутый препроцессинг (Mystem + стоп-слова)
Препроцессинг завершен для 100 текстов

🔄 Обучение на 70 итерациях...


Обучение модели:   0%|          | 0/70 [00:00<?, ?it/s]

 Извлечение признаков...
     tfidf_1_3: 4512 признаков
     tfidf_1_2: 1541 признаков
     tfidf_char: 3000 признаков
     count_vec: 1601 признаков
linguistic: 24 признаков


Обучение модели:  20%|██        | 14/70 [02:32<12:17, 13.17s/it]

 Извлечение признаков...
     tfidf_1_3: 6740 признаков
     tfidf_1_2: 2237 признаков
     tfidf_char: 3000 признаков
     count_vec: 2266 признаков
linguistic: 24 признаков


Обучение модели: 100%|██████████| 70/70 [19:02<00:00, 16.32s/it]

Получено 70 предсказаний

  РЕЗУЛЬТАТЫ ADVANCED STACKING ENSEMBLE МОДЕЛИ

🚀 ОСНОВНЫЕ МЕТРИКИ ADVANCED ENSEMBLE:
   🎯 Accuracy:     0.614286
   📈 F1-score:     0.621076
   🔄 Recall:       0.609704
   🎪 Precision:    0.650350
   📊 ROC-AUC OvR:  0.729279
   📊 ROC-AUC OvO:  0.735330

📋 CLASSIFICATION REPORT:
               precision    recall  f1-score   support

Negative (-1)       0.67      0.70      0.68        23
  Neutral (0)       0.52      0.61      0.56        28
 Positive (1)       0.77      0.53      0.62        19

     accuracy                           0.61        70
    macro avg       0.65      0.61      0.62        70
 weighted avg       0.63      0.61      0.62        70






Ансамбль моделей не дал существенно лучшего результата: по качеству он находится ближе к середине среди остальных моделей (Accuracy: 0.614286).
Модель хорошо отрабатывает длинные и короткие тексты и сложную лексику (благодаря char-level и лингвистическим признакам). Возможно, требуется поработать с лингвистическими признаками и n-граммами.