In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from scipy import stats
import warnings
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import torch
warnings.filterwarnings('ignore')

class AdvancedFinancialPreprocessor:
    """
    Advanced pipeline for financial data preprocessing for multi-ticker forecasting.
    """
    
    def __init__(self, target_periods=None, stationarity_alpha=0.05,
                 use_news=True, embedding_dim=32):
        """Configure the preprocessor; no data is processed here.

        Args:
            target_periods: Iterable of horizons N; one ``return_{N}d`` target
                is built per entry. ``None`` (the default) means 1..20.
            stationarity_alpha: Threshold the ADF p-value is compared with in
                ``_check_stationarity``.
            use_news: Enables news embedding generation and merging.
            embedding_dim: Number of PCA components used to compress news
                embeddings.
        """
        # A list default is evaluated once at definition time and would be
        # shared (and mutable) across all instances; copy per instance instead.
        if target_periods is None:
            target_periods = range(1, 21)
        self.target_periods = list(target_periods)
        self.stationarity_alpha = stationarity_alpha
        # Per-ticker {'last_data', 'last_date'} written by fit_transform/transform.
        self.state = {}
        # Per-ticker result of _apply_returns_stationarity_correction.
        self.stationary_info = {}
        # '{ticker}_return_{N}d' -> transform parameters used for inversion.
        self.stationary_params = {}
        self.feature_columns = []
        self.is_fitted = False
        self.use_news = use_news
        self.embedding_dim = embedding_dim
        # Created lazily by _load_news_model.
        self.news_model = None
        # Fitted inside _process_news_data.
        self.pca = None
        self.scaler = StandardScaler()
        
    def _load_news_model(self):
        """Load news embedding model if not already loaded."""
        if self.news_model is None and self.use_news:
            try:
                print("🔄 Loading news embedding model...")
                self.news_model = SentenceTransformer("all-MiniLM-L6-v2")
                print("✅ News model loaded successfully!")
            except Exception as e:
                print(f"❌ Failed to load news model: {e}")
                self.use_news = False
    
    def _process_news_data(self, news_df, cache_path=None):
        """Process news data and generate compressed embeddings."""
        if not self.use_news or news_df is None:
            return {}
            
        self._load_news_model()
        
        print("📰 Processing news data with compressed embeddings...")
        
        news_df = news_df.copy()
        news_df['news_date'] = pd.to_datetime(news_df['publish_date']).dt.strftime('%Y-%m-%d')
        news_df['combined_text'] = news_df['title'] + ". " + news_df['publication'].fillna('')
        
        daily_news = news_df.groupby('news_date')['combined_text'].apply(
            lambda x: ' '.join(x.dropna())
        ).reset_index()
        
        embeddings_by_date = {}
        all_embeddings = []
        dates_list = []
        
        for idx, row in daily_news.iterrows():
            date = row['news_date']
            text = row['combined_text']
            
            if not text or len(text.strip()) < 10:
                continue
                
            try:
                embedding = self.news_model.encode(
                    text,
                    batch_size=32,
                    show_progress_bar=False,
                    normalize_embeddings=True
                )
                embeddings_by_date[date] = embedding
                all_embeddings.append(embedding)
                dates_list.append(date)
                    
            except Exception as e:
                print(f"❌ Error encoding news for date {date}: {e}")
                continue
        
        if all_embeddings and len(all_embeddings) > self.embedding_dim:
            print(f"🔧 Compressing embeddings from {all_embeddings[0].shape[0]} to {self.embedding_dim} dimensions...")
            all_embeddings_array = np.array(all_embeddings)
            
            if self.pca is None:
                self.pca = PCA(n_components=self.embedding_dim)
                compressed_embeddings = self.pca.fit_transform(all_embeddings_array)
            else:
                compressed_embeddings = self.pca.transform(all_embeddings_array)
            
            for i, date in enumerate(dates_list):
                embeddings_by_date[date] = compressed_embeddings[i]
        
        print(f"✅ Generated compressed embeddings for {len(embeddings_by_date)} news dates")
        return embeddings_by_date
    
    def _add_news_embeddings(self, df, news_embeddings):
        """Add compressed news embeddings to the dataframe."""
        if not self.use_news or not news_embeddings:
            return df
            
        print("🔗 Adding compressed news embeddings to dataset...")
        
        df = df.copy()
        df['date_str'] = pd.to_datetime(df['begin']).dt.strftime('%Y-%m-%d')
        
        first_embedding = next(iter(news_embeddings.values()))
        embedding_dim = len(first_embedding)
        
        news_columns = [f'news_emb_{i}' for i in range(embedding_dim)]
        
        for col in news_columns:
            df[col] = 0.0
        
        embeddings_added = 0
        for date_str, embedding in news_embeddings.items():
            mask = df['date_str'] == date_str
            if mask.any():
                for i, value in enumerate(embedding):
                    df.loc[mask, f'news_emb_{i}'] = value
                embeddings_added += mask.sum()
        
        df = df.drop(columns=['date_str'])
        
        print(f"✅ Added {embedding_dim} compressed news embeddings to {embeddings_added} rows")
        return df

    def _create_complete_calendar(self, df, ticker):
        """Create complete calendar with forward fill for missing dates."""
        df = df.copy()
        df['date'] = pd.to_datetime(df['begin'])
        
        start_date = df['date'].min()
        end_date = df['date'].max()
        full_calendar = pd.date_range(start=start_date, end=end_date, freq='D')
        
        calendar_df = pd.DataFrame({'date': full_calendar})
        calendar_df['ticker'] = ticker
        calendar_df['begin'] = calendar_df['date'].dt.strftime('%Y-%m-%d')
        
        df_for_merge = df.drop(columns=['date'], errors='ignore')
        
        merged_df = calendar_df.merge(
            df_for_merge, on=['begin', 'ticker'], how='left', suffixes=('_calendar', '_data')
        )
        
        price_columns = ['open', 'high', 'low', 'close', 'volume']
        for col in price_columns:
            if col in merged_df.columns:
                merged_df[col] = merged_df[col].fillna(method='ffill')
        
        for col in price_columns:
            if col in merged_df.columns:
                merged_df[col] = merged_df[col].fillna(method='bfill')
        
        merged_df['is_original'] = ~merged_df['close'].isna()
        
        return merged_df

    def _calculate_returns_targets(self, df, ticker):
        """Calculate future returns targets: r_(t+N) = close_(t+N)/close_t - 1."""
        df = df.copy()
        
        original_days = df[df['is_original'] == True].copy()
        original_days = original_days.sort_values('begin')
        
        if len(original_days) == 0:
            print(f"⚠️ No original days found for ticker {ticker}")
            return pd.DataFrame()
        
        dates_list = original_days['begin'].tolist()
        close_prices = original_days['close'].tolist()
        
        targets_data = []
        
        for idx, current_date in enumerate(dates_list):
            row_targets = {'begin': current_date, 'ticker': ticker}
            current_close = close_prices[idx]
            
            for N in self.target_periods:
                future_idx = idx + N
                if future_idx < len(dates_list):
                    future_close = close_prices[future_idx]
                    return_value = (future_close / current_close) - 1
                    row_targets[f'return_{N}d'] = return_value
                else:
                    row_targets[f'return_{N}d'] = np.nan
            
            targets_data.append(row_targets)
        
        targets_df = pd.DataFrame(targets_data)
        targets_df = targets_df.drop_duplicates(subset=['begin', 'ticker'], keep='last')
        return targets_df

    def _check_stationarity(self, series, max_lags=None):
        """Enhanced stationarity check handling all non-finite values."""
        series_array = np.asarray(series)
        finite_mask = np.isfinite(series_array)
        
        if np.sum(finite_mask) < 10:
            return False, 1.0
        
        series_clean = series_array[finite_mask]
        
        try:
            adf_result = adfuller(series_clean, maxlag=max_lags, autolag='AIC')
            p_value = adf_result[1]
            return p_value < self.stationarity_alpha, p_value
        except Exception as e:
            print(f"ADF test failed: {e}")
            return False, 1.0

    def _apply_returns_stationarity_correction(self, df, ticker):
        """Apply stationarity correction to returns targets using safe methods."""
        df = df.copy()
        stationary_targets = {}
        
        for N in self.target_periods:
            target_col = f'return_{N}d'
            if target_col not in df.columns:
                continue
                
            original_mask = (df['is_original'] == True) & (df[target_col].notna())
            original_series = df.loc[original_mask, target_col]
            
            if len(original_series) < 10:
                print(f"⚠️ Not enough data for {target_col}: {len(original_series)} points")
                df[f'stationary_return_{N}d'] = df[target_col]
                stationary_targets[target_col] = ('none', 1.0)
                continue
            
            is_stationary, p_value = self._check_stationarity(original_series)
            
            if not is_stationary:
                print(f"🔄 Applying stationarity correction for {target_col} (p-value: {p_value:.4f})")
                
                log_stationary, log_pvalue = False, 1.0
                log_series = None
                if (original_series > -1).all():
                    log_series = np.log1p(original_series)
                    log_stationary, log_pvalue = self._check_stationarity(log_series)
                
                diff_series = original_series.diff().dropna()
                diff_stationary, diff_pvalue = self._check_stationarity(diff_series) if len(diff_series) > 0 else (False, 1.0)
                
                methods = [
                    ('log', log_stationary, log_pvalue, log_series),
                    ('diff', diff_stationary, diff_pvalue, diff_series),
                ]
                
                valid_methods = [(name, stat, pval, data) for name, stat, pval, data in methods 
                               if data is not None and len(data) > 0 and stat]
                
                if valid_methods:
                    best_method = min(valid_methods, key=lambda x: x[2])
                    method_name, _, best_pvalue, best_data = best_method
                    
                    if method_name == 'log':
                        df[f'stationary_return_{N}d'] = np.log1p(df[target_col])
                        self.stationary_params[f'{ticker}_return_{N}d'] = {
                            'method': 'log',
                            'original_mean': original_series.mean(),
                            'original_std': original_series.std()
                        }
                    elif method_name == 'diff':
                        df[f'stationary_return_{N}d'] = df[target_col].diff()
                        first_value = original_series.iloc[0] if len(original_series) > 0 else 0
                        self.stationary_params[f'{ticker}_return_{N}d'] = {
                            'method': 'diff',
                            'first_value': first_value
                        }
                    
                    stationary_targets[target_col] = (method_name, best_pvalue)
                    print(f"✅ {ticker}, {target_col}: {method_name} applied (p-value: {best_pvalue:.4f})")
                else:
                    stationary_targets[target_col] = ('none', p_value)
                    df[f'stationary_return_{N}d'] = df[target_col]
                    self.stationary_params[f'{ticker}_return_{N}d'] = {'method': 'none'}
                    print(f"❌ {ticker}, {target_col}: No effective transformation found (p-value: {p_value:.4f})")
            else:
                stationary_targets[target_col] = ('none', p_value)
                df[f'stationary_return_{N}d'] = df[target_col]
                self.stationary_params[f'{ticker}_return_{N}d'] = {'method': 'none'}
                print(f"✅ {ticker}, {target_col}: Already stationary (p-value: {p_value:.4f})")
        
        return df, stationary_targets

    def _inverse_transform_predictions(self, predictions, ticker, horizon):
        """Inverse transform predictions from stationary space back to returns."""
        key = f'{ticker}_return_{horizon}d'
        
        if key not in self.stationary_params:
            return predictions
        
        params = self.stationary_params[key]
        method = params.get('method', 'none')
        
        if method == 'log':
            return np.exp(predictions) - 1
        elif method == 'diff':
            print(f"⚠️ Differencing inverse transform requires last value - returning as is for {ticker} {horizon}d")
            return predictions
        else:
            return predictions

    def _create_leak_proof_features(self, df):
        """Create features using ONLY historical data (no future leakage)."""
        df = df.copy()
        
        required_columns = ['open', 'high', 'low', 'close', 'volume']
        if not all(col in df.columns for col in required_columns):
            print("⚠️ Missing required price columns")
            return df
        
        # Basic price transformations
        df['hl_range'] = (df['high'] - df['low']) / df['close']
        df['oc_range'] = (df['close'] - df['open']) / df['open']
        df['price_change'] = df['close'].pct_change()
        
        # Moving averages (historical only)
        for window in [5, 10, 20]:
            df[f'sma_{window}'] = df['close'].rolling(window, min_periods=1).mean()
            df[f'ema_{window}'] = df['close'].ewm(span=window, min_periods=1).mean()
        
        # Volatility (historical)
        for window in [5, 10, 20]:
            df[f'volatility_{window}'] = df['price_change'].rolling(window, min_periods=1).std()
        
        # RSI (historical)
        def compute_rsi(series, window=14):
            delta = series.diff()
            gain = (delta.where(delta > 0, 0)).rolling(window, min_periods=1).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window, min_periods=1).mean()
            rs = gain / loss
            rsi = 100 - (100 / (1 + rs))
            return rsi.replace([np.inf, -np.inf], np.nan).fillna(50)
        
        df['rsi_14'] = compute_rsi(df['close'])
        
        # MACD (historical)
        exp1 = df['close'].ewm(span=12, min_periods=1).mean()
        exp2 = df['close'].ewm(span=26, min_periods=1).mean()
        df['macd'] = exp1 - exp2
        df['macd_signal'] = df['macd'].ewm(span=9, min_periods=1).mean()
        
        # Lag features (historical only)
        for lag in [1, 2, 3, 5, 7]:
            df[f'close_lag_{lag}'] = df['close'].shift(lag)
            df[f'volume_lag_{lag}'] = df['volume'].shift(lag)
            df[f'price_change_lag_{lag}'] = df['price_change'].shift(lag)
        
        # Rolling statistics
        for window in [5, 10]:
            df[f'close_rolling_mean_{window}'] = df['close'].rolling(window, min_periods=1).mean()
            df[f'close_rolling_std_{window}'] = df['close'].rolling(window, min_periods=1).std()
            df[f'volume_rolling_mean_{window}'] = df['volume'].rolling(window, min_periods=1).mean()
        
        # Temporal features
        if 'date' not in df.columns:
            df['date'] = pd.to_datetime(df['begin'])
        df['day_of_week'] = df['date'].dt.dayofweek
        df['month'] = df['date'].dt.month
        df['quarter'] = df['date'].dt.quarter
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
        df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
        df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
        df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
        
        # Fill any remaining NaN values
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        df[numeric_columns] = df[numeric_columns].fillna(method='bfill').fillna(method='ffill')
        
        return df

    def _clean_target_columns(self, df):
        """Remove original target columns, keep only stationary versions."""
        original_targets = [f'return_{N}d' for N in self.target_periods]
        columns_to_keep = [col for col in df.columns if col not in original_targets]
        return df[columns_to_keep]

    def _safe_merge(self, left, right, on, how='left'):
        """Safe merge with duplicate handling and validation."""
        left_clean = left.drop_duplicates(subset=on, keep='last')
        right_clean = right.drop_duplicates(subset=on, keep='last')
        
        try:
            merged = left_clean.merge(right_clean, on=on, how=how, validate='one_to_one')
            return merged
        except Exception as e:
            print(f"⚠️ Merge validation failed: {e}, using safe merge without validation")
            return left_clean.merge(right_clean, on=on, how=how)

    def _filter_data_by_date(self, df, min_date='2022-05-01'):
        """Filter out data before May 2022."""
        if df.empty:
            return df
            
        df = df.copy()
        df['date'] = pd.to_datetime(df['begin'])
        min_date = pd.to_datetime(min_date)
        
        initial_count = len(df)
        df = df[df['date'] >= min_date].copy()
        filtered_count = len(df)
        
        if initial_count > filtered_count:
            print(f"📅 Filtered out {initial_count - filtered_count} rows before {min_date}")
        
        if 'date' in df.columns and 'begin' in df.columns:
            df = df.drop(columns=['date'])
            
        return df

    def fit_transform(self, df, news_df=None, calculate_targets=True, news_cache_path=None):
        """Process training data for returns prediction."""
        print("🚀 Processing training data for returns prediction...")
        
        if df.empty:
            raise ValueError("Input DataFrame is empty")
        
        required_columns = ['begin', 'ticker', 'open', 'high', 'low', 'close', 'volume']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")
        
        # Process news data
        news_embeddings = {}
        if self.use_news and news_df is not None:
            news_embeddings = self._process_news_data(news_df, cache_path=news_cache_path)
        
        processed_dfs = []
        
        for ticker in df['ticker'].unique():
            print(f"\n📊 Processing ticker: {ticker}")
            
            ticker_data = df[df['ticker'] == ticker].copy().sort_values('begin')
            
            if len(ticker_data) == 0:
                print(f"⚠️ No data for ticker {ticker}, skipping")
                continue
            
            # Create complete calendar
            regular_data = self._create_complete_calendar(ticker_data, ticker)
            
            # Calculate returns targets
            if calculate_targets:
                targets_df = self._calculate_returns_targets(regular_data, ticker)
                if len(targets_df) > 0:
                    regular_data = self._safe_merge(regular_data, targets_df, on=['begin', 'ticker'])
                    
                    # Apply stationarity correction
                    regular_data, stationary_targets = self._apply_returns_stationarity_correction(regular_data, ticker)
                    self.stationary_info[ticker] = stationary_targets
                else:
                    print(f"⚠️ No targets calculated for {ticker}")
            
            # Create leak-proof features
            regular_data = self._create_leak_proof_features(regular_data)
            
            # Add news embeddings
            if self.use_news:
                regular_data = self._add_news_embeddings(regular_data, news_embeddings)
            
            # Clean target columns - keep only stationary versions
            if calculate_targets:
                regular_data = self._clean_target_columns(regular_data)
            
            # Filter data before May 2022
            regular_data = self._filter_data_by_date(regular_data, '2022-05-01')
            
            # Keep ticker as string for CatBoost
            regular_data['ticker'] = ticker
            
            # Save state
            self.state[ticker] = {
                'last_data': regular_data.copy(),
                'last_date': regular_data['begin'].max()
            }
            
            processed_dfs.append(regular_data)
        
        if not processed_dfs:
            raise ValueError("No data processed - check input data")
        
        # Combine all data
        final_df = pd.concat(processed_dfs, ignore_index=True)
        final_df = final_df.sort_values(['begin', 'ticker']).reset_index(drop=True)
        
        # Define feature columns (ВКЛЮЧАЕМ 'ticker')
        self.feature_columns = [
            col for col in final_df.columns 
            if not col.startswith(('return_', 'stationary_return_', 'target_')) 
            and col not in ['begin', 'date', 'is_original', 'news_date']
        ]
        
        self.is_fitted = True
        print(f"\n✅ Processing complete! Created {len(self.feature_columns)} features")
        print(f"📈 Processed tickers: {len(processed_dfs)}")
        
        return final_df

    def transform(self, df, news_df=None, calculate_targets=False, news_cache_path=None):
        """Process new data incrementally for inference."""
        if not self.is_fitted:
            raise ValueError("Preprocessor must be fitted first!")
        
        print("🔄 Incremental processing of new data...")
        
        news_embeddings = {}
        if self.use_news and news_df is not None:
            news_embeddings = self._process_news_data(news_df, cache_path=news_cache_path)
        
        processed_dfs = []
        
        for ticker in df['ticker'].unique():
            if ticker not in self.state:
                print(f"⚠️ Skipping unknown ticker: {ticker}")
                continue
                
            print(f"📊 Processing ticker: {ticker}")
            
            new_ticker_data = df[df['ticker'] == ticker].copy().sort_values('begin')
            saved_state = self.state[ticker]
            last_data = saved_state['last_data']
            
            new_ticker_data = self._filter_data_by_date(new_ticker_data, '2022-05-01')
            
            last_known_date = pd.to_datetime(saved_state['last_date'])
            new_ticker_data_dates = pd.to_datetime(new_ticker_data['begin'])
            new_ticker_data = new_ticker_data[new_ticker_data_dates > last_known_date].copy()
            
            if len(new_ticker_data) == 0:
                print(f"⚠️ No new data for {ticker}")
                continue
            
            combined_data = pd.concat([last_data, new_ticker_data], ignore_index=True)
            combined_data = combined_data.drop_duplicates(
                subset=['begin', 'ticker'], keep='last'
            ).sort_values('begin').reset_index(drop=True)
            
            regular_data = self._create_complete_calendar(combined_data, ticker)
            regular_data = self._create_leak_proof_features(regular_data)
            
            if self.use_news:
                regular_data = self._add_news_embeddings(regular_data, news_embeddings)
            
            final_ticker_data = regular_data[
                pd.to_datetime(regular_data['begin']) > last_known_date
            ].copy()
            
            final_ticker_data = self._filter_data_by_date(final_ticker_data, '2022-05-01')
            final_ticker_data['ticker'] = ticker
            
            self.state[ticker] = {
                'last_data': regular_data.copy(),
                'last_date': regular_data['begin'].max()
            }
            
            processed_dfs.append(final_ticker_data)
        
        if not processed_dfs:
            print("⚠️ No new data processed")
            return pd.DataFrame()
        
        final_df = pd.concat(processed_dfs, ignore_index=True)
        final_df = final_df.sort_values(['begin', 'ticker']).reset_index(drop=True)
        
        print(f"✅ New data processed! Size: {final_df.shape}")
        return final_df

    def get_feature_names(self):
        """Get list of feature names."""
        return self.feature_columns.copy()

    def get_returns_target_names(self):
        """Get list of stationary returns target names."""
        return [f'stationary_return_{N}d' for N in self.target_periods]

    def get_final_predictions_format(self, predictions, tickers):
        """Convert model predictions to final competition format with inverse transformation."""
        results = []
        
        for i, ticker in enumerate(tickers):
            row = {'ticker': ticker}
            for j, N in enumerate(self.target_periods):
                if hasattr(predictions, 'shape') and len(predictions.shape) > 1:
                    pred_value = predictions[i, j]
                else:
                    pred_value = predictions[j] if i == 0 else predictions[j + i * len(self.target_periods)]
                
                inverse_pred = self._inverse_transform_predictions(pred_value, ticker, N)
                row[f'p{N}'] = inverse_pred
            
            results.append(row)
        
        return pd.DataFrame(results)

def train_final_catboost_model(X_train, y_train, categorical_features=['ticker']):
    """
    Fit a multi-output CatBoost regressor on the full training set.

    Args:
        X_train: Feature DataFrame.
        y_train: Target DataFrame (one column per output).
        categorical_features: Column names of ``X_train`` to be treated as
            categorical; they are passed to CatBoost by position.

    Returns:
        ``(model, train_pred)``: the fitted model and its predictions on
        ``X_train``. The in-sample MAE is printed.
    """
    from catboost import CatBoostRegressor
    import numpy as np

    print("🎯 Training final CatBoost model...")

    # CatBoost receives categorical columns as positional indices.
    cat_features_indices = [
        position
        for position, column in enumerate(X_train.columns)
        if column in categorical_features
    ]

    hyperparameters = dict(
        iterations=300,
        learning_rate=0.02,
        depth=8,
        loss_function='MultiRMSE',
        random_seed=42,
        verbose=100,
        early_stopping_rounds=10,
        thread_count=-1,
    )
    model = CatBoostRegressor(**hyperparameters)

    model.fit(X_train, y_train, cat_features=cat_features_indices, verbose=False)

    train_pred = model.predict(X_train)
    absolute_errors = np.abs(train_pred - y_train.values)
    train_mae = np.mean(absolute_errors)

    print("✅ Final model trained!")
    print(f"   Train MAE: {train_mae:.6f}")

    return model, train_pred

def prepare_competition_submission(train_candles_path, train_news_path, output_path="submission.csv"):
    """
    End-to-end submission pipeline: load, preprocess, train, predict, save.

    Args:
        train_candles_path: CSV with candle data.
        train_news_path: CSV with news data.
        output_path: Destination CSV for the predictions.

    Returns:
        dict with ``preprocessor``, ``model``, ``predictions`` and
        ``processed_data``; ``None`` if the 'ticker' feature is missing or
        there is nothing to predict on.
    """
    separator = "=" * 60

    print("🚀 PREPARING COMPETITION SUBMISSION")
    print(separator)

    # 1. Load data
    print("📥 Loading training data...")
    train_data = pd.read_csv(train_candles_path)
    train_news = pd.read_csv(train_news_path)

    print(f"✅ Train candles: {train_data.shape}")
    print(f"✅ Train news: {train_news.shape}")
    print(f"📅 Date range: {train_data['begin'].min()} to {train_data['begin'].max()}")
    print(f"🎯 Tickers: {train_data['ticker'].unique()}")

    # 2. Preprocess
    print("\n🔄 Processing data with preprocessor...")
    preprocessor = AdvancedFinancialPreprocessor(
        target_periods=list(range(1, 21)),
        use_news=True,
        embedding_dim=64
    )
    processed_data = preprocessor.fit_transform(
        train_data,
        news_df=train_news,
        calculate_targets=True
    )

    print(f"✅ Processed data: {processed_data.shape}")

    # 3. Split into training rows and the rows to predict on
    print("\n🎯 Preparing training and prediction data...")

    feature_names = preprocessor.get_feature_names()
    target_names = preprocessor.get_returns_target_names()

    processed_data = processed_data.sort_values('begin').reset_index(drop=True)
    unique_dates = processed_data['begin'].unique()

    # The last 20 dates have incomplete forward targets and are excluded from
    # training; with 20 or fewer dates only the first one is used.
    if len(unique_dates) > 20:
        train_cutoff_date = unique_dates[-21]
    else:
        train_cutoff_date = unique_dates[0]

    train_mask = processed_data['begin'] <= train_cutoff_date
    training_rows = processed_data.loc[train_mask]

    # Prediction input: the most recent row of every ticker.
    latest_data = processed_data.groupby('ticker').last().reset_index()

    X_train = training_rows[feature_names]
    y_train = training_rows[target_names]
    X_pred = latest_data[feature_names]

    print(f"📊 Training data: {X_train.shape}")
    print(f"🎯 Targets: {y_train.shape}")
    print(f"🔮 Prediction data: {X_pred.shape}")
    print(f"📅 Training date range: {training_rows['begin'].min()} to {training_rows['begin'].max()}")
    print(f"📅 Prediction date: {latest_data['begin'].max()}")

    # 4. Train
    print("\n🎯 Training final model...")

    if 'ticker' not in X_train.columns:
        print("❌ ERROR: 'ticker' column missing from features!")
        return None

    model, train_pred = train_final_catboost_model(X_train, y_train)

    # 5. Predict
    print("\n🔮 Making predictions for the latest day...")

    if X_pred.empty:
        print("❌ No data for prediction!")
        return None

    raw_predictions = model.predict(X_pred)
    final_predictions = preprocessor.get_final_predictions_format(
        raw_predictions, latest_data['ticker'].tolist()
    )

    print(f"✅ Predictions ready for {len(final_predictions)} tickers")
    print("📊 Sample predictions:")
    print(final_predictions.head())

    # 6. Save
    print(f"\n💾 Saving submission to {output_path}...")
    final_predictions.to_csv(output_path, index=False)

    print("🎉 SUBMISSION CREATED SUCCESSFULLY!")
    print(separator)

    return {
        'preprocessor': preprocessor,
        'model': model,
        'predictions': final_predictions,
        'processed_data': processed_data
    }

def run_simple_competition_pipeline(train_candles_path, train_news_path, output_path="submission.csv"):
    """
    Simplified pipeline for a quick submission.

    Loads candles and news, preprocesses them, trains a multi-output CatBoost
    model on all rows except the trailing dates with incomplete targets,
    predicts from each ticker's latest row and writes the result to
    ``output_path``.

    Args:
        train_candles_path: CSV with candle data.
        train_news_path: CSV with news data.
        output_path: Destination CSV for the predictions.

    Returns:
        DataFrame of final predictions (one row per ticker).
    """
    print("🚀 SIMPLE COMPETITION PIPELINE")
    print("="*60)

    # Load data
    train_data = pd.read_csv(train_candles_path)
    train_news = pd.read_csv(train_news_path)

    print(f"📊 Loaded {len(train_data)} rows, {train_data['ticker'].nunique()} tickers")

    # Process data
    preprocessor = AdvancedFinancialPreprocessor(
        target_periods=list(range(1, 21)),
        use_news=True,
        embedding_dim=16
    )

    processed_data = preprocessor.fit_transform(
        train_data,
        news_df=train_news,
        calculate_targets=True
    )

    # Get latest data for each ticker
    latest_data = processed_data.sort_values('begin').groupby('ticker').last().reset_index()

    feature_names = preprocessor.get_feature_names()
    target_names = preprocessor.get_returns_target_names()

    # Prepare training data. Targets look up to `horizon` dates ahead, so the
    # last `horizon` dates cannot have complete targets and are excluded.
    # (Comparing against the single latest date removed only one day.)
    horizon = max(preprocessor.target_periods)
    ordered_dates = sorted(processed_data['begin'].unique())
    if len(ordered_dates) > horizon:
        train_cutoff = ordered_dates[-(horizon + 1)]
    else:
        # Too little history: fall back to the first date, as the full pipeline does.
        train_cutoff = ordered_dates[0]
    train_data_filtered = processed_data[processed_data['begin'] <= train_cutoff]

    X_train = train_data_filtered[feature_names]
    y_train = train_data_filtered[target_names]
    X_pred = latest_data[feature_names]

    print(f"🎯 Training on {X_train.shape[0]} samples")
    print(f"🔮 Predicting for {X_pred.shape[0]} tickers")

    # Train model
    from catboost import CatBoostRegressor

    model = CatBoostRegressor(
        iterations=300,
        learning_rate=0.02,
        depth=8,
        loss_function='MultiRMSE',
        random_seed=42,
        verbose=False
    )

    # CatBoost takes categorical columns by position.
    cat_features_indices = [i for i, col in enumerate(X_train.columns) if col == 'ticker']
    model.fit(X_train, y_train, cat_features=cat_features_indices, verbose=False)

    # Make predictions
    predictions = model.predict(X_pred)
    final_predictions = preprocessor.get_final_predictions_format(
        predictions, latest_data['ticker'].tolist()
    )

    # Save
    final_predictions.to_csv(output_path, index=False)
    print(f"✅ Submission saved to {output_path}")
    print("📊 First few predictions:")
    print(final_predictions.head())

    return final_predictions

# MAIN EXECUTION
if __name__ == "__main__":
    print("🚀 FINANCIAL FORECASTING COMPETITION PIPELINE")
    print("Version: Final Competition Ready")
    print("=" * 60)

    # Pick exactly one of the two entry points below.

    # Option 1: full pipeline (recommended). Adjust the input paths as needed.
    submission_result = prepare_competition_submission(
        train_candles_path="../forecast_data/candles_full.csv",
        train_news_path="../forecast_data/news_full.csv",
        output_path="submission.csv",
    )

    # Option 2: simple pipeline (fallback if the full one has issues)
    # submission_result = run_simple_competition_pipeline(
    #     train_candles_path="../forecast_data/candles.csv",
    #     train_news_path="../forecast_data/news.csv",
    #     output_path="submission.csv"
    # )

    # The full pipeline returns None on failure and a dict on success.
    if not submission_result:
        print("\n❌ SUBMISSION FAILED!")
    else:
        print("\n✅ SUBMISSION FILE READY: submission.csv")
        print("🎯 Ready for competition!")

🚀 FINANCIAL FORECASTING COMPETITION PIPELINE
Version: Final Competition Ready
🚀 PREPARING COMPETITION SUBMISSION
📥 Loading training data...
✅ Train candles: (25942, 7)
✅ Train news: (27455, 4)
📅 Date range: 2020-06-19 to 2025-09-08
🎯 Tickers: ['AFLT' 'ALRS' 'CHMF' 'GAZP' 'GMKN' 'LKOH' 'MAGN' 'MGNT' 'MOEX' 'MTSS'
 'NVTK' 'PHOR' 'PLZL' 'ROSN' 'RUAL' 'SBER' 'SIBN' 'T' 'VTBR']

🔄 Processing data with preprocessor...
🚀 Processing training data for returns prediction...
🔄 Loading news embedding model...
✅ News model loaded successfully!
📰 Processing news data with compressed embeddings...
🔧 Compressing embeddings from 384 to 64 dimensions...
✅ Generated compressed embeddings for 1820 news dates

📊 Processing ticker: AFLT
✅ AFLT, return_1d: Already stationary (p-value: 0.0000)
✅ AFLT, return_2d: Already stationary (p-value: 0.0000)
✅ AFLT, return_3d: Already stationary (p-value: 0.0000)
✅ AFLT, return_4d: Already stationary (p-value: 0.0000)
✅ AFLT, return_5d: Already stationary (p-value: 0.0

In [None]:
# import pandas as pd

# # Объединение свечных данных
# train_candles = pd.read_csv('../forecast_data/candles.csv')
# test_candles = pd.read_csv('../forecast_data/candles_2.csv')
# candles_full = pd.concat([train_candles, test_candles], ignore_index=True)
# candles_full.to_csv('../forecast_data/candles_full.csv', index=False)

# # Объединение новостных данных
# train_news = pd.read_csv('../forecast_data/news.csv')
# test_news = pd.read_csv('../forecast_data/news_2.csv')
# news_full = pd.concat([train_news, test_news], ignore_index=True)
# news_full.to_csv('../forecast_data/news_full.csv', index=False)

# print("Объединение завершено! Проверьте файлы: candles_full.csv и news_full.csv")

Объединение завершено! Проверьте файлы: candles_full.csv и news_full.csv
