# In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import ta
import warnings
warnings.filterwarnings('ignore')

class AdvancedBTCPredictor:
    def __init__(self, symbol="BTC-USD", period="365d"):
        """Create an untrained predictor for one ticker.

        Args:
            symbol: Ticker handed to ``yf.download`` by ``download_data``.
            period: History window handed to ``yf.download`` (e.g. ``"730d"``).
        """
        self.symbol = symbol
        self.period = period
        self.scaler = StandardScaler()
        self.feature_selector = None
        self.models = {}
        self.ensemble_model = None
        self.lstm_model = None
        # Filled in later (download_data sets df, run_complete_analysis sets
        # feature_cols). Declared here so every attribute the other methods
        # read exists from construction onwards.
        self.df = None
        self.feature_cols = None
        
    def download_data(self):
        """Download daily price history for ``self.symbol`` into ``self.df``.

        Returns:
            The downloaded DataFrame (also stored on ``self.df``) with rows
            containing NaN removed.

        Raises:
            ValueError: If no usable rows were downloaded.
        """
        print("📥 Downloading Bitcoin data...")
        data = yf.download(self.symbol, period=self.period, interval='1d')

        # Some yfinance releases return (field, ticker) MultiIndex columns even
        # for a single symbol. The feature code indexes plain names such as
        # df['Close'], so keep only the field level.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        data = data.dropna()
        if data.empty:
            raise ValueError(
                f"No data downloaded for {self.symbol!r} (period={self.period!r})"
            )

        self.df = data
        print(f"✅ Downloaded {len(self.df)} days of data")
        return self.df
    
    def create_advanced_features(self, df):
        """Add price, volume, indicator, lagged and calendar feature columns.

        The frame is modified in place and also returned. It must provide
        ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns and an
        index exposing ``dayofweek``/``month``/``quarter``/``is_month_end``/
        ``is_quarter_end`` (i.e. a DatetimeIndex).

        Rolling and lagged columns are NaN for their warm-up rows. Infinite
        values (e.g. from a ratio with a zero denominator) are converted to NaN
        so that a later ``dropna()`` removes those rows as well.

        Args:
            df: OHLCV DataFrame to enrich.

        Returns:
            The same DataFrame object with the feature columns added.
        """
        print("🔧 Creating advanced technical features...")

        close = df['Close']
        high = df['High']
        low = df['Low']
        volume = df['Volume']
        prev_close = close.shift(1)

        # Price-based features: simple returns over several horizons
        df['Price_Change'] = close.pct_change()
        for days in (2, 3, 7):
            df[f'Price_Change_{days}d'] = close.pct_change(periods=days)

        # Volume features
        df['Volume_Change'] = volume.pct_change()
        df['Volume_MA_5'] = volume.rolling(5).mean()
        df['Volume_MA_20'] = volume.rolling(20).mean()
        df['Volume_Ratio'] = volume / df['Volume_MA_20']

        # Moving averages and trends
        for period in [5, 10, 20, 50, 100, 200]:
            df[f'MA_{period}'] = close.rolling(period).mean()
            df[f'MA_{period}_ratio'] = close / df[f'MA_{period}']

        # Exponential moving averages
        for period in [12, 26, 50]:
            df[f'EMA_{period}'] = close.ewm(span=period).mean()
            df[f'EMA_{period}_ratio'] = close / df[f'EMA_{period}']

        # Technical indicators using TA library
        # RSI (Relative Strength Index)
        df['RSI_14'] = ta.momentum.RSIIndicator(close, window=14).rsi()
        df['RSI_30'] = ta.momentum.RSIIndicator(close, window=30).rsi()

        # MACD
        macd_indicator = ta.trend.MACD(close)
        df['MACD'] = macd_indicator.macd()
        df['MACD_signal'] = macd_indicator.macd_signal()
        df['MACD_histogram'] = macd_indicator.macd_diff()

        # Bollinger Bands
        bb_indicator = ta.volatility.BollingerBands(close, window=20, window_dev=2)
        df['BB_upper'] = bb_indicator.bollinger_hband()
        df['BB_lower'] = bb_indicator.bollinger_lband()
        df['BB_middle'] = bb_indicator.bollinger_mavg()
        band_span = df['BB_upper'] - df['BB_lower']
        df['BB_width'] = band_span / df['BB_middle']
        df['BB_position'] = (close - df['BB_lower']) / band_span

        # Stochastic Oscillator
        stoch = ta.momentum.StochasticOscillator(high, low, close)
        df['Stoch_K'] = stoch.stoch()
        df['Stoch_D'] = stoch.stoch_signal()

        # Average True Range (ATR)
        df['ATR'] = ta.volatility.AverageTrueRange(high, low, close).average_true_range()

        # Williams %R
        df['Williams_R'] = ta.momentum.WilliamsRIndicator(high, low, close).williams_r()

        # Commodity Channel Index (CCI)
        df['CCI'] = ta.trend.CCIIndicator(high, low, close).cci()

        # Money Flow Index
        df['MFI'] = ta.volume.MFIIndicator(high, low, close, volume).money_flow_index()

        # Rate of Change
        df['ROC'] = ta.momentum.ROCIndicator(close).roc()

        # Volatility features: rolling std of the 1-day return
        for window in (5, 10, 20):
            df[f'Volatility_{window}'] = df['Price_Change'].rolling(window).std()

        # High-Low features
        df['HL_ratio'] = high / low
        df['HL_pct'] = (high - low) / close

        # Gap features: open relative to the previous close, and whether the
        # day's range traded back through that previous close
        df['Gap'] = (df['Open'] - prev_close) / prev_close
        gap_up_filled = (df['Gap'] > 0) & (low <= prev_close)
        gap_down_filled = (df['Gap'] < 0) & (high >= prev_close)
        df['Gap_filled'] = np.where(gap_up_filled | gap_down_filled, 1, 0)

        # Lagged features (previous days' indicators)
        lag_features = ['RSI_14', 'MACD', 'BB_position', 'Volume_Ratio']
        for feature in lag_features:
            for lag in [1, 2, 3]:
                df[f'{feature}_lag_{lag}'] = df[feature].shift(lag)

        # Time-based features
        df['Day_of_week'] = df.index.dayofweek
        df['Month'] = df.index.month
        df['Quarter'] = df.index.quarter
        df['Is_month_end'] = df.index.is_month_end.astype(int)
        df['Is_quarter_end'] = df.index.is_quarter_end.astype(int)

        # Trend strength: absolute slope of a straight-line fit over the last
        # 20 closes. raw=True passes plain ndarrays to the callback, avoiding a
        # Series construction per window.
        df['Trend_strength'] = close.rolling(20).apply(
            lambda window: np.polyfit(np.arange(len(window)), window, 1)[0],
            raw=True,
        ).abs()

        # dropna() does not treat +/-inf as missing, and downstream scaling
        # cannot handle it, so normalise infinities to NaN here.
        df.replace([np.inf, -np.inf], np.nan, inplace=True)

        raw_columns = {'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'}
        n_features = sum(col not in raw_columns for col in df.columns)
        print(f"✅ Created {n_features} technical features")

        return df
    
    def create_target_variable(self, df, prediction_horizon=1):
        """Label each row 1 when the close ``prediction_horizon`` rows ahead is higher, else 0.

        The ``Target`` column is written onto ``df`` in place and ``df`` is
        returned. Rows whose future close is missing (the final
        ``prediction_horizon`` rows) compare as False and therefore get 0.
        """
        future_close = df['Close'].shift(-prediction_horizon)
        closes_higher = future_close > df['Close']
        df['Target'] = np.where(closes_higher, 1, 0)
        return df
    
    def prepare_data(self, prediction_horizon=1):
        """Complete data preparation pipeline"""
        print("🎯 Preparing data for training...")
        
        # Download and prepare data
        self.download_data()
        
        # Create features
        self.df = self.create_advanced_features(self.df)
        
        # Create target
        self.df = self.create_target_variable(self.df, prediction_horizon)
        
        # Remove NaN values
        self.df = self.df.dropna()
        
        # Feature columns (exclude OHLCV and target)
        feature_cols = [col for col in self.df.columns if col not in 
                       ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Target']]
        
        X = self.df[feature_cols].values
        y = self.df['Target'].values
        
        print(f"✅ Prepared {len(feature_cols)} features and {len(X)} samples")
        print(f"📊 Target distribution: {np.bincount(y)}")
        
        return X, y, feature_cols
    
    def feature_selection(self, X, y, k=50):
        """Fit a ``SelectKBest(f_classif)`` selector and return the reduced matrix.

        The fitted selector is stored on ``self.feature_selector``.

        Args:
            X: 2-D feature matrix.
            y: Target vector aligned with the rows of ``X``.
            k: Number of features to keep. An integer larger than the number
                of columns in ``X`` is clamped to that number; other values
                (such as ``'all'``) are passed through unchanged.

        Returns:
            ``X`` restricted to the selected columns.
        """
        if isinstance(k, int):
            # Asking for more features than exist would make SelectKBest fail.
            k = min(k, np.shape(X)[1])

        print(f"🎯 Selecting top {k} features...")

        self.feature_selector = SelectKBest(f_classif, k=k)
        X_selected = self.feature_selector.fit_transform(X, y)

        print(f"✅ Selected {X_selected.shape[1]} best features")
        return X_selected
    
    def create_lstm_features(self, data, sequence_length=60):
        """Stack overlapping windows of ``sequence_length`` consecutive rows.

        Window ``j`` holds rows ``j .. j + sequence_length - 1``. One window is
        produced for every start ``j`` in ``range(len(data) - sequence_length)``,
        so the window ending on the very last row is not included. When there
        are not enough rows, an empty array is returned.
        """
        window_count = len(data) - sequence_length
        windows = [
            data[start:start + sequence_length]
            for start in range(window_count)
        ]
        return np.array(windows)
    
    def build_lstm_model(self, input_shape):
        """Assemble and compile the three-layer LSTM binary classifier.

        Args:
            input_shape: Shape passed as ``input_shape`` to the first LSTM layer.

        Returns:
            A compiled ``Sequential`` model with a single sigmoid output,
            trained with binary cross-entropy and reporting accuracy.
        """
        network = Sequential()

        # Recurrent stack: 128 -> 64 -> 32 units; only the last layer
        # collapses the sequence dimension.
        network.add(LSTM(128, return_sequences=True, input_shape=input_shape))
        network.add(Dropout(0.2))
        network.add(BatchNormalization())

        network.add(LSTM(64, return_sequences=True))
        network.add(Dropout(0.2))
        network.add(BatchNormalization())

        network.add(LSTM(32, return_sequences=False))
        network.add(Dropout(0.2))

        # Dense head ending in a sigmoid output
        network.add(Dense(25, activation='relu'))
        network.add(Dropout(0.1))
        network.add(Dense(1, activation='sigmoid'))

        network.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )

        return network
    
    def train_models(self, X, y):
        """Fit the scaler, tune each classifier, build the ensemble and train the LSTM.

        Side effects: fits ``self.scaler`` on ``X``; stores the best estimator
        of each grid search in ``self.models``; stores a soft-voting ensemble
        of those estimators in ``self.ensemble_model``; stores the trained
        network in ``self.lstm_model`` when there are enough rows to form
        LSTM training windows (otherwise it is left untouched).

        Args:
            X: 2-D training feature matrix in chronological row order.
            y: 0/1 target vector aligned with ``X``.
        """
        print("🚀 Training advanced machine learning models...")

        # Time series split for validation (folds never train on later rows)
        tscv = TimeSeriesSplit(n_splits=5)

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # XGBoost and LightGBM share the same starting point and search grid.
        boosting_defaults = {
            'n_estimators': 300,
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
        }
        boosting_grid = {
            'n_estimators': [200, 300, 400],
            'max_depth': [4, 6, 8],
            'learning_rate': [0.05, 0.1, 0.15],
        }

        # Model configurations
        models_config = {
            'XGBoost': {
                'model': xgb.XGBClassifier(**boosting_defaults),
                'params': boosting_grid,
            },
            'LightGBM': {
                'model': lgb.LGBMClassifier(**boosting_defaults),
                'params': boosting_grid,
            },
            'RandomForest': {
                'model': RandomForestClassifier(
                    n_estimators=200,
                    max_depth=10,
                    random_state=42
                ),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [8, 10, 12],
                    'min_samples_split': [2, 5, 10]
                }
            },
            'SVM': {
                'model': SVC(
                    kernel='rbf',
                    probability=True,  # needed for predict_proba / soft voting
                    random_state=42
                ),
                'params': {
                    'C': [0.1, 1, 10],
                    'gamma': ['scale', 'auto', 0.001, 0.01]
                }
            },
            'LogisticRegression': {
                'model': LogisticRegression(random_state=42),
                'params': {
                    'C': [0.01, 0.1, 1, 10],
                    'penalty': ['l1', 'l2'],
                    'solver': ['liblinear', 'saga']
                }
            }
        }

        # Train and optimize each model
        for name, config in models_config.items():
            print(f"🔧 Training {name}...")

            # Grid search for hyperparameter optimization
            grid_search = GridSearchCV(
                config['model'],
                config['params'],
                cv=tscv,
                scoring='accuracy',
                n_jobs=-1,
                verbose=0
            )

            grid_search.fit(X_scaled, y)
            self.models[name] = grid_search.best_estimator_

            print(f"✅ {name} best accuracy: {grid_search.best_score_:.4f}")

        # Create ensemble model (soft voting averages predicted probabilities)
        self.ensemble_model = VotingClassifier(
            estimators=list(self.models.items()),
            voting='soft'
        )
        self.ensemble_model.fit(X_scaled, y)

        # Train LSTM model
        print("🧠 Training LSTM model...")
        sequence_length = 60

        # NOTE(review): create_lstm_features builds windows ending at row i-1
        # and they are paired with y[i] here, whereas predict_tomorrow feeds a
        # window ending at the latest row. Confirm the intended alignment.
        n_sequences = max(len(X_scaled) - sequence_length, 0)
        split_idx = int(0.8 * n_sequences)

        # split_idx >= 1 guarantees a non-empty training part; the validation
        # part is then non-empty too because int(0.8 * n) < n for n >= 1.
        if split_idx >= 1:
            X_lstm = self.create_lstm_features(X_scaled, sequence_length)
            y_lstm = y[sequence_length:]

            # Chronological split for LSTM
            X_lstm_train, X_lstm_val = X_lstm[:split_idx], X_lstm[split_idx:]
            y_lstm_train, y_lstm_val = y_lstm[:split_idx], y_lstm[split_idx:]

            # Build and train LSTM
            self.lstm_model = self.build_lstm_model((sequence_length, X_scaled.shape[1]))

            callbacks = [
                EarlyStopping(patience=10, restore_best_weights=True),
                ReduceLROnPlateau(factor=0.5, patience=5)
            ]

            self.lstm_model.fit(
                X_lstm_train, y_lstm_train,
                validation_data=(X_lstm_val, y_lstm_val),
                epochs=100,
                batch_size=32,
                callbacks=callbacks,
                verbose=0
            )

            print("✅ LSTM trained successfully")
        else:
            print(f"⚠️ LSTM skipped: not enough samples for {sequence_length}-step sequences")

        print("🎉 All models trained successfully!")
    
    def evaluate_models(self, X_test, y_test):
        """Score every trained model on held-out data and report the best one.

        Args:
            X_test: 2-D feature matrix with the same columns the scaler was
                fit on, in chronological row order.
            y_test: 0/1 target vector aligned with ``X_test``.

        Returns:
            Tuple ``(results, best_model)``. ``results`` maps a model name to a
            dict with ``accuracy``, ``auc``, ``predictions`` and
            ``probabilities``; ``best_model`` is the name with the highest
            accuracy. ``auc`` is NaN when the evaluated labels contain a
            single class.
        """
        print("\n📊 Model Evaluation Results:")
        print("=" * 50)

        X_test_scaled = self.scaler.transform(X_test)
        sequence_length = 60  # must match the window length used in train_models

        results = {}

        def _record(name, y_true, y_pred, y_proba):
            """Compute metrics for one model, store them and print a row."""
            accuracy = accuracy_score(y_true, y_pred)
            # ROC AUC is undefined when only one class is present.
            if len(np.unique(y_true)) > 1:
                auc_score = roc_auc_score(y_true, y_proba)
            else:
                auc_score = float('nan')

            results[name] = {
                'accuracy': accuracy,
                'auc': auc_score,
                'predictions': y_pred,
                'probabilities': y_proba
            }
            print(f"{name:15} | Accuracy: {accuracy:.4f} | AUC: {auc_score:.4f}")

        # Evaluate individual models
        for name, model in self.models.items():
            _record(
                name,
                y_test,
                model.predict(X_test_scaled),
                model.predict_proba(X_test_scaled)[:, 1],
            )

        # Evaluate ensemble
        _record(
            'Ensemble',
            y_test,
            self.ensemble_model.predict(X_test_scaled),
            self.ensemble_model.predict_proba(X_test_scaled)[:, 1],
        )

        # Evaluate LSTM if available
        if self.lstm_model is not None and len(X_test_scaled) > sequence_length:
            X_test_lstm = self.create_lstm_features(X_test_scaled, sequence_length)
            y_test_lstm = y_test[sequence_length:]

            # The network outputs one probability per window in a column;
            # flatten it so labels and scores are both 1-D.
            lstm_proba = self.lstm_model.predict(X_test_lstm).flatten()
            lstm_pred = (lstm_proba > 0.5).astype(int)

            _record('LSTM', y_test_lstm, lstm_pred, lstm_proba)

        # Find best model
        best_model = max(results, key=lambda name: results[name]['accuracy'])
        print(f"\n🏆 Best Model: {best_model} (Accuracy: {results[best_model]['accuracy']:.4f})")

        return results, best_model
    
    def predict_tomorrow(self):
        """Predict tomorrow's price direction"""
        print("\n🔮 Predicting Tomorrow's Price Direction...")
        
        # Get latest features
        latest_features = self.df.iloc[-1][self.feature_cols].values.reshape(1, -1)
        
        if self.feature_selector:
            latest_features = self.feature_selector.transform(latest_features)
        
        latest_features_scaled = self.scaler.transform(latest_features)
        
        # Get predictions from all models
        predictions = {}
        
        for name, model in self.models.items():
            pred_proba = model.predict_proba(latest_features_scaled)[0][1]
            pred = int(pred_proba > 0.5)
            predictions[name] = {'prediction': pred, 'probability': pred_proba}
        
        # Ensemble prediction
        ensemble_pred_proba = self.ensemble_model.predict_proba(latest_features_scaled)[0][1]
        ensemble_pred = int(ensemble_pred_proba > 0.5)
        predictions['Ensemble'] = {'prediction': ensemble_pred, 'probability': ensemble_pred_proba}
        
        # LSTM prediction if available
        if self.lstm_model and len(self.df) > 60:
            recent_data = self.scaler.transform(
                self.df[self.feature_cols].iloc[-60:].values
            )
            recent_data = recent_data.reshape(1, 60, -1)
            
            lstm_pred_proba = self.lstm_model.predict(recent_data)[0][0]
            lstm_pred = int(lstm_pred_proba > 0.5)
            predictions['LSTM'] = {'prediction': lstm_pred, 'probability': lstm_pred_proba}
        
        # Display results
        print("\n📈 Tomorrow's Predictions:")
        print("=" * 40)
        for name, result in predictions.items():
            direction = "📈 UP" if result['prediction'] == 1 else "📉 DOWN"
            confidence = result['probability'] if result['prediction'] == 1 else (1 - result['probability'])
            print(f"{name:15} | {direction} | Confidence: {confidence:.2%}")
        
        return predictions
    
    def run_complete_analysis(self):
        """Run the complete analysis pipeline"""
        print("🚀 Starting Advanced Bitcoin Price Prediction Analysis")
        print("=" * 60)
        
        # Prepare data
        X, y, feature_cols = self.prepare_data()
        self.feature_cols = feature_cols
        
        # Feature selection
        X_selected = self.feature_selection(X, y, k=min(50, len(feature_cols)))
        
        # Train-test split (time-aware)
        split_idx = int(0.8 * len(X_selected))
        X_train, X_test = X_selected[:split_idx], X_selected[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        
        print(f"📊 Training samples: {len(X_train)}, Testing samples: {len(X_test)}")
        
        # Train models
        self.train_models(X_train, y_train)
        
        # Evaluate models
        results, best_model = self.evaluate_models(X_test, y_test)
        
        # Make tomorrow's prediction
        tomorrow_predictions = self.predict_tomorrow()
        
        return results, best_model, tomorrow_predictions

# Usage example: train on two years of daily data and print a recommendation
if __name__ == "__main__":
    predictor = AdvancedBTCPredictor(symbol="BTC-USD", period="730d")

    # Full pipeline: prepare, train, evaluate, predict
    results, best_model, tomorrow_predictions = predictor.run_complete_analysis()

    print("\n🎯 Summary:")
    print(f"Best performing model: {best_model}")
    print(f"Best accuracy achieved: {results[best_model]['accuracy']:.4f}")

    # Latest close in the prepared frame
    current_price = predictor.df['Close'].iloc[-1]
    print(f"\n💰 Current BTC Price: ${current_price:,.2f}")

    # Turn the ensemble's vote into a recommendation; confidence is the
    # probability of whichever direction was predicted.
    ensemble_prediction = tomorrow_predictions['Ensemble']
    if ensemble_prediction['prediction'] == 1:
        direction = "BUY 📈"
        confidence = ensemble_prediction['probability']
    else:
        direction = "SELL 📉"
        confidence = 1 - ensemble_prediction['probability']

    print(f"🎯 Recommendation: {direction} (Confidence: {confidence:.2%})")