In [1]:
# =============================================================================
# PROJECT: Multi-Sector Stock Market Analysis (HTML Dashboard Suite)
# MODULE: Automated Preprocessing, Multi-Modeling & HTML Export
# AUTHOR: Gemini Thought Partner
# =============================================================================

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import joblib

# Modeling
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Reproducibility
SEED = 42
np.random.seed(SEED)

# =============================================================================
# 1. DATA PREPROCESSING & FEATURE ENGINEERING
# =============================================================================
def load_and_clean_data(file_path, ticker):
    """Load a price-history CSV and normalise it for feature engineering.

    Steps, in order:
      1. Rename the alternate headers ``Price`` -> ``Close`` and
         ``Vol.`` -> ``Volume`` when they are present.
      2. Parse ``Date`` (if present) and sort the rows by it, oldest first.
      3. For every text column, strip thousands separators (``,``) and
         percent signs (``%``) and convert the column to numeric when every
         non-missing value parses; otherwise keep the cleaned text.
      4. Forward-fill missing values.
      5. Add a constant ``Ticker`` column.

    Args:
        file_path: Path of the CSV file to read.
        ticker: Label stored in the ``Ticker`` column of every row.

    Returns:
        The cleaned DataFrame. The original row index is kept (it is not
        reset after sorting).
    """
    df = pd.read_csv(file_path)

    # rename() ignores keys that are absent, so no membership checks needed.
    df = df.rename(columns={'Price': 'Close', 'Vol.': 'Volume'})

    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values('Date')

    # Clean numeric strings
    for col in df.columns:
        column = df[col]
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue

        cleaned = (column.astype(str)
                   .str.replace(',', '', regex=False)
                   .str.replace('%', '', regex=False))
        # astype(str) can turn missing values into the literal text "nan";
        # restore them as real NaN so the forward-fill below can see them.
        cleaned = cleaned.where(column.notna())

        try:
            df[col] = pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            # Genuinely textual column: keep the cleaned strings.
            df[col] = cleaned

    df = df.ffill()
    df['Ticker'] = ticker
    return df

def feature_engineering(df, *, horizon=5, threshold=0.02):
    """Add technical indicators and a directional target to a price frame.

    Columns added to a copy of ``df`` (the input is not modified):
      * ``Daily_Return``  - percentage change of ``Close``.
      * ``Volatility_7D`` - 7-row rolling standard deviation of the return.
      * ``MA_50``         - 50-row rolling mean of ``Close``.
      * ``RSI``           - 14-row RSI built from simple rolling means of
                            gains and losses.
      * ``Future_Return`` - return of ``Close`` ``horizon`` rows ahead.
      * ``Target``        - 1 when ``Future_Return`` > ``threshold``,
                            -1 when it is < ``-threshold``, else 0.

    Args:
        df: DataFrame with a numeric ``Close`` column, in chronological order.
        horizon: Number of rows to look ahead for the target (default 5).
        threshold: Absolute future return needed for a non-neutral label
            (default 0.02, i.e. 2%).

    Returns:
        Only the rows labelled 1 or -1, with every row that contains a NaN
        in any column removed. Because of the 50-row moving average and the
        look-ahead, the first 49 rows and the last ``horizon`` rows never
        appear in the result.

    Raises:
        ValueError: If ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError("horizon must be at least 1")

    df = df.copy()
    close = df['Close']

    df['Daily_Return'] = close.pct_change()
    df['Volatility_7D'] = df['Daily_Return'].rolling(window=7).std()
    df['MA_50'] = close.rolling(window=50).mean()

    # RSI
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    # A window with no losses gives gain/loss == inf and therefore RSI 100;
    # a completely flat window gives 0/0 == NaN and is dropped on return.
    df['RSI'] = 100 - (100 / (1 + (gain / loss)))

    # Target: forward trend over `horizon` rows
    df['Future_Return'] = close.shift(-horizon) / close - 1
    df['Target'] = 0
    df.loc[df['Future_Return'] > threshold, 'Target'] = 1
    df.loc[df['Future_Return'] < -threshold, 'Target'] = -1

    # Rows without a known future return compare False above, stay at 0,
    # and are removed together with the neutral rows.
    return df[df['Target'] != 0].dropna()

# =============================================================================
# 2. HTML DASHBOARD GENERATOR (INTERACTIVE)
# =============================================================================
def save_html_dashboard(df, ticker, model_name, y_test, y_pred, acc):
    """Create and save a standalone interactive HTML dashboard.

    The figure is a 2x2 grid: close price coloured by cluster, the confusion
    matrix of the supplied predictions, a histogram of RSI, and the 7-day
    volatility over time. It is written to
    ``<ticker>_<model_name with spaces as underscores>_Dashboard.html`` in
    the current working directory.

    Args:
        df: DataFrame providing the ``Date``, ``Close``, ``Cluster``, ``RSI``
            and ``Volatility_7D`` columns.
        ticker: Ticker label used in the title and the file name.
        model_name: Model label used in the title and the file name.
        y_test: True class labels (-1 or 1).
        y_pred: Predicted class labels (-1 or 1).
        acc: Accuracy as a fraction; shown as a percentage in a subplot title.
    """
    # Create Layout (2x2)
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            "Market Regimes (Interactive Price)",
            f"Confusion Matrix (Acc: {acc:.1%})",
            "RSI Distribution",
            "Volatility Trend"
        ),
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )

    # 1. Market Regimes
    fig.add_trace(
        go.Scatter(x=df['Date'], y=df['Close'], mode='markers',
                   marker=dict(color=df['Cluster'], colorscale='Viridis', size=6),
                   name="Price Regime"),
        row=1, col=1
    )

    # 2. Confusion Matrix Heatmap
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded axis labels, even when one class is missing from both
    # y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[-1, 1])
    fig.add_trace(
        go.Heatmap(z=cm, x=['Pred -1', 'Pred 1'], y=['Actual -1', 'Actual 1'],
                   colorscale='Blues', showscale=False, text=cm, texttemplate="%{text}"),
        row=1, col=2
    )

    # 3. RSI Indicator
    fig.add_trace(
        go.Histogram(x=df['RSI'], nbinsx=30, marker_color='orchid', name="RSI Dist"),
        row=2, col=1
    )

    # 4. Volatility Trend
    fig.add_trace(
        go.Scatter(x=df['Date'], y=df['Volatility_7D'], line=dict(color='red', width=1.5), name="Volatility"),
        row=2, col=2
    )

    fig.update_layout(
        height=900, width=1200,
        title_text=f"Stock Analysis Dashboard: {ticker} | Model: {model_name}",
        showlegend=False,
        template="plotly_white"
    )

    # Save to File
    file_name = f"{ticker}_{model_name.replace(' ', '_')}_Dashboard.html"
    fig.write_html(file_name)
    print(f"    [✔] Saved HTML: {file_name}")

# =============================================================================
# 3. AUTOMATED PIPELINE EXECUTION
# =============================================================================
datasets = ["FAUJI.csv", "OGDCL.csv"]

# Number of (dashboard, model) pairs actually written; reported at the end
# instead of a hard-coded figure that is wrong whenever a file is skipped.
saved_count = 0

for file in datasets:
    if not os.path.exists(file):
        print(f"Skipping {file}: File not found in directory.")
        continue

    ticker = os.path.splitext(file)[0]
    print(f"\n{'='*50}\nRUNNING PIPELINE FOR: {ticker}\n{'='*50}")

    # Load and Cluster
    df_raw = load_and_clean_data(file, ticker)
    df_eng = feature_engineering(df_raw)

    # KMeans needs at least as many rows as clusters, and the classifiers
    # need both classes; skip the ticker rather than crash mid-run.
    n_clusters = 3
    if len(df_eng) < n_clusters or df_eng['Target'].nunique() < 2:
        print(f"Skipping {ticker}: not enough labelled rows after feature engineering.")
        continue

    scaler_clus = StandardScaler()
    df_eng['Cluster'] = KMeans(n_clusters=n_clusters, random_state=SEED).fit_predict(
        scaler_clus.fit_transform(df_eng[['Volatility_7D']])
    )

    # Supervised Learning Data Setup
    features = ['RSI', 'Volatility_7D', 'MA_50']
    X = df_eng[features]
    y = df_eng['Target']

    # Chronological split: the rows are time-ordered and the target looks
    # several rows ahead, so a shuffled split would place near-duplicate
    # neighbouring rows on both sides and inflate the test accuracy.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    if y_train.nunique() < 2:
        print(f"Skipping {ticker}: training window contains a single class.")
        continue

    # Fit the scaler on the training window only.
    scaler_sup = StandardScaler()
    X_train_s = scaler_sup.fit_transform(X_train)
    X_test_s = scaler_sup.transform(X_test)

    # The models below are fitted on scaled features, so the scaler has to
    # be saved with them for the pickled models to be reusable.
    joblib.dump(scaler_sup, f"{ticker}_Scaler.pkl")

    # Model Dictionary
    rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
    svm = SVC(kernel='rbf', probability=True, random_state=SEED)
    lr = LogisticRegression(random_state=SEED)

    models = {
        "Random Forest": rf,
        "SVM": svm,
        "Logistic Regression": lr,
        "Voting Ensemble": VotingClassifier(
            estimators=[('rf', rf), ('svm', svm), ('lr', lr)], voting='hard'
        )
    }

    # Execute and Store for each Model
    for name, model in models.items():
        print(f"  > Processing {name}...")
        model.fit(X_train_s, y_train)
        y_pred = model.predict(X_test_s)
        acc = accuracy_score(y_test, y_pred)

        # 1. Generate & Save Interactive HTML Dashboard
        save_html_dashboard(df_eng, ticker, name, y_test, y_pred, acc)

        # 2. Store the fitted model
        joblib.dump(model, f"{ticker}_{name.replace(' ', '_')}_Model.pkl")
        saved_count += 1

print(
    f"\n=== PROCESSING COMPLETE: {saved_count} HTML DASHBOARDS "
    f"AND {saved_count} MODELS SAVED ==="
)


RUNNING PIPELINE FOR: FAUJI
  > Processing Random Forest...
    [✔] Saved HTML: FAUJI_Random_Forest_Dashboard.html
  > Processing SVM...
    [✔] Saved HTML: FAUJI_SVM_Dashboard.html
  > Processing Logistic Regression...
    [✔] Saved HTML: FAUJI_Logistic_Regression_Dashboard.html
  > Processing Voting Ensemble...
    [✔] Saved HTML: FAUJI_Voting_Ensemble_Dashboard.html

RUNNING PIPELINE FOR: OGDCL
  > Processing Random Forest...
    [✔] Saved HTML: OGDCL_Random_Forest_Dashboard.html
  > Processing SVM...
    [✔] Saved HTML: OGDCL_SVM_Dashboard.html
  > Processing Logistic Regression...
    [✔] Saved HTML: OGDCL_Logistic_Regression_Dashboard.html
  > Processing Voting Ensemble...
    [✔] Saved HTML: OGDCL_Voting_Ensemble_Dashboard.html

=== PROCESSING COMPLETE: 8 HTML DASHBOARDS AND 8 MODELS SAVED ===
