# Dataset Generation for Candlestick Pattern Classification

This notebook is designed to generate a dataset for training and evaluating models aimed at candlestick pattern classification. The pipeline is as follows:

1. **Data Loading**:
   - Generate hisotrical time series data resembling financial markets.
   -  Ability to balance the dataset via buy/sell ratios and amount of no trade to store for dataset 
2. **Pattern Extraction**:
   - Apply rule-based detection for classic candlestick patterns.Similar to rule bease 
3. **Feature Engineering**:
   - Extract and preprocess features such as OHLC values and technical indicators.
   - feature validation from threshold in the window 
4. **Dataset Preparation**:
   - Label samples and format the dataset for machine learning tasks.
   - Export the dataset in formats suitable for training and evaluation.

Th Images stored are used to train the dataset 




In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import mplfinance as mpf
import shutil
import logging
import random


In [6]:

class CandlestickImageBuilder:
    def __init__(self, df, tickers, charts_dir, buy_sell_ratio=1, no_trade_fraction=1):
        """ iniitialise the class with the required parameters"""
        self.df = df
        self.tickers = tickers
        self.charts_dir = charts_dir
        self.buy_sell_ratio = buy_sell_ratio #used to balance the dataset
        self.no_trade_fraction = no_trade_fraction # used to balance the dataset

        # defensive
        self.clear_directory(charts_dir)

        # Set up logging
        logging.basicConfig(level=logging.INFO,
                            filename="candlestick_image_generation.log", 
                            filemode="w",
                            format="%(asctime)s - %(levelname)s - %(message)s")
        logging.info("CandlestickImageBuilder initialized.")

    def clear_directory(self, directory):
        """Clear the specified directory."""
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory, exist_ok=True)
        logging.info(f"Initialized directory: {directory}")

    def extract_ticker_data(self, ticker):
        """Extract OHLC data for a specific ticker."""
        columns = [col for col in self.df.columns if col.endswith(f"_{ticker}")]
        if len(columns) != 4:
            logging.error(f"Ticker {ticker} does not have the required columns. Found: {columns}")
            raise ValueError(f"Ticker {ticker} does not have the required OHLC columns.")

        ticker_data = self.df[columns].copy()
        ticker_data.columns = ['Open', 'High', 'Low', 'Close']
        ticker_data.dropna(inplace=True)

        return ticker_data

    def detect_pattern(self, data, threshold=0.05, tolerance=0.01):
        """
        Detect Buy, Sell, or No Trade patterns based on candlestick patterns and price changes.
        """
        if len(data) < 50:  # Ensure the window is large enough
            return None

        # Scan through the 50-candle window for Morning Star and Evening Star patterns
        for i in range(len(data) - 7):  # Leave room for price check
            first, second, third = data.iloc[i], data.iloc[i + 1], data.iloc[i + 2]

            # Morning Star Detection (Buy)
            if (
                first['Close'] < first['Open'] and
                abs(second['Close'] - second['Open']) < 0.3 * (second['High'] - second['Low']) and
                third['Close'] > third['Open'] and
                third['Close'] > (first['Close'] + first['Open']) / 2
            ):
                # Check price change 5 candles after the detected pattern
                if i + 7 < len(data):
                    future_price = data['Close'].iloc[i + 7]
                    current_price = third['Close']
                    price_change = (future_price - current_price) / current_price
                    if price_change > threshold:
                        return "Buy"
                    elif price_change < -threshold:
                        return "Sell"

            # Evening Star Detection (Sell)
            if (
                first['Close'] > first['Open'] and
                abs(second['Close'] - second['Open']) < 0.3 * (second['High'] - second['Low']) and
                third['Close'] < third['Open'] and
                third['Close'] < (first['Close'] + first['Open']) / 2
            ):
                # Check price change 5 candles after the detected pattern
                if i + 7 < len(data):
                    future_price = data['Close'].iloc[i + 7]
                    current_price = third['Close']
                    price_change = (future_price - current_price) / current_price
                    if price_change > threshold:
                        return "Buy"
                    elif price_change < -threshold:
                        return "Sell"

        # Threshold-based No Trade
        start_price = data['Close'].iloc[0]
        end_price = data['Close'].iloc[-1]
        price_change = (end_price - start_price) / start_price
        if abs(price_change) <= tolerance:
            return "No_Trade"

        return None

    def generate_images(self, ticker=None, window=50, threshold=0.05, tolerance=0.01):
        """Generate labeled candlestick charts for CNN training."""
        buy_count, sell_count, no_trade_count = 0, 0, 0

        tickers_to_process = [ticker] if ticker else self.tickers

        for ticker in tickers_to_process:
            try:
                ticker_data = self.extract_ticker_data(ticker)
            except ValueError as e:
                logging.error(e)
                continue

            for i in range(window, len(ticker_data)):
                subset = ticker_data.iloc[i - window:i]
                label = self.detect_pattern(subset, threshold=threshold, tolerance=tolerance)

                # Skip No_Trade examples based on the fraction limit
                if label == "No_Trade":
                    if random.random() > self.no_trade_fraction:
                        continue
                    no_trade_count += 1

                elif label == "Buy":
                    buy_count += 1

                elif label == "Sell":
                    sell_count += 1

                # Skip saving if the dataset is already balanced
                if buy_count > sell_count * self.buy_sell_ratio and label == "Buy":
                    continue
                if sell_count > buy_count * self.buy_sell_ratio and label == "Sell":
                    continue

                # Save the chart
                if label:
                    label_dir = os.path.join(self.charts_dir, label)
                    os.makedirs(label_dir, exist_ok=True)
                    chart_path = os.path.join(label_dir, f"{ticker}_{i}.png")

                    try:
                        fig, ax = mpf.plot(
                            subset,
                            type='candle',
                            style='yahoo',
                            returnfig=True,
                            volume=False,
                            datetime_format='%Y-%m-%d',
                            figsize=(6, 4)
                        )
                        fig.savefig(chart_path)
                        plt.close(fig)
                    except Exception as e:
                        logging.error(f"Failed to generate chart for {ticker} at {subset.index[-1]}: {e}")

        # Log dataset balance
        logging.info(f"Ticker {ticker}: Buy={buy_count}, Sell={sell_count}, No_Trade={no_trade_count}")
        return buy_count, sell_count, no_trade_count



In [7]:
# datt and output directory
data_file = "indices_etfs_dataset.csv"
charts_dir = "charts/candlestick3"

# Load dataset
df = pd.read_csv(data_file, low_memory=False)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.dropna(subset=['Date'], inplace=True)  # Remove rows with invalid dates
df.set_index('Date', inplace=True)
tickers = list({col.split("_")[-1] for col in df.columns if "_" in col})

builder = CandlestickImageBuilder(df, tickers, charts_dir)

running_totals = {"Buy": 0, "Sell": 0, "No_Trade": 0}

# Generate charts and track totals
for ticker in tickers:
    print(f"Processing ticker: {ticker}")
    try:
        buy_count, sell_count, no_trade_count = builder.generate_images(
            window=50, threshold=0.05, tolerance=0.01, ticker=ticker
        )
        running_totals["Buy"] += buy_count
        running_totals["Sell"] += sell_count
        running_totals["No_Trade"] += no_trade_count

        print(
            f"Running Totals -> Buy: {running_totals['Buy']}, "
            f"Sell: {running_totals['Sell']}, No Trade: {running_totals['No_Trade']}"
        )

    except Exception as e:
        print(f"Error processing {ticker}: {e}")
print(running_totals)

Processing ticker: SPY
Running Totals -> Buy: 157, Sell: 186, No Trade: 280
Processing ticker: ^DJI
Running Totals -> Buy: 282, Sell: 512, No Trade: 615
Processing ticker: QQQ
Running Totals -> Buy: 606, Sell: 913, No Trade: 795
Processing ticker: TLT
Running Totals -> Buy: 778, Sell: 959, No Trade: 1126
Processing ticker: IWM
Running Totals -> Buy: 1156, Sell: 1216, No Trade: 1403
Processing ticker: USO
Running Totals -> Buy: 1900, Sell: 2102, No Trade: 1465
Processing ticker: ^GSPC
Running Totals -> Buy: 2054, Sell: 2262, No Trade: 1768
Processing ticker: ^FTSE
Running Totals -> Buy: 2130, Sell: 2348, No Trade: 2333
Processing ticker: GLD
Running Totals -> Buy: 2302, Sell: 2348, No Trade: 2649
Processing ticker: ^IXIC
Running Totals -> Buy: 2660, Sell: 2641, No Trade: 2865
{'Buy': 2660, 'Sell': 2641, 'No_Trade': 2865}
