In [1]:
import pandas as pd
import numpy as np

def analyze_sentiment_data(df_sentiments, save_to_csv=False, path=''):
    """
    Analyze sentiment data at minute, hourly and daily granularity, optionally saving the results to CSV.
    
    Parameters:
    df_sentiments (pd.DataFrame): DataFrame containing sentiment data with columns:
        ticker, relevance, sentiment, confidence, prob_pos, prob_ntr, prob_neg,
        reddit_topic, topicweight, source, sourceweight, author, novelty, comment_count
        post_time is assumed to be the index column.
    save_to_csv (bool): If True, save the processed dataframes to CSV files.
    path: Path to save the data frames if `save_to_csv` is `True`
    Returns:
    tuple: (ticker_sentiment_minute, market_sentiment_minute, ticker_sentiment_hour, market_sentiment_hour,
     ticker_sentiment_daily, market_sentiment_daily) 
           DataFrames with minute, hourly and daily-level aggregations
    """
    # Reset index if post_time is the index
    if 'post_time' not in df_sentiments.columns:
        df_sentiments = df_sentiments.reset_index()

    # Select relevant columns
    df_sentiments = df_sentiments.loc[:, ['post_time', 'ticker',
        'relevance', 'sentiment', 'confidence', 'prob_pos', 'prob_ntr',
        'prob_neg', 'reddit_topic', 'topicweight', 'source',
        'sourceweight', 'author', 'novelty', 'comment_count']]
    df_sentiments['post_time'] = pd.to_datetime(df_sentiments['post_time'], format="%Y-%m-%d %H:%M:%S")


    # --- Minute-Level Aggregation ---

    # Round timestamps to the nearest minute
    df_sentiments['minute'] = df_sentiments['post_time'].dt.floor('min')

    # Sentiment by ticker and minute
    ticker_sentiment_minute = df_sentiments.groupby(['minute', 'ticker']).agg(
        obs_count=('prob_pos', 'size'),
        mean_pos=('prob_pos', 'mean'),
        mean_ntr=('prob_ntr', 'mean'),
        mean_neg=('prob_neg', 'mean'),
        # Uncomment and adjust if you want weighted calculations
        # weighted_pos=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_pos'] * x) / x.sum()),
        # weighted_ntr=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_ntr'] * x) / x.sum()),
        # weighted_neg=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_neg'] * x) / x.sum())
    ).reset_index()

    # Market-wide sentiment by minute
    market_sentiment_minute = df_sentiments.groupby('minute').agg(
        market_mean_pos=('prob_pos', 'mean'),
        market_mean_ntr=('prob_ntr', 'mean'),
        market_mean_neg=('prob_neg', 'mean'),
        # Uncomment and adjust if you want weighted calculations
        # market_weighted_pos=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_pos'] * x) / x.sum()),
        # market_weighted_ntr=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_ntr'] * x) / x.sum()),
        # market_weighted_neg=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_neg'] * x) / x.sum())
    ).reset_index()

    # Drop rows where no sentiment was observed
    ticker_sentiment_minute = ticker_sentiment_minute.dropna(subset=['obs_count'])

    # Filter out tickers with insufficient observations (adjusted for minute-level data)
    minutes_threshold = 2000  # Adjust this value based on your needs
    ticker_counts_minute = ticker_sentiment_minute.groupby("ticker")["obs_count"].count()
    tickers_over_threshold_minute = ticker_counts_minute[ticker_counts_minute > minutes_threshold].index
    ticker_sentiment_minute = ticker_sentiment_minute[ticker_sentiment_minute['ticker'].isin(tickers_over_threshold_minute)]

    # --- Hourly-Level Aggregation ---

    # Round timestamps to the nearest hour
    df_sentiments['time'] = df_sentiments['post_time'].dt.floor('h')

    # Sentiment by ticker and hour
    ticker_sentiment_hour = df_sentiments.groupby(['time', 'ticker']).agg(
        obs_count=('prob_pos', 'size'),
        mean_pos=('prob_pos', 'mean'),
        mean_ntr=('prob_ntr', 'mean'),
        mean_neg=('prob_neg', 'mean'),
        # Uncomment and adjust if you want weighted calculations
        # weighted_pos=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_pos'] * x) / x.sum()),
        # weighted_ntr=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_ntr'] * x) / x.sum()),
        # weighted_neg=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_neg'] * x) / x.sum())
    ).reset_index()

    # Market-wide sentiment by hour
    market_sentiment_hour = df_sentiments.groupby('time').agg(
        market_mean_pos=('prob_pos', 'mean'),
        market_mean_ntr=('prob_ntr', 'mean'),
        market_mean_neg=('prob_neg', 'mean'),
        # Uncomment and adjust if you want weighted calculations
        # market_weighted_pos=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_pos'] * x) / x.sum()),
        # market_weighted_ntr=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_ntr'] * x) / x.sum()),
        # market_weighted_neg=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_neg'] * x) / x.sum())
    ).reset_index()

    # Drop rows where no sentiment was observed
    ticker_sentiment_hour = ticker_sentiment_hour.dropna(subset=['obs_count'])

     # Filter out tickers with insufficient observations (adjusted for hourly-level data)
    hours_threshold = 500 #Adjust this value based on your needs
    ticker_counts_hour = ticker_sentiment_hour.groupby("ticker")["obs_count"].count()
    tickers_over_threshold_hour = ticker_counts_hour[ticker_counts_hour > hours_threshold].index
    ticker_sentiment_hour = ticker_sentiment_hour[ticker_sentiment_hour['ticker'].isin(tickers_over_threshold_hour)]

    # --- Daily-Level Aggregation ---

    # Round timestamps to the nearest hour
    df_sentiments['date'] = df_sentiments['post_time'].dt.date

    # Sentiment by ticker and hour
    ticker_sentiment_daily = df_sentiments.groupby(['date', 'ticker']).agg(
        obs_count=('prob_pos', 'size'),
        mean_pos=('prob_pos', 'mean'),
        mean_ntr=('prob_ntr', 'mean'),
        mean_neg=('prob_neg', 'mean'),
        # Uncomment and adjust if you want weighted calculations
        # weighted_pos=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_pos'] * x) / x.sum()),
        # weighted_ntr=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_ntr'] * x) / x.sum()),
        # weighted_neg=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_neg'] * x) / x.sum())
    ).reset_index()

    # Market-wide sentiment by hour
    market_sentiment_daily = df_sentiments.groupby('date').agg(
        market_mean_pos=('prob_pos', 'mean'),
        market_mean_ntr=('prob_ntr', 'mean'),
        market_mean_neg=('prob_neg', 'mean'),
        # Uncomment and adjust if you want weighted calculations
        # market_weighted_pos=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_pos'] * x) / x.sum()),
        # market_weighted_ntr=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_ntr'] * x) / x.sum()),
        # market_weighted_neg=('confidence', lambda x: np.sum(df_sentiments.loc[x.index, 'prob_neg'] * x) / x.sum())
    ).reset_index()

    # Drop rows where no sentiment was observed
    ticker_sentiment_daily = ticker_sentiment_daily.dropna(subset=['obs_count'])

     # Filter out tickers with insufficient observations (adjusted for daily-level data)
    daily_threshold = 10 #Adjust this value based on your needs
    ticker_counts_daily = ticker_sentiment_daily.groupby("ticker")["obs_count"].count()
    tickers_over_threshold_daily = ticker_counts_daily[ticker_counts_daily > daily_threshold].index
    ticker_sentiment_daily = ticker_sentiment_daily[ticker_sentiment_daily['ticker'].isin(tickers_over_threshold_daily)]

    if save_to_csv:
        ticker_sentiment_minute.to_csv(f'{path}ticker_sentiment_minute.csv', index=False)
        market_sentiment_minute.to_csv(f'{path}market_sentiment_minute.csv', index=False)
        ticker_sentiment_hour.to_csv(f'{path}ticker_sentiment_hour.csv', index=False)
        market_sentiment_hour.to_csv(f'{path}market_sentiment_hour.csv', index=False)
        ticker_sentiment_daily.to_csv(f'{path}ticker_sentiment_daily.csv', index=False)
        market_sentiment_daily.to_csv(f'{path}market_sentiment_daily.csv', index=False)

    return ticker_sentiment_minute, market_sentiment_minute, ticker_sentiment_hour, market_sentiment_hour, ticker_sentiment_daily, market_sentiment_daily

In [2]:
# Load your sentiment data
df_sentiments = pd.read_csv("sentiment_raw2.csv", index_col=0)

# Define path for saved csvs
path = "cleaned_data/"

# Create the path if it doesn't exist
import os
os.makedirs(path, exist_ok = True)


# Process and save sentiment data to csv
ticker_sentiment_minute, market_sentiment_minute, ticker_sentiment_hour, market_sentiment_hour, ticker_sentiment_daily, market_sentiment_daily = analyze_sentiment_data(df_sentiments, save_to_csv=True, path = path)
print("Sentiment data processed and saved to CSVs.")

Sentiment data processed and saved to CSVs.


In [3]:
import pandas as pd
import numpy as np
import requests
import time
from datetime import datetime, timedelta
from tqdm import tqdm
import os

def fetch_and_save_price_data(symbol, start_date, end_date, interval, path='price_data/'):
    """
    Fetch historical price data from Binance and save it to a CSV file.

    Parameters:
    symbol (str): The trading pair (e.g., 'BTCUSDT').
    start_date (str): Start date in 'YYYY-MM-DD' format.
    end_date (str): End date in 'YYYY-MM-DD' format.
    interval (str): The time interval ('1m', '1h', '1d').
    path (str): Directory to save the price data
    """
    url = 'https://api.binance.com/api/v3/klines'

    # Convert dates to timestamps in milliseconds
    start_ts = int(pd.Timestamp(start_date).timestamp() * 1000)
    end_ts = int(pd.Timestamp(end_date).timestamp() * 1000)

    all_data = []
    current_ts = start_ts

    # Calculate total number of days for progress bar
    total_days = (pd.Timestamp(end_date) - pd.Timestamp(start_date)).days
    
    
    with tqdm(total=total_days, desc=f"Fetching {symbol} {interval} data") as pbar:
      while current_ts < end_ts:
        # Parameters for API call
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': current_ts,
            'limit': 1000  # Maximum allowed by Binance
        }
        
        try:
          response = requests.get(url, params=params)
          response.raise_for_status()  # Raise exception for bad status codes
          data = response.json()
          
          if not data:
              break
              
          all_data.extend(data)
          
          # Update timestamp for next iteration
          if interval == '1m':
              current_ts = data[-1][0] + 60000  # Add one minute in milliseconds
          elif interval == '1h':
              current_ts = data[-1][0] + 60 * 60 * 1000  # Add one hour in milliseconds
          elif interval == '1d':
              current_ts = data[-1][0] + 60 * 60 * 24 * 1000 #Add one day in milliseconds
          
          # Update progress bar (approximately)
          pbar.update(1)
          
          # Respect rate limits
          time.sleep(0.1)  # 10 requests per second should be safe
          
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {symbol}: {e}")
            time.sleep(5)  # Wait longer on error
            continue

    # Convert to DataFrame
    df = pd.DataFrame(all_data, columns=[
        'timestamp', 'open', 'high', 'low', 'close', 
        'volume', 'close_time', 'quote_volume', 'trades',
        'taker_buy_volume', 'taker_buy_quote_volume', 'ignore'
    ])

    # Convert timestamp to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

    if interval == '1m':
        df['time'] = df['timestamp'].dt.floor('min')
    elif interval == '1h':
        df['time'] = df['timestamp'].dt.floor('h')
    elif interval == '1d':
      df['time'] = df['timestamp'].dt.floor('d')
    
    # Convert price and volume columns to float
    for col in ['open', 'high', 'low', 'close', 'volume', 'quote_volume']:
        df[col] = df[col].astype(float)

    # Save to CSV
    os.makedirs(path, exist_ok=True)
    df.to_csv(f'{path}{symbol}_{interval}.csv', index=False)
    print(f"Price data for {symbol} at {interval} interval saved to {path}{symbol}_{interval}.csv")
    
    return df

* Now we fetch price data and save to CSV:

In [4]:
# Define date range and symbols
start_date = '2024-01-01'
end_date = '2024-08-31'
symbols = ['BTCUSDT', 'ETHUSDT']
intervals = ['1m', '1h', '1d'] # Minute, hour, daily intervals

# Fetch data and store it
for symbol in symbols:
  for interval in intervals:
    fetch_and_save_price_data(symbol, start_date, end_date, interval)

Fetching BTCUSDT 1m data: 350it [12:24,  2.13s/it]                         


Price data for BTCUSDT at 1m interval saved to price_data/BTCUSDT_1m.csv


Fetching BTCUSDT 1h data:   2%|▏         | 6/243 [00:12<08:25,  2.13s/it]


Price data for BTCUSDT at 1h interval saved to price_data/BTCUSDT_1h.csv


Fetching BTCUSDT 1d data:   0%|          | 1/243 [00:02<09:09,  2.27s/it]


Price data for BTCUSDT at 1d interval saved to price_data/BTCUSDT_1d.csv


Fetching ETHUSDT 1m data: 350it [12:15,  2.10s/it]                         


Price data for ETHUSDT at 1m interval saved to price_data/ETHUSDT_1m.csv


Fetching ETHUSDT 1h data:   2%|▏         | 6/243 [00:12<08:25,  2.13s/it]


Price data for ETHUSDT at 1h interval saved to price_data/ETHUSDT_1h.csv


Fetching ETHUSDT 1d data:   0%|          | 1/243 [00:01<07:48,  1.93s/it]


Price data for ETHUSDT at 1d interval saved to price_data/ETHUSDT_1d.csv


Fetching DOGEUSDT 1m data:  57%|█████▋    | 138/243 [28:15<21:30, 12.29s/it]


KeyboardInterrupt: 

In [5]:
# Define date range and symbols
start_date = '2024-01-01'
end_date = '2024-08-31'
symbols = ['DOGEUSDT']
intervals = ['1m', '1h', '1d'] # Minute, hour, daily intervals

# Fetch data and store it
for symbol in symbols:
  for interval in intervals:
    fetch_and_save_price_data(symbol, start_date, end_date, interval)

Fetching DOGEUSDT 1m data: 350it [13:02,  2.23s/it]                         


Price data for DOGEUSDT at 1m interval saved to price_data/DOGEUSDT_1m.csv


Fetching DOGEUSDT 1h data:   2%|▏         | 6/243 [00:12<08:29,  2.15s/it]


Price data for DOGEUSDT at 1h interval saved to price_data/DOGEUSDT_1h.csv


Fetching DOGEUSDT 1d data:   0%|          | 1/243 [00:01<07:13,  1.79s/it]

Price data for DOGEUSDT at 1d interval saved to price_data/DOGEUSDT_1d.csv





In [7]:
import pandas as pd
import numpy as np
import os


def analyze_price_sentiment_relationship(price_df, sentiment_df, symbol, interval):
    """
    Analyze the relationship between price movements and sentiment
    
    Parameters:
    - price_df: DataFrame with price data
    - sentiment_df: DataFrame with sentiment data
    - symbol: crypto symbol (e.g., 'BTC')
    - interval: '1m', '1h', '1d'

    Returns:
    - correlations: A dict where keys are lags and values are the correlation coefficients

    """

    # Merge price and sentiment data
    merged_df = pd.merge(
      price_df,
      sentiment_df,
      on='time',
      how='inner'
    )
    
    # Calculate price returns
    merged_df['ret'] = merged_df['close'].pct_change()
    
    # Calculate correlations at different lags
    correlations = {}
    if interval == '1m':
      for lag in range(-5, 6):  # -5 to +5 minutes
            if lag < 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(abs(lag)))
                correlations[f'sentiment_leads_{abs(lag)}m'] = corr
            elif lag > 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(-lag))
                correlations[f'price_leads_{lag}m'] = corr
            else:
                corr = merged_df['mean_pos'].corr(merged_df['ret'])
                correlations['concurrent'] = corr
    elif interval == '1h':
      for lag in range(-2, 3):  # -2 to +2 hours
            if lag < 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(abs(lag)))
                correlations[f'sentiment_leads_{abs(lag)}h'] = corr
            elif lag > 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(-lag))
                correlations[f'price_leads_{lag}h'] = corr
            else:
                corr = merged_df['mean_pos'].corr(merged_df['ret'])
                correlations['concurrent'] = corr
    elif interval == '1d':
        corr = merged_df['mean_pos'].corr(merged_df['ret'])
        correlations['concurrent'] = corr

    return correlations



# Define path for the price and sentiment data
price_data_path = "price_data/"
sentiment_data_path = "cleaned_data/"

# Define tokens and intervals
symbols = ['BTCUSDT', 'ETHUSDT', 'DOGEUSDT']
intervals = ['1m', '1h', '1d'] # Minute, hour, daily intervals

# Analysis function per token
for symbol in symbols:
    print(f"\nAnalyzing {symbol}...")
    
    for price_interval in intervals:
        print(f"  Price Interval: {price_interval}")
        
        # Load price data
        price_df = pd.read_csv(f'{price_data_path}{symbol}_{price_interval}.csv')
        price_df['time'] = pd.to_datetime(price_df['time'])

        
        for sentiment_interval in intervals:
            print(f"    Sentiment Interval: {sentiment_interval}")
            
            # Load sentiment data
            if sentiment_interval == '1m':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['minute'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1h':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_hour.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['time'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1d':
                sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_daily.csv')
                sentiment_df['time'] = pd.to_datetime(sentiment_df['date']).dt.date
                sentiment_df['time'] = pd.to_datetime(sentiment_df['time'])
                sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]

            # Analyze relationships
            correlations = analyze_price_sentiment_relationship(price_df, sentiment_df, symbol.replace('USDT', ''), price_interval)
            
            # Print results
            print(f"       Correlations for {symbol} using {price_interval} price data and {sentiment_interval} sentiment data:")
            for k, v in correlations.items():
                print(f"          {k}: {v:.4f}")


Analyzing BTCUSDT...
  Price Interval: 1m
    Sentiment Interval: 1m
       Correlations for BTCUSDT using 1m price data and 1m sentiment data:
          sentiment_leads_5m: 0.0028
          sentiment_leads_4m: 0.0046
          sentiment_leads_3m: -0.0010
          sentiment_leads_2m: 0.0024
          sentiment_leads_1m: 0.0062
          concurrent: 0.0009
          price_leads_1m: -0.0014
          price_leads_2m: 0.0015
          price_leads_3m: 0.0022
          price_leads_4m: 0.0007
          price_leads_5m: 0.0021
    Sentiment Interval: 1h
       Correlations for BTCUSDT using 1m price data and 1h sentiment data:
          sentiment_leads_5m: 0.0015
          sentiment_leads_4m: 0.0298
          sentiment_leads_3m: 0.0352
          sentiment_leads_2m: 0.0315
          sentiment_leads_1m: 0.0389
          concurrent: 0.0341
          price_leads_1m: 0.0465
          price_leads_2m: 0.0059
          price_leads_3m: -0.0027
          price_leads_4m: -0.0169
          price_leads_5m

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


       Correlations for DOGEUSDT using 1d price data and 1m sentiment data:
          concurrent: nan
    Sentiment Interval: 1h
       Correlations for DOGEUSDT using 1d price data and 1h sentiment data:
          concurrent: -0.0729
    Sentiment Interval: 1d
       Correlations for DOGEUSDT using 1d price data and 1d sentiment data:
          concurrent: -0.0616


In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os


def analyze_price_sentiment_relationship(price_df, sentiment_df, symbol, interval):
    """
    Analyze the relationship between price movements and sentiment
    
    Parameters:
    - price_df: DataFrame with price data
    - sentiment_df: DataFrame with sentiment data
    - symbol: crypto symbol (e.g., 'BTC')
    - interval: '1m', '1h', '1d'

    Returns:
    - correlations: A dict where keys are lags and values are the correlation coefficients

    """

    # Merge price and sentiment data
    merged_df = pd.merge(
      price_df,
      sentiment_df,
      on='time',
      how='inner'
    )
    
    # Calculate price returns
    merged_df['ret'] = merged_df['close'].pct_change()
    
    # Calculate correlations at different lags
    correlations = {}
    if interval == '1m':
      for lag in range(-5, 6):  # -5 to +5 minutes
            if lag < 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(abs(lag)))
                correlations[f'sentiment_leads_{abs(lag)}m'] = corr
            elif lag > 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(-lag))
                correlations[f'price_leads_{lag}m'] = corr
            else:
                corr = merged_df['mean_pos'].corr(merged_df['ret'])
                correlations['concurrent'] = corr
    elif interval == '1h':
      for lag in range(-2, 3):  # -2 to +2 hours
            if lag < 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(abs(lag)))
                correlations[f'sentiment_leads_{abs(lag)}h'] = corr
            elif lag > 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(-lag))
                correlations[f'price_leads_{lag}h'] = corr
            else:
                corr = merged_df['mean_pos'].corr(merged_df['ret'])
                correlations['concurrent'] = corr
    elif interval == '1d':
        corr = merged_df['mean_pos'].corr(merged_df['ret'])
        correlations['concurrent'] = corr

    return correlations


def create_correlation_heatmap(symbol, price_intervals, sentiment_intervals, price_data_path, sentiment_data_path, save_path='visuals/'):
    """
    Generates and saves correlation heatmaps for different time intervals.

    Parameters:
        symbol (str): Cryptocurrency symbol ('BTCUSDT', 'ETHUSDT', 'DOGEUSDT').
        price_intervals (list): List of price intervals ('1m', '1h', '1d').
        sentiment_intervals (list): List of sentiment intervals ('1m', '1h', '1d').
        price_data_path (str): Path to the directory with price data CSVs.
        sentiment_data_path (str): Path to the directory with sentiment data CSVs.
    """
    
    all_correlations = {}

    for price_interval in price_intervals:
        all_correlations[price_interval] = {}
        # Load price data
        price_df = pd.read_csv(f'{price_data_path}{symbol}_{price_interval}.csv')
        price_df['time'] = pd.to_datetime(price_df['time'])


        for sentiment_interval in sentiment_intervals:
            # Load sentiment data
            if sentiment_interval == '1m':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['minute'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1h':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_hour.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['time'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1d':
                sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_daily.csv')
                sentiment_df['time'] = pd.to_datetime(sentiment_df['date'])
                sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
                

            # Analyze relationships
            correlations = analyze_price_sentiment_relationship(price_df, sentiment_df, symbol.replace('USDT', ''), price_interval)
            
            all_correlations[price_interval][sentiment_interval] = correlations

    # Convert to DataFrame for Heatmap
    data = []
    for price_interval, sent_intervals in all_correlations.items():
      for sent_interval, corrs in sent_intervals.items():
        for key, value in corrs.items():
          data.append({'Price Interval':price_interval,
                       'Sentiment Interval':sent_interval,
                       'Lag':key,
                       'Correlation':value})

    df_heatmap = pd.DataFrame(data)

    # Pivot the dataframe to create the heatmap
    df_heatmap = df_heatmap.pivot_table(index = ['Price Interval', 'Lag'], columns='Sentiment Interval', values='Correlation')

    # Create the heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(df_heatmap, annot=True, fmt=".2f", cmap="coolwarm")
    plt.title(f"Correlation Heatmap for {symbol}")
    plt.tight_layout()

    # Save heatmap
    os.makedirs(save_path, exist_ok=True)
    plt.savefig(f'{save_path}{symbol}_correlation_heatmap.png')
    plt.close()  # Close the figure to free up memory

    print(f"Heatmap for {symbol} saved to {save_path}{symbol}_correlation_heatmap.png")

def create_top_correlations_table(symbol, price_data_path, sentiment_data_path, save_path='visuals/'):
    """
    Generates a table of the top 5 positive and negative correlations between price and sentiment
    at the daily level, and saves it to a png file.

    Parameters:
        symbol (str): Cryptocurrency symbol ('BTCUSDT', 'ETHUSDT', 'DOGEUSDT').
        price_data_path (str): Path to the directory with price data CSVs.
        sentiment_data_path (str): Path to the directory with sentiment data CSVs.
        save_path (str): Path to save the output.
    """
    # Load data
    price_interval = '1d'
    price_df = pd.read_csv(f'{price_data_path}{symbol}_{price_interval}.csv')
    price_df['time'] = pd.to_datetime(price_df['time'])
    all_correlations = {}
    for sentiment_interval in ['1m', '1h', '1d']:
            # Load sentiment data
            if sentiment_interval == '1m':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['minute'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1h':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_hour.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['time'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1d':
                sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_daily.csv')
                sentiment_df['time'] = pd.to_datetime(sentiment_df['date'])
                sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
                
            # Analyze relationships
            correlations = analyze_price_sentiment_relationship(price_df, sentiment_df, symbol.replace('USDT', ''), price_interval)
            
            all_correlations[sentiment_interval] = correlations

    # Convert to DataFrame for Heatmap
    data = []
    for sent_interval, corrs in all_correlations.items():
      for key, value in corrs.items():
        data.append({'Sentiment Interval':sent_interval,
                     'Lag':key,
                     'Correlation':value})

    df_correlations = pd.DataFrame(data)
    df_correlations.sort_values('Correlation', ascending = False, inplace = True)

    # Select only top and bottom 5 correlations
    df_top_corr = df_correlations.head(5)
    df_bottom_corr = df_correlations.tail(5)
    df_final = pd.concat([df_top_corr, df_bottom_corr])

    # Create table
    plt.figure(figsize = (10,5))
    plt.axis('off')
    plt.axis('tight')
    table = plt.table(cellText = df_final.values, colLabels=df_final.columns, loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2,1.2)
    plt.title(f'Top Correlations for {symbol}')
    plt.tight_layout()
    os.makedirs(save_path, exist_ok=True)
    plt.savefig(f"{save_path}top_correlations_{symbol}.png")
    plt.close()  # Close the figure to free up memory

    print(f"Top correlation table for {symbol} saved to {save_path}top_correlations_{symbol}.png")

def create_intraday_sentiment_chart(sentiment_data_path, save_path = 'visuals/'):
  """
    Generates and saves a chart of intraday sentiment patterns.

    Parameters:
        sentiment_data_path (str): Path to the directory with sentiment data CSVs.
    """
  # Load sentiment data and calculate intraday patterns
  sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
  sentiment_df['minute'] = pd.to_datetime(sentiment_df['minute'])
  sentiment_df['hour'] = sentiment_df['minute'].dt.hour

  hourly_stats = sentiment_df.groupby('hour').agg({
      'mean_pos': ['mean', 'std'],
      'obs_count': 'mean'
  }).round(3)

  # Create the chart
  plt.figure(figsize=(12, 6))
  plt.plot(hourly_stats.index, hourly_stats['mean_pos']['mean'], label = 'Mean Pos Sentiment')
  plt.fill_between(hourly_stats.index,
                   hourly_stats['mean_pos']['mean'] + hourly_stats['mean_pos']['std'],
                   hourly_stats['mean_pos']['mean'] - hourly_stats['mean_pos']['std'],
                   alpha = 0.3)
  plt.xlabel("Hour of Day")
  plt.ylabel("Mean Positive Sentiment")
  plt.title("Intraday Sentiment Patterns")
  plt.legend()
  plt.grid(True)
  plt.xticks(range(0,24))
  plt.tight_layout()
    # Save heatmap
  os.makedirs(save_path, exist_ok=True)
  plt.savefig(f'{save_path}intraday_sentiment_patterns.png')
  plt.close()
  print(f"Intraday sentiment chart saved to {save_path}intraday_sentiment_patterns.png")
    
# Define paths
price_data_path = "price_data/"
sentiment_data_path = "cleaned_data/"
save_path = "visuals/"
# Create the save path if it doesn't exist
os.makedirs(save_path, exist_ok = True)

# Define symbols and intervals
symbols = ['BTCUSDT', 'ETHUSDT', 'DOGEUSDT']
price_intervals = ['1m', '1h', '1d']
sentiment_intervals = ['1m', '1h', '1d']

# Generate heatmaps
for symbol in symbols:
  create_correlation_heatmap(symbol, price_intervals, sentiment_intervals, price_data_path, sentiment_data_path)
# Generate top correlation tables
for symbol in symbols:
    create_top_correlations_table(symbol, price_data_path, sentiment_data_path)
# Generate the intraday sentiment chart
create_intraday_sentiment_chart(sentiment_data_path)

Heatmap for BTCUSDT saved to visuals/BTCUSDT_correlation_heatmap.png
Heatmap for ETHUSDT saved to visuals/ETHUSDT_correlation_heatmap.png


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Heatmap for DOGEUSDT saved to visuals/DOGEUSDT_correlation_heatmap.png
Top correlation table for BTCUSDT saved to visuals/top_correlations_BTCUSDT.png
Top correlation table for ETHUSDT saved to visuals/top_correlations_ETHUSDT.png


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Top correlation table for DOGEUSDT saved to visuals/top_correlations_DOGEUSDT.png
Intraday sentiment chart saved to visuals/intraday_sentiment_patterns.png


In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os


def analyze_price_sentiment_relationship(price_df, sentiment_df, symbol, interval):
    """
    Analyze the relationship between price movements and sentiment
    
    Parameters:
    - price_df: DataFrame with price data
    - sentiment_df: DataFrame with sentiment data
    - symbol: crypto symbol (e.g., 'BTC')
    - interval: '1m', '1h', '1d'

    Returns:
    - correlations: A dict where keys are lags and values are the correlation coefficients

    """

    # Merge price and sentiment data
    merged_df = pd.merge(
      price_df,
      sentiment_df,
      on='time',
      how='inner'
    )
    
    # Calculate price returns
    merged_df['ret'] = merged_df['close'].pct_change()
    
    # Calculate correlations at different lags
    correlations = {}
    if interval == '1m':
      for lag in range(-5, 6):  # -5 to +5 minutes
            if lag < 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(abs(lag)))
                correlations[f'sentiment_leads_{abs(lag)}m'] = corr
            elif lag > 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(-lag))
                correlations[f'price_leads_{lag}m'] = corr
            else:
                corr = merged_df['mean_pos'].corr(merged_df['ret'])
                correlations['concurrent'] = corr
    elif interval == '1h':
      for lag in range(-2, 3):  # -2 to +2 hours
            if lag < 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(abs(lag)))
                correlations[f'sentiment_leads_{abs(lag)}h'] = corr
            elif lag > 0:
                corr = merged_df['mean_pos'].corr(merged_df['ret'].shift(-lag))
                correlations[f'price_leads_{lag}h'] = corr
            else:
                corr = merged_df['mean_pos'].corr(merged_df['ret'])
                correlations['concurrent'] = corr
    elif interval == '1d':
        corr = merged_df['mean_pos'].corr(merged_df['ret'])
        correlations['concurrent'] = corr

    return correlations


def create_correlation_heatmap(symbol, price_intervals, sentiment_intervals, price_data_path, sentiment_data_path, save_path='visuals/'):
    """
    Generates and saves correlation heatmaps for different time intervals.

    Parameters:
        symbol (str): Cryptocurrency symbol ('BTCUSDT', 'ETHUSDT', 'DOGEUSDT').
        price_intervals (list): List of price intervals ('1m', '1h', '1d').
        sentiment_intervals (list): List of sentiment intervals ('1m', '1h', '1d').
        price_data_path (str): Path to the directory with price data CSVs.
        sentiment_data_path (str): Path to the directory with sentiment data CSVs.
    """
    
    all_correlations = {}
    all_data = []
    for price_interval in price_intervals:
        all_correlations[price_interval] = {}
        # Load price data
        price_df = pd.read_csv(f'{price_data_path}{symbol}_{price_interval}.csv')
        price_df['time'] = pd.to_datetime(price_df['time'])


        for sentiment_interval in sentiment_intervals:
            # Load sentiment data
            if sentiment_interval == '1m':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['minute'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1h':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_hour.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['time'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1d':
                sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_daily.csv')
                sentiment_df['time'] = pd.to_datetime(sentiment_df['date'])
                sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
                
            # Analyze relationships
            correlations = analyze_price_sentiment_relationship(price_df, sentiment_df, symbol.replace('USDT', ''), price_interval)
            
            all_correlations[price_interval][sentiment_interval] = correlations

    # Convert to DataFrame for Heatmap
    
    for price_interval, sent_intervals in all_correlations.items():
      for sent_interval, corrs in sent_intervals.items():
        for key, value in corrs.items():
          all_data.append({'Price Interval':price_interval,
                       'Sentiment Interval':sent_interval,
                       'Lag':key,
                       'Correlation':value})

    df_heatmap = pd.DataFrame(all_data)

    # Pivot the dataframe to create the heatmap
    df_heatmap_pivot = df_heatmap.pivot_table(index = ['Price Interval', 'Lag'], columns='Sentiment Interval', values='Correlation')

    # Create the heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(df_heatmap_pivot, annot=True, fmt=".2f", cmap="coolwarm")
    plt.title(f"Correlation Heatmap for {symbol}")
    plt.tight_layout()

    # Save heatmap
    os.makedirs(save_path, exist_ok=True)
    plt.savefig(f'{save_path}{symbol}_correlation_heatmap.png')
    plt.close()  # Close the figure to free up memory
    df_heatmap.to_csv(f'{save_path}{symbol}_heatmap_data.csv', index=False)

    print(f"Heatmap for {symbol} saved to {save_path}{symbol}_correlation_heatmap.png")
    print(f"Heatmap data for {symbol} saved to {save_path}{symbol}_heatmap_data.csv")

def create_top_correlations_table(symbol, price_data_path, sentiment_data_path, save_path='visuals/'):
    """
    Generates a table of the top 5 positive and negative correlations between price and sentiment
    at the daily level, and saves it to a png file.

    Parameters:
        symbol (str): Cryptocurrency symbol ('BTCUSDT', 'ETHUSDT', 'DOGEUSDT').
        price_data_path (str): Path to the directory with price data CSVs.
        sentiment_data_path (str): Path to the directory with sentiment data CSVs.
        save_path (str): Path to save the output.
    """
    # Load data
    price_interval = '1d'
    price_df = pd.read_csv(f'{price_data_path}{symbol}_{price_interval}.csv')
    price_df['time'] = pd.to_datetime(price_df['time'])
    all_correlations = {}
    for sentiment_interval in ['1m', '1h', '1d']:
            # Load sentiment data
            if sentiment_interval == '1m':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['minute'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1h':
              sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_hour.csv')
              sentiment_df['time'] = pd.to_datetime(sentiment_df['time'])
              sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
            elif sentiment_interval == '1d':
                sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_daily.csv')
                sentiment_df['time'] = pd.to_datetime(sentiment_df['date'])
                sentiment_df = sentiment_df[sentiment_df['ticker'] == symbol.replace('USDT', '')]
                
            # Analyze relationships
            correlations = analyze_price_sentiment_relationship(price_df, sentiment_df, symbol.replace('USDT', ''), price_interval)
            
            all_correlations[sentiment_interval] = correlations

    # Convert to DataFrame for Heatmap
    data = []
    for sent_interval, corrs in all_correlations.items():
      for key, value in corrs.items():
        data.append({'Sentiment Interval':sent_interval,
                     'Lag':key,
                     'Correlation':value})

    df_correlations = pd.DataFrame(data)
    df_correlations.sort_values('Correlation', ascending = False, inplace = True)

    # Select only top and bottom 5 correlations
    df_top_corr = df_correlations.head(5)
    df_bottom_corr = df_correlations.tail(5)
    df_final = pd.concat([df_top_corr, df_bottom_corr])
    df_final.to_csv(f"{save_path}top_correlations_{symbol}_data.csv", index=False)

    # Create table
    plt.figure(figsize = (10,5))
    plt.axis('off')
    plt.axis('tight')
    table = plt.table(cellText = df_final.values, colLabels=df_final.columns, loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2,1.2)
    plt.title(f'Top Correlations for {symbol}')
    plt.tight_layout()
    os.makedirs(save_path, exist_ok=True)
    plt.savefig(f"{save_path}top_correlations_{symbol}.png")
    plt.close()  # Close the figure to free up memory

    print(f"Top correlation table for {symbol} saved to {save_path}top_correlations_{symbol}.png")
    print(f"Top correlation data for {symbol} saved to {save_path}top_correlations_{symbol}_data.csv")

def create_intraday_sentiment_chart(sentiment_data_path, save_path = 'visuals/'):
  """
    Generates and saves a chart of intraday sentiment patterns.

    Parameters:
        sentiment_data_path (str): Path to the directory with sentiment data CSVs.
    """
  # Load sentiment data and calculate intraday patterns
  sentiment_df = pd.read_csv(f'{sentiment_data_path}ticker_sentiment_minute.csv')
  sentiment_df['minute'] = pd.to_datetime(sentiment_df['minute'])
  sentiment_df['hour'] = sentiment_df['minute'].dt.hour

  hourly_stats = sentiment_df.groupby('hour').agg({
      'mean_pos': ['mean', 'std'],
      'obs_count': 'mean'
  }).round(3)
  hourly_stats.reset_index(inplace=True)

  # Create the chart
  plt.figure(figsize=(12, 6))
  plt.plot(hourly_stats['hour'], hourly_stats['mean_pos']['mean'], label = 'Mean Pos Sentiment')
  plt.fill_between(hourly_stats['hour'],
                   hourly_stats['mean_pos']['mean'] + hourly_stats['mean_pos']['std'],
                   hourly_stats['mean_pos']['mean'] - hourly_stats['mean_pos']['std'],
                   alpha = 0.3)
  plt.xlabel("Hour of Day")
  plt.ylabel("Mean Positive Sentiment")
  plt.title("Intraday Sentiment Patterns")
  plt.legend()
  plt.grid(True)
  plt.xticks(range(0,24))
  plt.tight_layout()
    # Save heatmap
  os.makedirs(save_path, exist_ok=True)
  plt.savefig(f'{save_path}intraday_sentiment_patterns.png')
  plt.close()

  hourly_stats.to_csv(f'{save_path}intraday_sentiment_patterns_data.csv', index=False)

  print(f"Intraday sentiment chart saved to {save_path}intraday_sentiment_patterns.png")
  print(f"Intraday sentiment chart data saved to {save_path}intraday_sentiment_patterns_data.csv")
    
# Define paths
price_data_path = "price_data/"
sentiment_data_path = "cleaned_data/"
save_path = "visuals/"
# Create the save path if it doesn't exist
os.makedirs(save_path, exist_ok = True)

# Define symbols and intervals
symbols = ['BTCUSDT', 'ETHUSDT', 'DOGEUSDT']
price_intervals = ['1m', '1h', '1d']
sentiment_intervals = ['1m', '1h', '1d']

# Generate heatmaps
for symbol in symbols:
  create_correlation_heatmap(symbol, price_intervals, sentiment_intervals, price_data_path, sentiment_data_path)
# Generate top correlation tables
for symbol in symbols:
    create_top_correlations_table(symbol, price_data_path, sentiment_data_path)
# Generate the intraday sentiment chart
create_intraday_sentiment_chart(sentiment_data_path)

Heatmap for BTCUSDT saved to visuals/BTCUSDT_correlation_heatmap.png
Heatmap data for BTCUSDT saved to visuals/BTCUSDT_heatmap_data.csv
Heatmap for ETHUSDT saved to visuals/ETHUSDT_correlation_heatmap.png
Heatmap data for ETHUSDT saved to visuals/ETHUSDT_heatmap_data.csv


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Heatmap for DOGEUSDT saved to visuals/DOGEUSDT_correlation_heatmap.png
Heatmap data for DOGEUSDT saved to visuals/DOGEUSDT_heatmap_data.csv
Top correlation table for BTCUSDT saved to visuals/top_correlations_BTCUSDT.png
Top correlation data for BTCUSDT saved to visuals/top_correlations_BTCUSDT_data.csv
Top correlation table for ETHUSDT saved to visuals/top_correlations_ETHUSDT.png
Top correlation data for ETHUSDT saved to visuals/top_correlations_ETHUSDT_data.csv


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Top correlation table for DOGEUSDT saved to visuals/top_correlations_DOGEUSDT.png
Top correlation data for DOGEUSDT saved to visuals/top_correlations_DOGEUSDT_data.csv
Intraday sentiment chart saved to visuals/intraday_sentiment_patterns.png
Intraday sentiment chart data saved to visuals/intraday_sentiment_patterns_data.csv


In [13]:
# Install the 'ta' package
%pip install ta


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import ta
from typing import Dict, List
from datetime import datetime

class BaseStrategy:
    def __init__(self):
        self.lookback = 20
        self.volume_threshold = 1.2
        self.position_size = 0.2  # Base position size 20%
        
    def calculate_base_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate common indicators for all strategies"""
        # Price indicators
        df['price_ma'] = df['close'].rolling(self.lookback).mean()
        df['price_std'] = df['close'].rolling(self.lookback).std()
        df['price_momentum'] = df['close'].pct_change(3)
        
        # Volume indicators
        df['volume_ma'] = df['volume'].rolling(self.lookback).mean()
        df['volume_ratio'] = df['volume'] / df['volume_ma']
        
        # Trend strength
        df['trend_strength'] = abs(df['price_momentum'].rolling(5).mean())
        
        return df

class BTCStrategy(BaseStrategy):
    def __init__(self):
        super().__init__()
        self.stop_loss = 0.02      # 2% stop loss
        self.take_profit = 0.03    # 3% take profit
        self.max_position = 2000   # Maximum $2000 per trade
        
    def generate_signals(self, df: pd.DataFrame) -> pd.DataFrame:
        df = self.calculate_base_indicators(df)
        df['signal'] = 0
        
        # Calculate specific indicators
        df['rsi'] = ta.momentum.RSIIndicator(df['close']).rsi()
        df['macd'] = ta.trend.MACD(df['close']).macd()
        
        # Long conditions (Multiple confirmations)
        long_conditions = (
            (df['close'] > df['price_ma']) &          # Price above MA
            (df['rsi'] < 65) &                        # Not overbought
            (df['macd'] > 0) &                        # MACD positive
            (df['volume_ratio'] > self.volume_threshold)  # Good volume
        )
        
        # Short conditions
        short_conditions = (
            (df['close'] < df['price_ma']) &          # Price below MA
            (df['rsi'] > 35) &                        # Not oversold
            (df['macd'] < 0) &                        # MACD negative
            (df['volume_ratio'] > self.volume_threshold)  # Good volume
        )
        
        df.loc[long_conditions, 'signal'] = 1
        df.loc[short_conditions, 'signal'] = -1
        
        return df

class ETHStrategy(BaseStrategy):
    def __init__(self):
        super().__init__()
        self.stop_loss = 0.015     # 1.5% stop loss
        self.take_profit = 0.025   # 2.5% take profit
        self.max_position = 1500   # Maximum $1500 per trade
        
    def generate_signals(self, df: pd.DataFrame) -> pd.DataFrame:
        df = self.calculate_base_indicators(df)
        df['signal'] = 0
        
        # Calculate specific indicators
        df['ema_fast'] = ta.trend.EMAIndicator(df['close'], window=10).ema_indicator()
        df['ema_slow'] = ta.trend.EMAIndicator(df['close'], window=21).ema_indicator()
        df['atr'] = ta.volatility.AverageTrueRange(df['high'], df['low'], df['close']).average_true_range()
        
        # Long conditions (more lenient)
        long_conditions = (
            (df['ema_fast'] > df['ema_slow']) &
            (df['close'] > df['price_ma']) &
            (df['volume_ratio'] > self.volume_threshold)
        )
        
        # Short conditions (more lenient)
        short_conditions = (
            (df['ema_fast'] < df['ema_slow']) &
            (df['close'] < df['price_ma']) &
            (df['volume_ratio'] > self.volume_threshold)
        )
        
        df.loc[long_conditions, 'signal'] = 1
        df.loc[short_conditions, 'signal'] = -1
        
        return df

class DOGEStrategy(BaseStrategy):
    def __init__(self):
        super().__init__()
        self.stop_loss = 0.005       # 0.5% stop loss
        self.take_profit = 0.015     # 1.5% take profit
        self.max_position = 500      # Smaller position size
        self.lookback = 20           # Longer lookback to identify sentiment patterns
        
    def generate_signals(self, df: pd.DataFrame) -> pd.DataFrame:
        df = self.calculate_base_indicators(df)
        df['signal'] = 0
        
        # Volume patterns
        df['volume_ma'] = df['volume'].rolling(20).mean()
        df['volume_ratio'] = df['volume'] / df['volume_ma']
        
        # Price patterns
        df['high_low_range'] = (df['high'] - df['low']) / df['low']
        df['price_velocity'] = df['close'].pct_change(3)
        df['price_acceleration'] = df['price_velocity'].diff()
        
        # Volatility
        df['volatility'] = df['close'].pct_change().rolling(5).std()
        df['volatility_ma'] = df['volatility'].rolling(20).mean()
        
        # Identify potential bottoms (based on Very Low sentiment success)
        df['lower_band'] = df['close'].rolling(10).min()
        df['upper_band'] = df['close'].rolling(10).max()
        df['price_position'] = (df['close'] - df['lower_band']) / (df['upper_band'] - df['lower_band'])
        
        # Buy signals - looking for oversold conditions
        long_conditions = (
            (df['price_position'] < 0.2) &                   # Price near recent lows
            (df['volume_ratio'] > 1.5) &                     # Above average volume
            (df['volatility'] < df['volatility_ma']) &       # Lower volatility
            (df['price_acceleration'] > 0)                   # Momentum turning positive
        )
        
        # Sell signals - take profits on strength
        short_conditions = (
            (df['price_position'] > 0.8) &                  # Price near recent highs
            (df['volume_ratio'] > 1.5) &                    # Above average volume
            (df['volatility'] > df['volatility_ma'] * 1.2)  # Higher volatility
        )
        
        # Apply signals
        df.loc[long_conditions, 'signal'] = 1
        df.loc[short_conditions, 'signal'] = -1
        
        # Dynamic position sizing based on confidence
        df['position_size'] = 0.0
        
        # Base position size inversely proportional to price position
        df.loc[long_conditions, 'position_size'] = 0.05 + (0.05 * (1 - df['price_position']))
        df.loc[short_conditions, 'position_size'] = 0.05 + (0.05 * df['price_position'])
        
        # Adjust for volume confirmation
        df.loc[df['signal'] != 0, 'position_size'] *= df['volume_ratio'].clip(0.5, 2.0)
        
        # Final position size limits
        df['position_size'] = df['position_size'].clip(0.05, 0.15)
        
        # Exit conditions
        exit_conditions = (
            (df['volatility'] > df['volatility_ma'] * 2) |  # Extreme volatility
            (df['volume_ratio'] < 0.5)                      # Volume dying
        )
        df.loc[exit_conditions & (df['signal'] != 0), 'signal'] = 0
        
        return df

def backtest_strategy(df: pd.DataFrame, strategy: BaseStrategy, initial_capital: float = 10000.0) -> Dict:
    """Backtest a strategy with proper position sizing and risk management"""
    balance = initial_capital
    position = 0
    entry_price = 0
    entry_quantity = 0
    trades = []
    equity_curve = [initial_capital]
    
    for i in range(1, len(df)):
        current_price = df['close'].iloc[i]
        signal = df['signal'].iloc[i]
        date = pd.to_datetime(df['timestamp'].iloc[i])
        
        # Check stop loss and take profit if in position
        if position != 0:
            pnl_pct = (current_price - entry_price) / entry_price * position
            if pnl_pct <= -strategy.stop_loss or pnl_pct >= strategy.take_profit:
                # Close position with proper risk management
                pnl = (current_price - entry_price) * entry_quantity * position
                balance += pnl
                position = 0
                trades.append({
                    'date': date,
                    'type': 'close',
                    'price': current_price,
                    'pnl': pnl,
                    'balance': balance,
                    'reason': 'sl_tp'
                })
        
        # Process signals
        if position == 0 and signal != 0:
            # Calculate position size based on risk
            risk_amount = balance * strategy.stop_loss
            max_trade_amount = min(balance * strategy.position_size, strategy.max_position)
            position_amount = min(max_trade_amount, risk_amount / strategy.stop_loss)
            
            # Calculate quantity
            quantity = position_amount / current_price
            
            position = signal
            entry_price = current_price
            entry_quantity = quantity
            
            trades.append({
                'date': date,
                'type': 'open',
                'price': current_price,
                'quantity': quantity,
                'amount': position_amount,
                'balance': balance
            })
        
        # Update equity curve
        current_equity = balance
        if position != 0:
            unrealized_pnl = (current_price - entry_price) * entry_quantity * position
            current_equity += unrealized_pnl
        equity_curve.append(current_equity)
    
    # Calculate performance metrics
    returns = pd.Series(equity_curve).pct_change().dropna()
    
    # Calculate max drawdown safely
    def calculate_max_drawdown(equity_series):
        if not equity_series:
            return 0
        rolling_max = pd.Series(equity_series).expanding().max()
        drawdowns = pd.Series(equity_series) / rolling_max - 1
        return drawdowns.min() * 100
    
    return {
        'final_balance': equity_curve[-1],
        'total_return': (equity_curve[-1] / initial_capital - 1) * 100,
        'sharpe_ratio': np.sqrt(252) * returns.mean() / returns.std() if len(returns) > 0 and returns.std() != 0 else 0, # Annualized Sharpe ratio for daily data
        'max_drawdown': calculate_max_drawdown(equity_curve),
        'win_rate': len([t for t in trades if t.get('pnl', 0) > 0]) / len([t for t in trades if 'pnl' in t]) * 100 if trades else 0,
        'trades': trades,
        'equity_curve': equity_curve
    }

def run_strategies(price_data_path: str):
    """Run all strategies with detailed reporting"""
    strategies = {
        'BTCUSDT': BTCStrategy(),
        'ETHUSDT': ETHStrategy(),
        'DOGEUSDT': DOGEStrategy()
    }
    
    for symbol, strategy in strategies.items():
        print(f"\n{'='*50}")
        print(f"Processing {symbol}")
        
        # Load data
        df = pd.read_csv(f"{price_data_path}{symbol}_1d.csv")
        df = strategy.generate_signals(df)
        
        # Run backtest
        results = backtest_strategy(df, strategy)
        
        # Print results
        print(f"\nResults for {symbol}:")
        print(f"Initial Balance: $10,000.00")
        print(f"Final Balance: ${results['final_balance']:.2f}")
        print(f"Total Return: {results['total_return']:.2f}%")
        print(f"Sharpe Ratio: {results['sharpe_ratio']:.2f}")
        print(f"Max Drawdown: {results['max_drawdown']:.2f}%")
        print(f"Win Rate: {results['win_rate']:.2f}%")
        print(f"Number of trades: {len(results['trades'])}")
        
        if results['trades']:
            print("\nFirst 3 trades:")
            for trade in results['trades'][:3]:
                print(trade)
        
# Run the analysis
price_data_path = "price_data/"
run_strategies(price_data_path)


Processing BTCUSDT

Results for BTCUSDT:
Initial Balance: $10,000.00
Final Balance: $10562.24
Total Return: 5.62%
Sharpe Ratio: 1.21
Max Drawdown: -1.67%
Win Rate: 60.87%
Number of trades: 47

First 3 trades:
{'date': Timestamp('2024-03-20 00:00:00'), 'type': 'open', 'price': np.float64(67840.51), 'quantity': np.float64(0.029480910447164978), 'amount': 2000.0, 'balance': 10000.0}
{'date': Timestamp('2024-03-21 00:00:00'), 'type': 'close', 'price': np.float64(65501.27), 'pnl': np.float64(-68.96292495442614), 'balance': np.float64(9931.037075045573), 'reason': 'sl_tp'}
{'date': Timestamp('2024-04-15 00:00:00'), 'type': 'open', 'price': np.float64(63419.99), 'quantity': np.float64(0.03131831800996996), 'amount': np.float64(1986.2074150091148), 'balance': np.float64(9931.037075045573)}

Processing ETHUSDT

Results for ETHUSDT:
Initial Balance: $10,000.00
Final Balance: $10902.26
Total Return: 9.02%
Sharpe Ratio: 1.44
Max Drawdown: -1.99%
Win Rate: 53.85%
Number of trades: 104

First 3 tra