In [None]:
!pip install yfinance faker --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25h

### Step 1: Imports and Configuration

In [None]:
import yfinance as yf              # Download historical stock data
import pandas as pd               # Handle structured data (tables)
import numpy as np                # Numerical operations
from faker import Faker           # Generate realistic timestamps
from datetime import datetime, timedelta
import random                     # Add randomness
import time                       # Handle delays and retries

# Configuration parameters
SYMBOL = "AAPL"                    # The stock symbol to simulate tweets about
START_DATE = "2015-01-01"          # Start of the price-history download window
END_DATE = "2025-05-30"            # End of the download window (last bar seen is the day before — presumably exclusive; confirm in yfinance docs)
OUTPUT_FILE = "synthetic_financial_tweets_labeled.csv"
TWEETS_PER_DAY = 20               # Base number of tweets per day
EVENT_PROB = 0.05                 # 5% probability of a special market event
SEED = 42                         # Fixed seed so Restart & Run All regenerates the same dataset

# Seed every random source the pipeline draws from: the stdlib `random` module
# (labels, templates, events, intraday jitter) and Faker (tweet timestamps).
random.seed(SEED)
Faker.seed(SEED)
fake = Faker()                    # Initialize faker for timestamps

### Step 2: Generate Random Timestamps During Trading Hours

In [None]:
def generate_timestamps(date, count):
    """Draw `count` random timestamps on `date` and return them in ascending order.

    Each timestamp is requested from the module-level Faker instance between
    09:30 and 16:00 (09:30 plus 6.5 hours) on the given day.

    Args:
        date: The day to generate timestamps for. A pandas Timestamp is
            converted to a plain datetime first; only its date part is used.
        count: How many timestamps to draw.

    Returns:
        A sorted list of datetime objects (empty when `count` is 0).
    """
    day = date.to_pydatetime() if isinstance(date, pd.Timestamp) else date

    # Anchor at midnight of the day, then offset to the session boundaries.
    midnight = datetime.combine(day, datetime.min.time())
    session_open = midnight + timedelta(hours=9, minutes=30)
    session_close = session_open + timedelta(hours=6, minutes=30)

    stamps = []
    for _ in range(count):
        stamps.append(
            fake.date_time_between_dates(datetime_start=session_open, datetime_end=session_close)
        )
    stamps.sort()
    return stamps

### Step 3: Determine Sentiment Based on Market Movement and Trend

In [None]:
def determine_sentiment_label(row, intraday_pct, strong_move=1.5, weak_move=0.5):
    """Classify an intraday move as positive (1), neutral (0) or negative (-1).

    Rules, checked in order:
      1. A move beyond +/- `strong_move` percent is labelled by its sign alone.
      2. A move beyond +/- `weak_move` percent is labelled by its sign only when
         the implied price sits on the matching side of the previous day's
         50-day moving average (above for positive, not above for negative).
      3. Everything else is neutral.

    The day's volatility does not influence the label: the original code had
    a volatility branch, but both of its outcomes returned 0.

    Args:
        row: Mapping/Series providing 'Open' and 'MA_50_prev' for the day.
        intraday_pct: Simulated intraday return, in percent.
        strong_move: Absolute percent move that is decisive on its own.
        weak_move: Absolute percent move that needs moving-average confirmation.

    Returns:
        int: 1, 0 or -1.
    """
    open_price = float(row['Open'])
    ma_50_prev = float(row['MA_50_prev'])  # Previous day's 50-day moving average

    # Price implied by applying the intraday move to the day's open.
    current_price = open_price * (1 + intraday_pct / 100)
    above_ma = current_price > ma_50_prev

    if intraday_pct > strong_move:
        return 1
    if intraday_pct < -strong_move:
        return -1
    if intraday_pct > weak_move and above_ma:
        return 1
    if intraday_pct < -weak_move and not above_ma:
        return -1
    return 0

### Step 4: Add Controlled Randomness to Sentiment Label

In [None]:
def noisy_sentiment_label(true_label, volatility):
    """Return `true_label`, or occasionally one of the other two labels.

    The chance of mislabelling is ten times the volatility, capped at 0.3,
    and is split evenly between the two other labels.

    Args:
        true_label: One of 1, 0, -1.
        volatility: The day's volatility; converted with float().

    Returns:
        int: The sampled label (1, 0 or -1).

    Raises:
        KeyError: If `true_label` is not one of 1, 0, -1.
    """
    labels = [1, 0, -1]
    flip_prob = min(0.3, float(volatility) * 10)
    keep_weight = 1 - flip_prob
    stray_weight = flip_prob / 2

    # One weight vector per possible true label, aligned with `labels`.
    weights_by_label = {
        label: [keep_weight if candidate == label else stray_weight for candidate in labels]
        for label in labels
    }
    sampled = random.choices(labels, weights=weights_by_label[true_label])
    return sampled[0]

### Step 5: Define Tweet Templates for Sentiment and Events

In [None]:
# Generic tweet templates, keyed by sentiment label:
#   1 = positive wording, 0 = neutral wording, -1 = negative wording.
# Each template is filled with str.format(symbol=..., pct=...). The "$" in
# front of {symbol} is literal text (it renders as a cashtag such as $AAPL);
# {pct:.2f} renders the percentage with two decimals.
TEMPLATES = {
    # Positive
    1: [
        "🚀 ${symbol} rally continues: up {pct:.2f}% 📈 #bullish",
        "📢 Big day for ${symbol}: +{pct:.2f}% on Fed optimism",
        "📊 ${symbol} spikes after positive earnings: +{pct:.2f}%",
    ],
    # Neutral (the last template intentionally has no {pct} placeholder)
    0: [
        "🔍 ${symbol} stable today (±{pct:.2f}%). Traders wait for direction.",
        "Market quiet. ${symbol} sees little action: ±{pct:.2f}%",
        "Consolidation in ${symbol} as investors hold positions steady.",
    ],
    # Negative
    -1: [
        "📉 ${symbol} tumbles {pct:.2f}% amid inflation fears 😟",
        "🚨 Red day: ${symbol} down {pct:.2f}% as recession talk grows",
        "${symbol} sees heavy selling. Dropped {pct:.2f}% today.",
    ],
}

# Event-specific tweet templates: event name -> sentiment label -> templates.
# Only "earnings" provides neutral (0) wording; "fed" and "cpi" define just the
# positive (1) and negative (-1) pools. Placeholders follow the same
# str.format(symbol=..., pct=...) convention as the generic templates.
EVENT_TEMPLATES = {
    "earnings": {
        1: [
            "🔥 ${symbol} EPS BEAT! Up {pct:.2f}% AH on strong guidance",
            "🚀 ${symbol} surges {pct:.2f}% after crushing earnings estimates",
        ],
        0: [
            "${symbol} earnings met expectations. Flat trading AH",
            "Mixed reaction to ${symbol} results. Shares unchanged",
        ],
        -1: [
            "⚠️ ${symbol} misses earnings! Down {pct:.2f}% on weak outlook",
            "📉 ${symbol} tumbles {pct:.2f}% after earnings disaster",
        ],
    },
    "fed": {
        1: [
            "📈 ${symbol} jumps {pct:.2f}% on dovish Fed comments",
            "Bull run! Fed holds rates steady. ${symbol} up {pct:.2f}%",
        ],
        -1: [
            "💥 Hawkish Fed tanks markets! ${symbol} down {pct:.2f}%",
            "Rate hike fears: ${symbol} plunges {pct:.2f}% after Fed meeting",
        ],
    },
    "cpi": {
        1: [
            "🎉 Inflation cools! ${symbol} rallies {pct:.2f}% on CPI report",
            "CPI better than expected. ${symbol} gains {pct:.2f}%",
        ],
        -1: [
            "🔥 Hot CPI data! ${symbol} crashes {pct:.2f}% on inflation fears",
            "Stagflation worries: ${symbol} down {pct:.2f}% after CPI print",
        ],
    },
}

### Step 6: Simulate Intraday Price Movement

In [None]:
def calculate_intraday_return(daily_return, time_fraction):
    """Scale the day's return into a simulated intraday return.

    The scaling follows a parabola in `time_fraction`: 0.2 at the midpoint
    (0.5) rising to 1.2 at either end (0.0 or 1.0). The result is then
    jittered by a uniform random factor between 0.8 and 1.2.

    Args:
        daily_return: The day's return; converted with float().
        time_fraction: Position within the session (the caller passes 0..1).

    Returns:
        float: The simulated intraday return, in the units of `daily_return`.
    """
    distance_from_midpoint = time_fraction - 0.5
    intensity = 4 * distance_from_midpoint**2 + 0.2
    scaled_return = float(daily_return) * intensity
    return scaled_return * random.uniform(0.8, 1.2)

### Step 7: Randomly Generate Market Events

In [None]:
def generate_market_event(date):
    """Randomly decide whether `date` carries a special market event.

    With probability of roughly 1 - EVENT_PROB there is no event. Otherwise
    the event type depends on the calendar month:

      * Jan/Apr/Jul/Oct: "earnings" when a second draw exceeds 0.3; if it does
        not, fall back to the coin flip described in the last bullet.
      * Mar/Jun/Sep/Dec: "fed" when a second draw exceeds 0.4, else "cpi".
      * Any other month: an even pick between "cpi" and no event.

    Args:
        date: Any object exposing a `.month` attribute.

    Returns:
        "earnings", "fed", "cpi", or None when there is no event.
    """
    no_event_today = random.random() > EVENT_PROB
    if no_event_today:
        return None

    month = date.month
    if month in (1, 4, 7, 10):
        if random.random() > 0.3:
            return "earnings"
    elif month in (3, 6, 9, 12):
        if random.random() > 0.4:
            return "fed"
        return "cpi"

    # Reached for the remaining months and for the months above whose
    # "earnings" draw failed.
    return random.choice(["cpi", None])

### Step 8: Main Pipeline

In [None]:
def _download_prices(attempts=3, retry_delay=2):
    """Download daily price bars for SYMBOL between START_DATE and END_DATE.

    Retries up to `attempts` times. After an exception the error is printed
    and the function sleeps `retry_delay` seconds before trying again.

    Returns:
        The downloaded DataFrame, or None when every attempt raised or
        produced no rows.
    """
    for _ in range(attempts):
        try:
            data = yf.download(SYMBOL, start=START_DATE, end=END_DATE, progress=False, auto_adjust=False)
        except Exception as e:
            print(f"Retry: {e}")
            time.sleep(retry_delay)
            continue
        if data is not None and not data.empty:
            return data
    return None


def _prepare_features(data):
    """Return a copy of `data` with flat columns plus Return, Volatility and MA_50_prev.

    Rows containing any NaN (for example the warm-up period of the 50-day
    moving average) are dropped.
    """
    data = data.copy()

    # yfinance may return MultiIndex columns (field, ticker). With those,
    # row['Open'] is a one-element Series and float(row['Open']) triggers a
    # deprecation warning that is slated to become a TypeError. Keep only the
    # field level so every cell is a scalar.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    data['Return'] = data['Close'].pct_change() * 100               # daily % change of the close
    data['Volatility'] = (data['High'] - data['Low']) / data['Open']  # day's range relative to the open
    data['MA_50_prev'] = data['Close'].rolling(50).mean().shift(1)    # 50-day mean close as of the previous row
    return data.dropna()


def main():
    """Generate the synthetic labelled tweet dataset and write it to OUTPUT_FILE.

    Steps: download prices for SYMBOL, derive per-day features, then for each
    trading day decide on a market event, draw tweet timestamps, simulate an
    intraday return per tweet, label it (true label plus noisy final label),
    and render a tweet from the matching template pool. The collected rows
    are saved as CSV and a short summary is printed.

    Returns:
        None. Prints a message and returns early when no price data could be
        downloaded or no tweets were generated.
    """
    # Download historical price data
    data = _download_prices()
    if data is None:
        print("Failed to get data.")
        return

    # Preprocess features
    data = _prepare_features(data)

    all_tweets = []

    for date, row in data.iterrows():
        event = generate_market_event(date)
        event_multiplier = 3 if event else 1

        # Convert the per-day values once instead of once per tweet.
        daily_return = float(row['Return'])
        volatility = float(row['Volatility'])
        volume = float(row["Volume"])
        date_str = date.strftime('%Y-%m-%d')

        # More tweets on volatile / large-move days. By operator precedence the
        # event multiplier scales only the volatility component.
        volatility_component = int(volatility * 1000)
        return_component = int(abs(daily_return))
        tweet_count = TWEETS_PER_DAY + volatility_component * event_multiplier + return_component

        try:
            timestamps = generate_timestamps(date, tweet_count)
        except Exception as e:
            # Best effort: skip this day rather than abort the whole run.
            print(f"Timestamp error: {e}")
            continue

        for ts in timestamps:
            # Hours elapsed since 09:30, as a fraction of the 6.5-hour session.
            hours_open = (ts.hour - 9.5) + (ts.minute / 60)
            time_fraction = hours_open / 6.5
            intraday_pct = calculate_intraday_return(daily_return, time_fraction)
            true_label = determine_sentiment_label(row, intraday_pct)
            final_label = noisy_sentiment_label(true_label, volatility)

            # On event days ~30% of tweets use event wording; events without a
            # pool for this label fall back to the generic templates.
            if event and random.random() > 0.7:
                template_pool = EVENT_TEMPLATES[event].get(final_label, TEMPLATES[final_label])
            else:
                template_pool = TEMPLATES[final_label]

            # The templates carry the direction in words, so pass the magnitude only.
            tweet_text = random.choice(template_pool).format(symbol=SYMBOL, pct=abs(intraday_pct))

            all_tweets.append({
                "timestamp": ts.isoformat(),
                "text": tweet_text,
                "symbol": SYMBOL,
                "daily_return": daily_return,
                "intraday_return": intraday_pct,
                "volatility": volatility,
                "volume": volume,
                "true_label": true_label,
                "final_label": final_label,
                "event": event if event else "",
                "date": date_str
            })

    if not all_tweets:
        # An empty frame has no 'date'/'event' columns; the summary below would fail.
        print("No tweets generated.")
        return

    df = pd.DataFrame(all_tweets)
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✅ {len(df)} tweets saved to {OUTPUT_FILE}")
    print(f"From {df['date'].min()} to {df['date'].max()} | Events: {len(df[df['event'] != ''])}")

# Run the pipeline: download prices, generate the labelled tweets and write
# them to OUTPUT_FILE (a short summary is printed when it finishes).
main()

  volatility_component = int(float(row['Volatility']) * 1000)
  return_component = int(abs(float(row['Return'])))
  return float(daily_return) * intensity * random.uniform(0.8, 1.2)
  open_price = float(row['Open'])
  volatility = float(row['Volatility'])
  "daily_return": float(row['Return']),
  "volatility": float(row['Volatility']),
  "volume": float(row["Volume"]),


✅ 108388 tweets saved to synthetic_financial_tweets_labeled.csv
From 2015-03-17 to 2025-05-29 | Events: 8372


In [None]:
import pandas as pd

# Reload the dataset the pipeline just wrote. Reading via OUTPUT_FILE (set in
# the configuration cell) keeps this cell pointed at the same file if the
# output path is ever changed.
df = pd.read_csv(OUTPUT_FILE)

# Preview the first rows as a sanity check.
df.head(10)

Unnamed: 0,timestamp,text,symbol,daily_return,intraday_return,volatility,volume,true_label,final_label,event,date
0,2015-03-17T10:17:37.004900,📊 $AAPL spikes after positive earnings: +1.35%,AAPL,1.672672,1.351033,0.013264,204092400.0,1,1,,2015-03-17
1,2015-03-17T10:24:49.233734,📊 $AAPL spikes after positive earnings: +1.03%,AAPL,1.672672,1.031203,0.013264,204092400.0,1,1,,2015-03-17
2,2015-03-17T10:59:22.020299,📢 Big day for $AAPL: +0.95% on Fed optimism,AAPL,1.672672,0.94506,0.013264,204092400.0,1,1,,2015-03-17
3,2015-03-17T10:59:33.652036,📊 $AAPL spikes after positive earnings: +0.75%,AAPL,1.672672,0.749279,0.013264,204092400.0,1,1,,2015-03-17
4,2015-03-17T11:38:39.816551,🔍 $AAPL stable today (±0.48%). Traders wait fo...,AAPL,1.672672,0.483688,0.013264,204092400.0,0,0,,2015-03-17
5,2015-03-17T11:42:00.688413,📢 Big day for $AAPL: +0.52% on Fed optimism,AAPL,1.672672,0.520811,0.013264,204092400.0,1,1,,2015-03-17
6,2015-03-17T11:42:22.316696,📊 $AAPL spikes after positive earnings: +0.53%,AAPL,1.672672,0.525476,0.013264,204092400.0,1,1,,2015-03-17
7,2015-03-17T11:53:38.883884,📢 Big day for $AAPL: +0.54% on Fed optimism,AAPL,1.672672,0.541543,0.013264,204092400.0,1,1,,2015-03-17
8,2015-03-17T11:57:21.775749,🔍 $AAPL stable today (±0.35%). Traders wait fo...,AAPL,1.672672,0.351279,0.013264,204092400.0,0,0,,2015-03-17
9,2015-03-17T12:03:23.594033,Consolidation in $AAPL as investors hold posit...,AAPL,1.672672,0.396537,0.013264,204092400.0,0,0,,2015-03-17
