# Notebook 03 – News & Social Media Sentiment Generation
## Synthetic but Realistic Multimodal Data for EGX Stocks

This notebook generates **synthetic but realistic** financial news and social media sentiment data
for the following Egyptian stocks:
- COMI (Commercial International Bank)
- AMOC (Alexandria Mineral Oils Company)
- SWDY (Elsewedy Electric)

The generated data is **academically acceptable**, reproducible, and aligned with historical price movements.


In [1]:
# Mount Google Drive at /content/drive so that the project folder under
# MyDrive (used for all inputs and outputs below) is reachable from this runtime.
# This cell depends on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Project root on the mounted Drive, plus the two output folders this notebook writes to.
BASE_DIR = '/content/drive/MyDrive/finrl-egx-multimodal'
NEWS_DIR = os.path.join(BASE_DIR, 'data/news')
SENTIMENT_DIR = os.path.join(BASE_DIR, 'data/sentiment')

# Create both output folders up front; folders that already exist are left as they are.
for _out_dir in (NEWS_DIR, SENTIMENT_DIR):
    os.makedirs(_out_dir, exist_ok=True)


In [None]:
# Read each ticker's processed price file and keep only the two columns
# needed to tie sentiment to returns.
stocks = ['COMI', 'AMOC', 'SWDY']


def _load_returns(ticker):
    """Return the Date / daily_return columns of one ticker's processed CSV, with Date parsed."""
    csv_path = os.path.join(BASE_DIR, 'data/stocks_processed', f'{ticker}_processed.csv')
    frame = pd.read_csv(csv_path)
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame[['Date', 'daily_return']]


price_data = {ticker: _load_returns(ticker) for ticker in stocks}


In [None]:
# Headline templates, grouped by the sentiment bucket they are drawn for.

# Used when the day's sentiment is clearly positive.
positive_news = [
    'Company reports strong financial performance',
    'Positive outlook driven by sector growth',
    'Earnings exceed market expectations',
]

# Used when the day's sentiment is clearly negative.
negative_news = [
    'Company faces short-term operational challenges',
    'Market concerns over declining margins',
    'Earnings disappoint amid economic pressure',
]

# Used when the day's sentiment is close to zero.
neutral_news = [
    'Company maintains stable operations',
    'No major changes reported in latest update',
    'Business performance remains in line with forecasts',
]

In [None]:
def generate_news_and_sentiment(stock, seed=42):
    """Generate synthetic daily sentiment and occasional news headlines for one stock.

    For every row of ``price_data[stock]`` a sentiment score is drawn from a
    normal distribution centred on ``5 * daily_return`` (std 0.2) and clipped
    to ``[-1, 1]``. Only the same-day return is used, never a future one.
    On roughly 30% of days a headline is also emitted, chosen from the
    positive / negative / neutral templates according to that day's sentiment
    (above 0.1, below -0.1, or in between).

    Parameters
    ----------
    stock : str
        Key into the module-level ``price_data`` dict.
    seed : int or None, default 42
        Non-negative base seed. It is combined with the bytes of ``stock`` so
        that each ticker gets its own reproducible random stream and repeated
        runs give identical output. Pass ``None`` for fresh, non-reproducible
        randomness.

    Returns
    -------
    (news_df, sentiment_df) : tuple of pandas.DataFrame
        ``news_df`` has columns ``date, headline, sentiment`` (news days only);
        ``sentiment_df`` has columns ``date, sentiment`` (one row per input row).
    """
    return_scale = 5    # how strongly the same-day return shifts the mean sentiment
    noise_std = 0.2     # standard deviation of the noise around that mean
    news_prob = 0.3     # probability that a given day produces a headline
    threshold = 0.1     # |sentiment| at or below this counts as neutral

    prices = price_data[stock]
    dates = prices['Date'].to_numpy()
    # A missing return carries no signal; treat it as 0 so the day gets
    # pure-noise sentiment instead of a NaN that would be written to the CSV.
    returns = prices['daily_return'].fillna(0.0).to_numpy(dtype=float)

    # A local generator keeps results independent of the global NumPy state
    # and of the order in which stocks are processed.
    if seed is None:
        rng = np.random.default_rng()
    else:
        rng = np.random.default_rng([seed, *stock.encode('utf-8')])

    sentiment = np.clip(rng.normal(returns * return_scale, noise_std), -1, 1)
    has_news = rng.random(len(returns)) < news_prob

    def pool_for(value):
        # Map a sentiment score to the matching headline template list.
        if value > threshold:
            return positive_news
        if value < -threshold:
            return negative_news
        return neutral_news

    headlines = [str(rng.choice(pool_for(value))) for value in sentiment[has_news]]

    sentiment_df = pd.DataFrame({'date': dates, 'sentiment': sentiment})
    news_df = pd.DataFrame({
        'date': dates[has_news],
        'headline': headlines,
        'sentiment': sentiment[has_news],
    })

    return news_df, sentiment_df

In [None]:
# For every ticker: build the synthetic frames, write both CSVs, then report the paths.
for stock in stocks:
    print(f'Generating news and sentiment for {stock}...')
    news_df, sentiment_df = generate_news_and_sentiment(stock)

    # label -> (frame to write, destination file)
    targets = {
        'news': (news_df, os.path.join(NEWS_DIR, f'{stock}_news.csv')),
        'sentiment': (sentiment_df, os.path.join(SENTIMENT_DIR, f'{stock}_sentiment.csv')),
    }

    # Write both files first so the messages are only printed after both saves.
    for frame, csv_path in targets.values():
        frame.to_csv(csv_path, index=False)

    for label, (_, csv_path) in targets.items():
        print(f'Saved {label} to {csv_path}')

In [None]:
# Show the first rows of the saved COMI news and sentiment files as a pair.
(
    pd.read_csv(os.path.join(NEWS_DIR, 'COMI_news.csv')).head(),
    pd.read_csv(os.path.join(SENTIMENT_DIR, 'COMI_sentiment.csv')).head(),
)