Metadata extraction using an LLM, to be fed into RL systems

In [None]:
import sys
import subprocess
import warnings

import importlib.util

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

print("Installing dependencies...")
packages = ['praw', 'transformers', 'torch', 'pandas', 'numpy']

# For each package listed above the pip name equals the import name, so
# find_spec() can tell whether it is already importable. Skipping those
# avoids launching a pip subprocess per package on every re-run.
missing_packages = [
    package for package in packages
    if importlib.util.find_spec(package) is None
]
if missing_packages:
    # One pip invocation for everything that is still missing.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', '-q', *missing_packages]
    )

import praw
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from collections import defaultdict


Installing dependencies...


In [None]:
import os

try:
    from google.colab import userdata
except ImportError:
    # Not running inside Colab: secrets are then read from the environment only.
    userdata = None


def _load_secret(name):
    """Return the secret called ``name``, or None when it is not configured.

    The Colab secrets store is consulted first (when available); the process
    environment is used as a fallback.
    """
    if userdata is not None:
        try:
            value = userdata.get(name)
            if value:
                return value
        except Exception:
            # Secret not defined or notebook access not granted: fall back to
            # the environment instead of aborting the cell.
            pass
    return os.environ.get(name)


# ========== REDDIT ==========
# Credentials must never be written into the notebook. Configure them as
# Colab secrets or environment variables with the names used below.
CLIENT_ID = _load_secret("REDDIT_CLIENT_ID")
CLIENT_SECRET = _load_secret("REDDIT_CLIENT_SECRET")
USER_AGENT = "FinancialSentimentAnalysis/1.0"
USERNAME = _load_secret("REDDIT_USERNAME")
PASSWORD = _load_secret("REDDIT_PASSWORD")

try:
    missing_secrets = [
        secret_name
        for secret_name, secret_value in (
            ("REDDIT_CLIENT_ID", CLIENT_ID),
            ("REDDIT_CLIENT_SECRET", CLIENT_SECRET),
            ("REDDIT_USERNAME", USERNAME),
            ("REDDIT_PASSWORD", PASSWORD),
        )
        if not secret_value
    ]
    if missing_secrets:
        # Routed through the same handler as a failed login so the notebook
        # continues with USE_REDDIT = False.
        raise RuntimeError("missing credentials: " + ", ".join(missing_secrets))

    reddit = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT,
        username=USERNAME,
        password=PASSWORD
    )

    # Test connection
    reddit.user.me()
    print("Connected to Reddit successfully!\n")
    USE_REDDIT = True
except Exception as e:
    print(f"ERROR: Could not connect to Reddit: {e}")
    print("Make sure PASSWORD is correct!\n")
    USE_REDDIT = False

# Asset name -> [subreddit, search phrase].
# Note that the second element is a single string, not a list of keywords.
subreddit_queries = dict(
    SP_500=["r/stocks", "S&P 500"],
    NASDAQ=["r/stocks", "NASDAQ"],
    Dow_Jones=["r/stocks", "Dow Jones"],
    Gold=["r/investing", "gold prices"],
    Silver=["r/investing", "silver prices"],
    Oil=["r/investing", "oil prices"],
)

print("STEP 1/5: Collecting posts from Reddit...")

# asset name -> list of text blobs (title + body + leading comments)
all_posts = defaultdict(list)
total_posts = 0

if USE_REDDIT:
    subreddits_to_check = set(q[0] for q in subreddit_queries.values())

    # sorted() only makes the log order reproducible; each asset is tied to a
    # single subreddit, so the collected data does not depend on the order.
    for subreddit_name in sorted(subreddits_to_check):
        print(f"\nFetching from {subreddit_name}...")

        # Lower-cased search phrases for the assets tracked in this subreddit.
        # A bare string is wrapped in a list first: iterating the string
        # directly would test each single character, so virtually every post
        # would be counted for every asset.
        phrases_by_asset = {}
        for asset, (sub, keywords) in subreddit_queries.items():
            if sub != subreddit_name:
                continue
            if isinstance(keywords, str):
                keywords = [keywords]
            phrases_by_asset[asset] = [keyword.lower() for keyword in keywords]

        try:
            subreddit = reddit.subreddit(subreddit_name.replace("r/", ""))

            for submission in subreddit.new(limit=100):
                title = submission.title
                body = submission.selftext if submission.selftext else ""
                comments_text = ""

                try:
                    submission.comments.replace_more(limit=0)
                    # Only the first ten entries of the comment listing are used.
                    for comment in submission.comments[:10]:
                        comments_text += comment.body + " "
                except Exception as comment_error:
                    # Best effort: keep the post (with whatever comments were
                    # read so far) but make the failure visible.
                    print(f"  Warning: could not load comments: {comment_error}")

                content = title + " " + body + " " + comments_text
                content_lower = content.lower()

                for asset, phrases in phrases_by_asset.items():
                    if any(phrase in content_lower for phrase in phrases):
                        all_posts[asset].append(content)

            print(f"  Found posts for multiple assets")
        except Exception as e:
            print(f"  Error: {e}")

    total_posts = sum(len(posts) for posts in all_posts.values())
    print(f"\nTotal posts collected: {total_posts}\n")

else:
    print("connection failed\n")




It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Connected to Reddit successfully!

STEP 1/5: Collecting posts from Reddit...

Fetching from r/stocks...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

  Found posts for multiple assets

Fetching from r/investing...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

  Found posts for multiple assets

Total posts collected: 600



In [None]:

# ========== CLEAN & PREPARE DATA ==========

# Terms counted per post as a rough volatility indicator. Matching is by
# substring on the lower-cased post, so e.g. 'rally' also hits 'generally'.
volatility_keywords = ['crash', 'surge', 'plunge', 'volatile', 'uncertainty',
                       'crisis', 'boom', 'panic', 'rally', 'collapse', 'shock',
                       'extreme', 'risk', 'bull', 'bear']

articles_for_sentiment = []
for asset_name, posts in all_posts.items():
    for post in posts:
        # Posts of 50 characters or fewer are skipped.
        if len(post) <= 50:
            continue

        lowered_post = post.lower()
        articles_for_sentiment.append({
            'asset': asset_name,
            # Keep at most the first 500 whitespace-separated words.
            'content': ' '.join(post.split()[:500]),
            # Number of distinct keywords present anywhere in the full post.
            'volatility_keywords': sum(
                keyword.lower() in lowered_post for keyword in volatility_keywords
            ),
        })

print(f"Total text samples prepared: {len(articles_for_sentiment)}\n")

Total text samples prepared: 600



In [None]:
# ========== FINBERT ==========

# Load the tokenizer and sequence-classification weights for the checkpoint
# named below, then switch the model to evaluation mode.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

# Use the GPU when one is visible to torch; otherwise stay on the CPU.
if not torch.cuda.is_available():
    print("Model loaded on CPU\n")
else:
    model = model.to('cuda')
    print("Model loaded on GPU\n")

# ========== SENTIMENT ANALYSIS ==========
def analyze_sentiment(text):
    """Classify ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to score. Anything that is not a ``str``, or a string
            shorter than 10 characters, is not sent to the model.

    Returns:
        A 5-tuple ``(label, confidence, p_positive, p_negative, p_neutral)``.
        ``label`` is 'positive', 'negative' or 'neutral'; ``confidence`` is
        the probability of the chosen label. For rejected input the result is
        ``('neutral', 0.0, 0.0, 0.0, 0.0)``.
    """
    if not isinstance(text, str) or len(text) < 10:
        return 'neutral', 0.0, 0.0, 0.0, 0.0

    # Truncate by tokens rather than characters: slicing the string to 512
    # characters would discard most of the prepared content before it ever
    # reaches the tokenizer. max_length is passed explicitly so truncation
    # does not depend on the tokenizer's configured default.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Single input, so row 0 holds the class probabilities.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    sentiment_idx = torch.argmax(probs).item()
    confidence = probs[sentiment_idx].item()

    # NOTE(review): index order assumed to match the checkpoint's label
    # mapping (0=positive, 1=negative, 2=neutral) - confirm against
    # model.config.id2label if the checkpoint is ever changed.
    sentiment_map = {0: 'positive', 1: 'negative', 2: 'neutral'}
    return (
        sentiment_map[sentiment_idx],
        confidence,
        probs[0].item(),
        probs[1].item(),
        probs[2].item(),
    )


# Score every prepared sample and store the results on the sample's dict,
# reporting progress after each block of 50 samples.
for position, article in enumerate(articles_for_sentiment, start=1):
    label, confidence, p_positive, p_negative, p_neutral = analyze_sentiment(article['content'])
    article.update({
        'sentiment_label': label,
        'sentiment_confidence': confidence,
        'prob_positive': p_positive,
        'prob_negative': p_negative,
        'prob_neutral': p_neutral,
    })

    if position % 50 == 0:
        print(f"  Analyzed {position}/{len(articles_for_sentiment)} texts")

print(f"Sentiment analysis complete\n")


Model loaded on CPU

  Analyzed 50/600 texts
  Analyzed 100/600 texts
  Analyzed 150/600 texts
  Analyzed 200/600 texts
  Analyzed 250/600 texts
  Analyzed 300/600 texts
  Analyzed 350/600 texts
  Analyzed 400/600 texts
  Analyzed 450/600 texts
  Analyzed 500/600 texts
  Analyzed 550/600 texts
  Analyzed 600/600 texts
Sentiment analysis complete



In [None]:

# ========== STEP 5: AGGREGATE FEATURES ==========

# Column layout of the per-asset feature table. Passing it to the DataFrame
# constructor keeps the schema intact even when no samples were collected,
# so later column look-ups do not raise KeyError.
daily_feature_columns = [
    'Asset', 'Date', 'sentiment_score', 'sentiment_volatility',
    'volatility_signal', 'article_frequency', 'sentiment_confidence',
    'count_positive', 'count_negative', 'total_posts',
]

daily_features = []
# sorted() gives a reproducible row order; iterating a bare set does not.
assets = sorted(set(a['asset'] for a in articles_for_sentiment))
# Computed once so every row of this run carries the same date.
run_date = datetime.now().strftime("%Y-%m-%d")

for asset_name in assets:
    asset_articles = [a for a in articles_for_sentiment if a['asset'] == asset_name]
    # Never zero: asset_name was taken from the samples themselves.
    n_articles = len(asset_articles)

    count_positive = sum(1 for a in asset_articles if a['sentiment_label'] == 'positive')
    count_negative = sum(1 for a in asset_articles if a['sentiment_label'] == 'negative')

    # Net share of positive versus negative labels, in [-1, 1].
    sentiment_score = (count_positive - count_negative) / n_articles
    # Spread of the positive-class probability across the asset's samples.
    sentiment_volatility = np.std([a['prob_positive'] for a in asset_articles]) if n_articles > 1 else 0.0
    # Mean number of volatility keywords found per sample.
    volatility_signal = np.mean([a['volatility_keywords'] for a in asset_articles])
    article_frequency = n_articles
    sentiment_confidence = np.mean([a['sentiment_confidence'] for a in asset_articles])

    daily_features.append({
        'Asset': asset_name,
        'Date': run_date,
        'sentiment_score': sentiment_score,
        'sentiment_volatility': sentiment_volatility,
        'volatility_signal': volatility_signal,
        'article_frequency': article_frequency,
        'sentiment_confidence': sentiment_confidence,
        'count_positive': count_positive,
        'count_negative': count_negative,
        'total_posts': n_articles
    })

df_daily = pd.DataFrame(daily_features, columns=daily_feature_columns)
df_daily.to_csv('daily_llm_features_reddit.csv', index=False)

# Create observation vector
obs_vector = []
asset_order = ["SP_500", "NASDAQ", "Dow_Jones", "Gold", "Silver", "Oil"]
# Per-asset features, in the order they are appended to the vector.
obs_feature_names = [
    'sentiment_score',
    'sentiment_volatility',
    'volatility_signal',
    'article_frequency',
    'sentiment_confidence',
]

# df_daily has no 'Asset' column when it was built from zero samples; treat
# that like "asset missing" instead of raising KeyError.
has_asset_column = 'Asset' in df_daily.columns

for asset in asset_order:
    if has_asset_column:
        asset_data = df_daily[df_daily['Asset'] == asset]
    else:
        asset_data = df_daily.iloc[0:0]

    if len(asset_data) > 0:
        row = asset_data.iloc[0]
        obs_vector.extend([
            row['sentiment_score'],
            row['sentiment_volatility'],
            row['volatility_signal'],
            row['article_frequency'] / 100.0,  # post count scaled down by 100
            row['sentiment_confidence']
        ])
    else:
        # Asset without any sample: zero-fill its five slots.
        obs_vector.extend([0, 0, 0, 0, 0])

# Column names follow "<asset>_<feature>", derived from the same two lists
# that drive the loop above so names and values cannot drift apart.
obs_columns = [
    f"{asset}_{feature}"
    for asset in asset_order
    for feature in obs_feature_names
]
obs_df = pd.DataFrame([obs_vector], columns=obs_columns)

obs_df.to_csv('rl_observation_vectors_llm_reddit.csv', index=False)



# Console recap of the run, followed by the per-asset sentiment table.
print(
    f"\nSamples analyzed: {len(articles_for_sentiment)}",
    f"Assets covered: {len(assets)}",
    f"Output: rl_observation_vectors_llm_reddit.csv",
    "\nSentiment Summary:",
    sep="\n",
)
print(df_daily.loc[:, ['Asset', 'sentiment_score', 'count_positive', 'count_negative', 'total_posts']])




Samples analyzed: 600
Assets covered: 6
Output: rl_observation_vectors_llm_reddit.csv

Sentiment Summary:
       Asset  sentiment_score  count_positive  count_negative  total_posts
0  Dow_Jones             0.00              16              16          100
1        Oil            -0.03               6               9          100
2     Silver            -0.03               6               9          100
3     NASDAQ             0.00              16              16          100
4     SP_500             0.00              16              16          100
5       Gold            -0.03               6               9          100


In [None]:
import pandas as pd
import os

print("VERIFYING OUTPUT FILES")

# Check if files exist
# The second entry is the per-asset feature table; it must match the name
# used when that table is written (daily_llm_features_reddit.csv), otherwise
# the check always reports the file as missing.
files = ['rl_observation_vectors_llm_reddit.csv', 'daily_llm_features_reddit.csv']

for file in files:
    if os.path.exists(file):
        df = pd.read_csv(file)
        print(f"File: {file}")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {list(df.columns)[:5]}...")
        print(f"  Data:\n{df.head(2)}\n")
    else:
        print(f"File NOT found: {file}\n")

# Load the actual RL observation vector
# Reload the observation vector from disk and echo its shape, its column
# names and the values of the first ten columns.
obs = pd.read_csv('rl_observation_vectors_llm_reddit.csv')

print("RL OBSERVATION VECTOR (30 features)")

print(f"\nShape: {obs.shape}")
print(f"Features: {list(obs.columns)}")
print(f"\nFirst 10 features (values):")
for col in list(obs.columns)[:10]:
    first_value = obs[col].values[0]
    print(f"  {col}: {first_value:.4f}")




VERIFYING OUTPUT FILES
File: rl_observation_vectors_llm_reddit.csv
  Shape: (1, 30)
  Columns: ['SP_500_sentiment_score', 'SP_500_sentiment_volatility', 'SP_500_volatility_signal', 'SP_500_article_frequency', 'SP_500_sentiment_confidence']...
  Data:
   SP_500_sentiment_score  SP_500_sentiment_volatility  \
0                     0.0                     0.284853   

   SP_500_volatility_signal  SP_500_article_frequency  \
0                      1.33                       1.0   

   SP_500_sentiment_confidence  NASDAQ_sentiment_score  \
0                     0.827269                     0.0   

   NASDAQ_sentiment_volatility  NASDAQ_volatility_signal  \
0                     0.284853                      1.33   

   NASDAQ_article_frequency  NASDAQ_sentiment_confidence  ...  \
0                       1.0                     0.827269  ...   

   Silver_sentiment_score  Silver_sentiment_volatility  \
0                   -0.03                     0.189641   

   Silver_volatility_signal  Si