In [14]:
from dotenv import load_dotenv
load_dotenv()
import os
import logging
from src.extractors.bigquery_extractor import BigQueryExtractor
def setup_environment(test_mode=True):
    """Set up the environment for testing."""
    # Set environment variables for testing if they're not set
    if 'GOOGLE_CLOUD_PROJECT_ID' not in os.environ:
        os.environ['GOOGLE_CLOUD_PROJECT_ID'] = 'alex-stocks'
    
    if 'BIGQUERY_DATASET' not in os.environ:
        os.environ['BIGQUERY_DATASET'] = 'reddit_stock_test'


    

    # Create a test state collection to avoid affecting production timestamps
    os.environ['FIRESTORE_STATE_COLLECTION'] = 'pipeline_state_test'
    logger.info("Using test state collection: pipeline_state_test")

def setup_bigquery():
    """Set up BigQuery tables."""
    from src.utils.bigquery_utils import BigQueryManager
    bq_manager = BigQueryManager()
    bq_manager.setup_tables()
    return bq_manager

    
def check_environment():
    """Check if all required environment variables are set."""
    required_vars = [
        'GOOGLE_CLOUD_PROJECT_ID',
        'BIGQUERY_DATASET'
    ]
    
    missing_vars = []
    for var in required_vars:
        if not os.getenv(var):
            missing_vars.append(var)
    
    if missing_vars:
        logger.error(f"Missing required environment variables: {', '.join(missing_vars)}")
        return False
    
    return True
    
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [13]:

setup_environment()

# Check environment variables
if not check_environment():
    logger.error("Environment check failed, exiting")
    sys.exit(1)
    

2025-04-11 18:32:17,611 - __main__ - INFO - Using test state collection: pipeline_state_test


In [26]:
extractor = BigQueryExtractor()
df = extractor.get_reddit_data(limit=1000)


2025-04-11 19:04:23,001 - src.extractors.bigquery_extractor - INFO - Fetching Reddit data from BigQuery for stock analysis
2025-04-11 19:04:23,003 - src.extractors.bigquery_extractor - INFO - Initial run: Fetching all Reddit data from BigQuery (no time filter)
2025-04-11 19:04:26,313 - src.extractors.bigquery_extractor - INFO - Retrieved 1000 deduplicated Reddit posts/comments from BigQuery


In [158]:
df[df['content'].str.contains('%', case=False)]

Unnamed: 0,message_id,content,author,created_at,subreddit,title,url,score,message_type
1,mmky2yy,We are one tweet away from being -4%. Seems li...,furrypurpledinosaur,2025-04-11 15:51:12+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,5,REDDIT_COMMENT
23,mmkxgxh,"TSLA -3% on a flat day, can't even be mad",Aivoke_art,2025-04-11 15:48:14+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,15,REDDIT_COMMENT
40,mmkx64h,"Dang, yall aint got no more of those 5% days?\...",Damerman,2025-04-11 15:46:49+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,12,REDDIT_COMMENT
48,mmkx1o7,5% SPY day!!!!!! Surely!!!,TurbodToilet,2025-04-11 15:46:13+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,7,REDDIT_COMMENT
53,mmkwzad,"/investing be like, ""You should be 100% invest...",falling_knives,2025-04-11 15:45:54+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,22,REDDIT_COMMENT
...,...,...,...,...,...,...,...,...,...
889,mmknaq8,"The gambler's delusional mind.\n\n""I am in con...",Happy_Discussion_536,2025-04-11 14:58:31+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,8,REDDIT_COMMENT
896,mmkn8ob,Update: I have reinvested the $00.56 interest ...,qui_tam_gogh,2025-04-11 14:58:14+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,5,REDDIT_COMMENT
897,mmkn8ha,pls move I need a -10% day ![img](emote|t5_2th...,leandrostonks,2025-04-11 14:58:12+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,10,REDDIT_COMMENT
962,mmkmks9,Consumer sentiment gonna hit 0% and this marke...,weewoowewoooo,2025-04-11 14:54:59+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,12,REDDIT_COMMENT


In [180]:
from src.utils import stock_analyzer
import importlib
importlib.reload(stock_analyzer)

analyzer = stock_analyzer.StockAnalyzer()


Device set to use cpu
2025-04-11 21:04:36,319 - src.utils.stock_analyzer - INFO - Loading stock tickers
2025-04-11 21:04:36,913 - src.utils.stock_analyzer - INFO - Loading stock tickers from BigQuery
2025-04-11 21:04:38,350 - src.utils.stock_analyzer - INFO - Loaded 1000 stock tickers from BigQuery


In [167]:
index_test = 23
row = df.sort_values('score', ascending=False).loc[index_test]
text= df.sort_values('score', ascending=False).loc[index_test].content
mentioned_tickers = analyzer.extract_stock_mentions(text)

In [168]:
texts = []
scores = []
ticker_contexts = []
tickers_to_analyze = []

for ticker in mentioned_tickers:
    context = analyzer.extract_ticker_context(text, ticker, window_size=100) or text[:500]
    texts.append(context[:512])
    scores.append(row['score'] if isinstance(row['score'], (int, float, np.int64)) else 0)
    ticker_contexts.append(context)
    tickers_to_analyze.append((row, ticker))  # store row and ticker together
    

In [169]:
analyzer.extract_ticker_context(text, ticker, window_size=20)

"TSLA -3% on a flat day, can't even be mad"

In [170]:
texts[0]

"TSLA -3% on a flat day, can't even be mad"

In [171]:
analyzer.extract_price_and_percent_signals(texts[0])

['PT:3.0', 'CHANGE:-3.0%']

In [143]:
scores

[8]

In [144]:
sentiments = analyzer.analyze_sentiment_batch(texts, scores)


In [145]:
 results = analyzer.sentiment_pipeline(texts, batch_size=16)

In [146]:
sentiments

[{'compound': -0.963,
  'positive': 0.018608255311846733,
  'negative': 0.9813917875289917,
  'neutral': 0.0,
  'confidence': 0.74}]

In [147]:
output = []
for i, res in enumerate(results):
    # Handle case where res is a list of dictionaries (newer pipeline format)
    if isinstance(res, list):
        # Find the highest scoring sentiment
        highest_score = 0
        label = "NEUTRAL"
        conf = 0.0
        
        for item in res:
            if 'score' in item and item['score'] > highest_score:
                highest_score = item['score']
                label = item['label']
                conf = item['score']
    else:
        # Handle the original format where res is a dictionary
        label = res['label']
        conf = res['score']

    sentiment = {
        "compound": round(conf if label == "POSITIVE" else -conf if label == "NEGATIVE" else 0.0, 3),
        "positive": conf if label == "POSITIVE" else 0.0,
        "negative": conf if label == "NEGATIVE" else 0.0,
        "neutral": conf if label == "NEUTRAL" else 0.0,
    }

    confidence = analyzer.calc_confidence_score(sentiment['compound'], scores[i])
    output.append({
        **sentiment,
        "confidence": confidence
    })


In [148]:
ticker_contexts

["Let's test SPY to 500$ just to see"]

In [157]:
analyzer.extract_signals_regex("Let's test SPY to 50.12 just to see", 'SPY'),

([],)

In [172]:
test_df = df[df['content'].str.contains('%', case=False)]

In [177]:
test_df['score']

1       5
23     15
40     12
48      7
53     22
       ..
889     8
896     5
897    10
962    12
991     5
Name: score, Length: 80, dtype: int64

In [191]:
test_df

Unnamed: 0,message_id,content,author,created_at,subreddit,title,url,score,message_type
1,mmky2yy,We are one tweet away from being -4%. Seems li...,furrypurpledinosaur,2025-04-11 15:51:12+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,5,REDDIT_COMMENT
23,mmkxgxh,"TSLA -3% on a flat day, can't even be mad",Aivoke_art,2025-04-11 15:48:14+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,15,REDDIT_COMMENT
40,mmkx64h,"Dang, yall aint got no more of those 5% days?\...",Damerman,2025-04-11 15:46:49+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,12,REDDIT_COMMENT
48,mmkx1o7,5% SPY day!!!!!! Surely!!!,TurbodToilet,2025-04-11 15:46:13+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,7,REDDIT_COMMENT
53,mmkwzad,"/investing be like, ""You should be 100% invest...",falling_knives,2025-04-11 15:45:54+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,22,REDDIT_COMMENT
...,...,...,...,...,...,...,...,...,...
889,mmknaq8,"The gambler's delusional mind.\n\n""I am in con...",Happy_Discussion_536,2025-04-11 14:58:31+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,8,REDDIT_COMMENT
896,mmkn8ob,Update: I have reinvested the $00.56 interest ...,qui_tam_gogh,2025-04-11 14:58:14+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,5,REDDIT_COMMENT
897,mmkn8ha,pls move I need a -10% day ![img](emote|t5_2th...,leandrostonks,2025-04-11 14:58:12+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,10,REDDIT_COMMENT
962,mmkmks9,Consumer sentiment gonna hit 0% and this marke...,weewoowewoooo,2025-04-11 14:54:59+00:00,wallstreetbets,,https://reddit.com/r/wallstreetbets/comments/1...,12,REDDIT_COMMENT


In [195]:
stock_mentions = []
results = analyzer._process_batch(df)
# Flatten results list
stock_mentions.extend(results)


2025-04-11 21:11:19,432 - src.utils.stock_analyzer - INFO - Processed batch and found 368 stock mentions


NameError: name 'batches' is not defined

In [196]:
stock_mentions

[StockMention(message_id='mmkyebj', ticker='IMG', author='bggie_G', created_at=Timestamp('2025-04-11 15:52:41+0000', tz='UTC'), subreddit='wallstreetbets', url='https://reddit.com/r/wallstreetbets/comments/1jwm55t/daily_discussion_thread_for_april_11_2025/mmkyebj/', score=13, message_type='REDDIT_COMMENT', sentiment_compound=-0.998, sentiment_positive=0.0008471508044749498, sentiment_negative=0.9991528987884521, sentiment_neutral=0.0, signals=['PT:5.0'], context='my biggest nightmare is mango saying sorry and drop all tariffs ![img](emote|t5_2th52|4260)', confidence=0.78, etl_timestamp=datetime.datetime(2025, 4, 11, 18, 11, 10, 771002)),
 StockMention(message_id='mmkxtkr', ticker='IMG', author='Reasonable-Big4517', created_at=Timestamp('2025-04-11 15:49:57+0000', tz='UTC'), subreddit='wallstreetbets', url='https://reddit.com/r/wallstreetbets/comments/1jwm55t/daily_discussion_thread_for_april_11_2025/mmkxtkr/', score=8, message_type='REDDIT_COMMENT', sentiment_compound=-0.99, sentiment_

In [188]:
stock_mentions.extend(results)

In [197]:
len(stock_mentions)

368

In [198]:
import os

In [199]:
num_processes = min(os.cpu_count(), len([1,2]))


In [200]:
num_processes

2

In [201]:
os.cpu_count()

8