# Business Logic
---
This is an experiment in natural language processing in the context of finance. Using tools like FinBERT and LDA, we leverage language models to inform people about what is going on in the world of a specific company. The workflow extracts the articles for a given stock and
reuses the methodology shown here for the analysis.
---


# Importing Libraries

In [None]:
!pip install feedparser



In [None]:
import yfinance as yf
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer
import feedparser
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import torch

# Text extraction

In [None]:
class YahooFinanceFullArticleScraper:
    """
    Extracts full article content from Yahoo Finance RSS feeds.

    The RSS feed is used only for article discovery (title, link, summary,
    publication date). The body of each article is then downloaded from its
    own URL and reduced to plain text.
    """

    # CSS selectors tried in order; the first one that matches an element wins.
    # The last two are broad substring matches kept as a last resort.
    _CONTENT_SELECTORS = (
        'article',
        '[data-module="ArticleBody"]',
        '.caas-body',
        '.article-body',
        '[class*="article"]',
        '[class*="content"]',
    )

    # Tags removed from the selected element before its text is extracted.
    _NON_CONTENT_TAGS = ["script", "style", "nav", "footer", "header"]

    def __init__(self):
        # "{}" is filled with the upper-cased ticker symbol.
        self.base_rss_url = "https://feeds.finance.yahoo.com/rss/2.0/headline?s={}&region=US&lang=en-US"
        # Browser-like User-Agent sent with every article request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    @staticmethod
    def _parse_published_datetime(entry):
        """
        Convert an RSS entry's ``published_parsed`` time tuple to a datetime.

        Args:
            entry: Feed entry; only its ``published_parsed`` attribute is read.

        Returns:
            datetime or None: Datetime built from the first six fields
            (year through second) with no timezone attached, or None when the
            field is missing, empty or does not describe a valid date.
        """
        parsed = getattr(entry, 'published_parsed', None)
        if not parsed:
            return None
        try:
            return datetime(*parsed[:6])
        except (TypeError, ValueError, OverflowError):
            # Malformed feed dates are tolerated; the raw 'published' string
            # is still stored alongside by the caller.
            return None

    def _extract_article_text(self, url):
        """
        Download an article page and return its visible text.

        Args:
            url (str): Article URL taken from the RSS entry.

        Returns:
            str or None: Whitespace-normalised text of the first element
            matched by the content selectors (falling back to <main> or
            <article>), or None if the download fails, no container is
            found, or parsing raises.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            article_content = None
            for selector in self._CONTENT_SELECTORS:
                article_content = soup.select_one(selector)
                if article_content:
                    break

            if not article_content:
                # Fallback: find main content area
                article_content = soup.find('main') or soup.find('article')

            if not article_content:
                return None

            # Drop markup that never carries article prose
            for element in article_content(self._NON_CONTENT_TAGS):
                element.decompose()

            text = article_content.get_text(separator=' ', strip=True)
            # Collapse runs of whitespace into single spaces
            return re.sub(r'\s+', ' ', text).strip()

        except Exception:
            # Deliberate best-effort: one unreachable or unparsable page must
            # not abort the whole scrape, so the caller just sees "no text".
            return None

    def get_full_articles_for_ticker(self, ticker, max_articles=10, verbose=False, min_words=150):
        """
        Get full article content for a ticker symbol

        Feed entries are processed in feed order until ``max_articles``
        articles with more than ``min_words`` words have been collected.

        Args:
            ticker (str): Stock ticker symbol
            max_articles (int): Maximum number of articles to return
            verbose (bool): Print progress messages
            min_words (int): Articles whose extracted text has this many
                words or fewer are skipped (the threshold is exclusive).
                Articles whose text could not be extracted count as 0 words.

        Returns:
            list: List of article dicts with keys 'ticker', 'title', 'link',
            'rss_description', 'published', 'published_datetime', 'guid',
            'full_text', 'word_count' and 'has_full_text'. If an error occurs
            part-way through, the articles collected so far are returned.
        """
        articles = []

        if verbose:
            print(f"Fetching RSS feed for {ticker}...")

        try:
            symbol = ticker.upper()
            feed = feedparser.parse(self.base_rss_url.format(symbol))

            if not feed.entries:
                if verbose:
                    print(f"  ‚ö† No articles found for {ticker}")
                return articles

            if verbose:
                print(f"Found {len(feed.entries)} articles in RSS feed")
                print(f"Fetching full content (filtering for articles >{min_words} words)..\n")

            articles_processed = 0
            articles_skipped = 0

            for entry in feed.entries:
                # Stop if we have enough articles
                if len(articles) >= max_articles:
                    break

                articles_processed += 1
                article_url = entry.get('link', '').strip()
                title = entry.get('title', '').strip()

                # Entries without a link count as processed but are neither
                # fetched nor reported as skipped.
                if not article_url:
                    continue

                if verbose:
                    title_short = title[:60] + "..." if len(title) > 60 else title
                    print(f"[{articles_processed}] Fetching: {title_short}...")

                full_text = self._extract_article_text(article_url)
                word_count = len(full_text.split()) if full_text else 0

                if word_count <= min_words:
                    articles_skipped += 1
                    if verbose:
                        print(f"    ‚ö† Skipped: {word_count} words (minimum {min_words} required)")
                    continue

                articles.append({
                    'ticker': symbol,
                    'title': title,
                    'link': article_url,
                    'rss_description': entry.get('summary', '').strip(),
                    'published': entry.get('published', ''),
                    'published_datetime': self._parse_published_datetime(entry),
                    'guid': entry.get('guid', ''),
                    'full_text': full_text or '',
                    'word_count': word_count,
                    'has_full_text': bool(full_text),
                })

                if verbose and full_text:
                    print(f"    ‚úì Retrieved {word_count} words")
                elif verbose:
                    # Only reachable when min_words is negative, i.e. when
                    # articles without extracted text are allowed through.
                    print(f"    ‚ö† Could not extract content")

            # Summary
            if verbose:
                print(f"\nüìä Summary:")
                print(f"   Articles processed: {articles_processed}")
                print(f"   Articles skipped (<{min_words} words): {articles_skipped}")
                print(f"   Articles returned: {len(articles)}")

            return articles

        except Exception as e:
            # Best-effort boundary: report the problem and hand back whatever
            # was collected before it occurred.
            if verbose:
                print(f"  ‚úó Error: {e}")
            return articles

# One scraper instance is shared by every cell below
scraper = YahooFinanceFullArticleScraper()

# Fetch up to five full-text articles for Apple.
# 'articles_for_nlp' becomes a list of dictionaries, one per article, each
# carrying the extracted body under the 'full_text' key.
articles_for_nlp = scraper.get_full_articles_for_ticker(
    ticker='AAPL',
    max_articles=5,
    verbose=False,
)

In [None]:
# Display the extracted text of the first scraped article as a sanity check
articles_for_nlp[0]['full_text']

"Apple, Nvidia, Verizon, IBM, & more: 2025's stock pick standouts Yahoo Finance Video Wed, December 3, 2025 at 5:35 PM EST AAPL IONQ BAC JPM NVDA Apple ( AAPL ), Nvidia ( NVDA ), Amazon ( AMZN ), and Microsoft ( MSFT ) are just some of the stocks in focus as Yahoo Finance Data and Markets Editor Jared Blikre and Trader Talk host Kenny Polcari take a closer look at the standout names of 2025. Watch the video above to hear more top stock picks and key investment themes. To watch more expert insights and analysis on the latest market action, check out more Market Domination Overtime . Video Transcript 00:00 Speaker A Kenny, I love breaking down tickers with you and we can do so in line with some of the biggest themes this year. And uh we've done this in some other segments here. So I want to hit some tickers that we haven't and this is Apple. This is year to date and what I want to point out here is Apple was really late to the party off of those April 8th low. Finally got started around 

In [None]:
# Display the extracted text of the fifth scraped article as a sanity check
articles_for_nlp[4]['full_text']

'Demand for the iPhone 17 is propelling the outlook for the whole smartphone sector. Annice Lyn / Getty Images Close Key Takeaways Global smartphone shipments are expected to be 1.5% higher in 2025, up from the previous prediction of a 1% gain, thanks to demand for Apple\'s iPhone 17, according to tech information provider International Data Corporation. IDC said Apple will post a record number of shipments this year because of the success of the iPhone 17, led by demand in China. Apple shares hit a new all-time high Wednesday morning. A new report predicts worldwide smartphone shipments for 2025 will be up 1.5% from 2024, driven by a record-setting performance by Apple ( AAPL ). The Worldwide Quarterly Mobile Phone Tracker from tech information provider International Data Corporation raised its previous outlook of a 1% gain, primarily on the expected soaring demand for Apple‚Äôs new iPhone 17 this holiday season. Nabila Popal, senior research director with the study, said that it‚Äôs 

## Model load

### FinBERT

In [None]:
# Identifier of the FinBERT checkpoint used for sentiment scoring
model_name = "ProsusAI/finbert"
# Tokenizer and classification model are loaded from the same checkpoint so
# that vocabulary and weights match
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

# Sentiment analysis

### AAPL

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[0]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Apple, Nvidia, Verizon, IBM, & more: 2025's stock pick standouts Yahoo Finance Video Wed, December 3, 2025 at 5:35 PM EST AAPL IONQ BAC JPM NVDA Apple ( AAPL ), Nvidia ( NVDA ), Amazon ( AMZN ), and Microsoft ( MSFT ) are just some of the stocks in focus as Yahoo Finance Data and Markets Editor Jared Blikre and Trader Talk host Kenny Polcari take a closer look at the standout names of 2025. Watch the video above to hear more top stock picks and key investment themes. To watch more expert insights and analysis on the latest market action, check out more Market Domination Overtime . Video Transcript 00:00 Speaker A Kenny, I love breaking down tickers with you and we can do so in line with some of the biggest themes this year. And uh we've done this in some other segments here. So I want to hit some tickers that we haven't and this is Apple. This is year to date and what I want to point out here is Apple was really late to the party off of those April 8th low. Finally got st

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[1]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Apple's stock is hitting new highs: How to play it with options Yahoo Finance Video and Josh Lipton Wed, December 3, 2025 at 4:26 PM EST AAPL 2025 has been a wild year for Apple ( AAPL ) investors. The stock was out of favor earlier in the year, with investors believing it was lagging in AI. The stock has turned around, though, and is now up about 40% over the last six months, starting to hit new all-time highs at the end of November. StockBrokers.com Director of Investor Research Jessica Inskip explains the trade and why it's a good one for a first-time options investor. To watch more expert insights and analysis on the latest market action, check out more Market Domination . Video Transcript 00:00 Speaker A Apple shares on track here to snap a seven-day wind streak, but this under the radar rally has some investors looking for ways to play the tech giant. Join me now, we've got Jessica Inskip, stockbrokers.com, Director of investor research in the options pit sponsored 

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[2]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Apple Design Executive Alan Dye Poached by Meta in Major Coup Mark Gurman Wed, December 3, 2025 at 4:09 PM EST 3 min read AAPL META (Bloomberg) -- Meta Platforms Inc. has poached Apple Inc.‚Äôs most prominent design executive in a major coup that underscores a push by the social networking giant into AI-equipped consumer devices. The company is hiring Alan Dye, who has served as the head of Apple‚Äôs user interface design team since 2015, according to people with knowledge of the matter. Apple is replacing Dye with longtime designer Stephen Lemay, according to the people, who asked not to be identified because the personnel changes haven‚Äôt been announced. Most Read from Bloomberg Steve Cohen, Bally‚Äôs, Genting Picked to Run Casinos in NYC Wealthy New Jersey Town‚Äôs Vote on Fixing School Deficit Canceled Alan Dye, attending a GQ event last month, is leaving Apple for Meta.Photographer: Stefanie Keenan/Getty Images Apple confirmed the move in a statement provided to Blo

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[3]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: AI bubble talk isn't constructive as trade is in 'early innings' Yahoo Finance Video and Josh Lipton Wed, December 3, 2025 at 3:47 PM EST AAPL NVDA GOOG ORCL ^DJI Wall Street has gone back and forth over whether the market ( ^DJI , ^IXIC , ^GSPC ) is in an AI bubble or not. Laffer Tengler Investments CEO and CIO Nancy Tengler just doesn't see the market being in a bubble yet. Tengler speaks with Josh Lipton on Market Domination about the difference between the current AI boom and the dot-com crash of the 1990s, and how she expects to invest in the AI trade amid shifting sentiments on the ecosystem's top contenders. To watch more expert insights and analysis on the latest market action, check out more Market Domination . Video Transcript 00:00 Jon Nancy, always great to see you, especially on set. 00:02 Nancy I know. 00:02 Jon Uh, let's start with the great AI bubble debate, because I know you push back on that. Why do you push back, Nancy? 00:09 Nancy So Jon, I mean, list

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[4]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Demand for the iPhone 17 is propelling the outlook for the whole smartphone sector. Annice Lyn / Getty Images Close Key Takeaways Global smartphone shipments are expected to be 1.5% higher in 2025, up from the previous prediction of a 1% gain, thanks to demand for Apple's iPhone 17, according to tech information provider International Data Corporation. IDC said Apple will post a record number of shipments this year because of the success of the iPhone 17, led by demand in China. Apple shares hit a new all-time high Wednesday morning. A new report predicts worldwide smartphone shipments for 2025 will be up 1.5% from 2024, driven by a record-setting performance by Apple ( AAPL ). The Worldwide Quarterly Mobile Phone Tracker from tech information provider International Data Corporation raised its previous outlook of a 1% gain, primarily on the expected soaring demand for Apple‚Äôs new iPhone 17 this holiday season. Nabila Popal, senior research director with the study, said 

### Bitcoin (BTC-USD)

In [None]:
# Rebinds articles_for_nlp: from this cell on it holds the BTC-USD articles
# (up to five) instead of the AAPL ones fetched earlier
articles_for_nlp = scraper.get_full_articles_for_ticker('BTC-USD', max_articles=5, verbose=False)

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[0]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Coin Prices BTC $93,394.00 1.60% ETH $3,211.59 6.55% XRP $2.20 2.16% BNB $925.19 4.84% SOL $145.78 5.38% USDC $0.999804 -0.01% STETH $3,209.19 6.60% TRX $0.280106 -0.33% DOGE $0.152031 4.13% ADA $0.451444 4.06% FIGR_HELOC $1.025 2.28% WBT $62.77 1.73% WSTETH $3,916.31 6.55% BCH $591.42 7.58% WBTC $93,363.00 1.84% WBETH $3,478.27 6.51% LINK $14.79 8.67% HYPE $35.63 6.02% USDS $0.999832 -0.00% WETH $3,209.65 6.52% BSC-USD $1.00 0.01% LEO $9.59 1.12% XLM $0.25727 1.13% WEETH $3,475.67 6.58% XMR $405.93 1.94% USDE $1.005 0.54% CBBTC $93,418.00 1.64% LTC $86.31 3.65% SUI $1.70 4.33% AVAX $14.77 7.85% HBAR $0.145412 0.82% ZEC $348.61 14.42% SHIB $0.00000898 5.57% WLFI $0.1619 1.44% SUSDS $1.069 -0.94% CRO $0.110395 2.11% TON $1.65 4.88% UNI $6.14 3.33% DOT $2.36 4.14% PYUSD $0.999889 -0.01% SUSDE $1.21 0.28% USDT0 $1.00 -0.02% MNT $1.10 6.95% AAVE $197.62 3.69% TAO $300.34 8.75% CC $0.078151 1.23% USD1 $0.999112 -0.02% BGB $3.60 -0.44% NEAR $1.86 3.49% BUIDL $1.00 0.00% M $1.36

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[1]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: 3 Ways to Trade Bitcoin‚Äôs Big Comeback While Hedging Against a Permanent Crypto Winter Rob Isbitts - Barchart - Wed Dec 3, 1:37PM CST Columnist All information and data in this article is solely for informational purposes. For more information please view the Barchart Disclosure Policy here Share Bitcoin and cash by David McBee via Pexels I hope you‚Äôre up for some math. Because when a market event like the recent slide in Bitcoin (BTCUSD) , coupled with Tuesday‚Äôs attempted ‚Äúbuy the dip‚Äù rally occurs, my number-crunching side really takes over. And for good reason, I think. Because these are the times when the math of risk management couples with the apparent never-say-die attitude of cryptocurrency traders. The result is some of the best reward/risk tradeoffs we‚Äôll see this year. Or next. Why? Because the math makes it so. Specifically, the potential for a deep selloff in a stock or ETF to create a bigger percentage gain required to recover its old high. So if

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[2]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Cocoa Prices Settle Mixed on Forex Movements Rich Asplund - Barchart - Wed Dec 3, 1:30PM CST Columnist All information and data in this article is solely for informational purposes. For more information please view the Barchart Disclosure Policy here Share Bar of chocolate by Karandaev via iStock Exclusive offer! Open & fund a Plus500 futures account & trade to get a FREE 1-year Barchart Premier subscription March ICE NY cocoa ( CCH26 ) on Wednesday closed up +49 (+0.90%), and March ICE London cocoa #7 ( CAH26 ) closed down -28 (-0.69%). Cocoa prices settled mixed on Wednesday. Currency fluctuations on Wednesday affected cocoa prices. NY cocoa rose on Wednesday amid a weaker dollar, as the dollar index ( DXY00 ) tumbled to a 5-week low. However, London cocoa was under pressure on Wednesday after the British pound ( ^GBPUSD ) rallied to a 5-week high, undercutting cocoa priced in sterling. Cocoa prices are being undercut by generally favorable weather in West Africa, which

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[3]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Bitcoin breaks back above $92K, Pure Storage stock sinks Yahoo Finance Video and Josh Lipton Wed, December 3, 2025 at 2:08 PM EST BTC-USD PSTG ^GSPC ^DJI ^IXIC Yahoo Finance host Josh Lipton tracks today's top moving stocks and biggest market stories in this Market Minute, including US stocks ( ^DJI , ^IXIC , ^GSPC ) holding onto gain in Wednesday's session, bitcoin ( BTC-USD ) breaks back above $92,000 per token, and Pure Storage ( PSTG ) shares sinking on the company's guidance. Stay up to date on the latest market action, minute-by-minute, with Yahoo Finance's Market Minute. Video Transcript 00:01 Speaker A It's time for Yahoo Finance's Market Minute. US stocks shaking off early losses as traders move past the decline in private sector employment, while doubts over AI demand put pressure on tech, but small cap stocks shining as investors increase bets on a Fed rate cut in December. 00:15 Speaker A And Bitcoin is able to break above 92,000. That is a key level traders h

In [None]:
# Score one article with FinBERT, using the full scraped article body
article_text = articles_for_nlp[4]['full_text']

# Tokenize the text. BERT models accept at most 512 positions, so the cap is
# stated explicitly instead of relying on the tokenizer's configured default.
inputs = tokenizer(article_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probabilities = torch.softmax(outputs.logits, dim=1)

# The batch holds a single article, so the flat argmax is the class index
predicted_class_id = probabilities.argmax().item()
sentiment = model.config.id2label[predicted_class_id]

print(f"Article Text: {article_text}")
print(f"Sentiment Probabilities: {probabilities}")
print(f"Predicted Sentiment: {sentiment}")

Article Text: Bitcoin Hits Two-Week High in Cautious Crypto Market Recovery Suvashree Ghosh and Emily Nicolle Wed, December 3, 2025 at 1:29 PM EST 3 min read MSTR STRC STRD STRF STRK (Bloomberg) -- Bitcoin extended a tentative rebound on Wednesday, climbing to a two-week high as traders look for signs that the wider crypto market may be regaining its footing after a prolonged selloff. The original cryptocurrency rose as much as 2.6% to about $93,965, its highest intraday level since Nov. 17. Ether gained more, rising more than 4%, as Ethereum undergoes a network upgrade called Fusaka that aims to make the blockchain faster and more efficient. Other smaller tokens were mixed. Most Read from Bloomberg Steve Cohen, Bally‚Äôs, Genting Picked to Run Casinos in NYC Wealthy New Jersey Town‚Äôs Vote on Fixing School Deficit Canceled The digital assets market remains on shaky ground after a bruising selloff that began in early October, just days after Bitcoin hit a record of over $126,000. Sinc

# LDA

## Apple (AAPL)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

# Make sure every NLTK resource used below is available locally, downloading
# only those that are missing. Each pair is (lookup path, download package).
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
)

for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

class StockArticleLDA:
    """
    LDA Topic Extraction for Stock News Articles
    """

    def __init__(self, n_topics=3, max_iter=50, random_state=42):
        """
        Store the LDA settings and build the text-cleaning helpers.

        Parameters:
        -----------
        n_topics : int, default=3
            Number of topics to extract (recommended 2-4 for 5 articles)
        max_iter : int, default=50
            Maximum iterations for LDA
        random_state : int, default=42
            Random state for reproducibility
        """
        # Settings handed to the LDA estimator when fit_lda() runs
        self.n_topics = n_topics
        self.max_iter = max_iter
        self.random_state = random_state

        # Filled in by fit_lda(); None means "not fitted yet"
        self.vectorizer = None
        self.lda_model = None
        self.feature_names = None

        self.lemmatizer = WordNetLemmatizer()

        # Words frequent in financial news that carry little topical signal
        domain_stopwords = {
            'said', 'say', 'says', 'company', 'companies', 'stock', 'stocks',
            'share', 'shares', 'market', 'markets', 'new', 'also', 'would',
            'could', 'may', 'might', 'one', 'two', 'first', 'last', 'year',
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
        }
        # General English stopwords plus the finance-specific additions
        self.stop_words = set(stopwords.words('english')) | domain_stopwords

    def preprocess_text(self, text):
        """
        Clean, tokenize and lemmatize one raw article.

        Steps: lowercase, strip URLs and e-mail addresses, replace every
        non-letter character with a space, tokenize, then keep tokens of at
        least three characters that are not stopwords. The stopword check is
        applied both to the raw token and to its lemma.

        Parameters:
        -----------
        text : str
            Raw article text

        Returns:
        --------
        str : Cleaned and lemmatized text, tokens joined by single spaces
        """
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Replace (not delete) digits and special characters with a space.
        # Deleting them fuses neighbouring words ("reward/risk" -> "rewardrisk")
        # and turns contractions into non-stopwords ("isn't" -> "isnt"); with a
        # space the pieces stay separate and the fragments are filtered below.
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)

        cleaned_tokens = []
        for token in word_tokenize(text):
            if len(token) < 3 or token in self.stop_words:
                continue

            # Verb pass first, then noun pass on the result
            lemma = self.lemmatizer.lemmatize(token, pos='v')
            lemma = self.lemmatizer.lemmatize(lemma, pos='n')

            # Inflected forms of stopwords ("years", "saying") only match the
            # stop list once reduced to their lemma ("year", "say").
            if lemma in self.stop_words:
                continue

            cleaned_tokens.append(lemma)

        return ' '.join(cleaned_tokens)

    def extract_articles(self, articles_dict, n_articles=5):
        """
        Extract and preprocess articles from dictionary

        Parameters:
        -----------
        articles_dict : dict
            Dictionary containing articles with 'full_text' key
        n_articles : int, default=5
            Number of articles to extract

        Returns:
        --------
        list : List of preprocessed article texts
        """
        preprocessed_articles = []

        for i in range(n_articles):
            try:
                raw_text = articles_dict[i]['full_text']
                cleaned_text = self.preprocess_text(raw_text)
                preprocessed_articles.append(cleaned_text)
                print(f"‚úì Article {i} preprocessed ({len(cleaned_text)} characters)")
            except (KeyError, IndexError) as e:
                print(f"‚ö† Warning: Could not process article {i}: {e}")
                continue

        return preprocessed_articles

    def fit_lda(self, articles, use_tfidf=True, max_features=1000, min_df=1, max_df=0.95):
        """
        Vectorize the given articles and train the LDA topic model on them

        Parameters:
        -----------
        articles : list
            List of preprocessed article texts
        use_tfidf : bool, default=True
            True builds a TfidfVectorizer, False builds a CountVectorizer
        max_features : int, default=1000
            Vocabulary size cap handed to the vectorizer
        min_df : int/float, default=1
            Minimum document frequency handed to the vectorizer
        max_df : float, default=0.95
            Maximum document frequency handed to the vectorizer

        Returns:
        --------
        self : The same instance, so calls can be chained

        Raises:
        -------
        ValueError : If fewer than 2 articles are supplied

        Notes:
        ------
        Stores the vectorizer, its feature names and the LDA model on the
        instance, and prints progress messages plus the fitted model's
        score and perplexity on the training matrix.
        """
        article_count = len(articles)
        if article_count < 2:
            raise ValueError("Need at least 2 articles for LDA. Found: {}".format(article_count))

        print(f"\nüìä Vectorizing {article_count} articles...")

        # Both vectorizer classes are built with the same arguments;
        # ngram_range=(1, 2) requests unigrams and bigrams.
        vectorizer_cls = TfidfVectorizer if use_tfidf else CountVectorizer
        self.vectorizer = vectorizer_cls(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2)
        )

        # Learn the vocabulary and build the document-term matrix in one pass
        doc_term_matrix = self.vectorizer.fit_transform(articles)
        self.feature_names = self.vectorizer.get_feature_names_out()

        vocabulary_size = len(self.feature_names)
        print(f"‚úì Document-term matrix shape: {doc_term_matrix.shape}")
        print(f"‚úì Vocabulary size: {vocabulary_size}")

        print(f"\nüîç Fitting LDA with {self.n_topics} topics...")

        # The model is stored on the instance before fitting, then fitted in place
        lda = LatentDirichletAllocation(
            n_components=self.n_topics,
            max_iter=self.max_iter,
            random_state=self.random_state,
            learning_method='batch',
            n_jobs=-1
        )
        self.lda_model = lda
        lda.fit(doc_term_matrix)

        print(f"‚úì LDA model fitted successfully")

        # Metrics are evaluated on the same matrix the model was trained on
        log_likelihood = self.lda_model.score(doc_term_matrix)
        print(f"‚úì Log-likelihood: {log_likelihood:.2f}")
        perplexity = self.lda_model.perplexity(doc_term_matrix)
        print(f"‚úì Perplexity: {perplexity:.2f}")

        return self

    def get_top_words_per_topic(self, n_words=10):
        """
        Extract the highest-weighted vocabulary terms for each fitted topic

        Parameters:
        -----------
        n_words : int, default=10
            Number of top words to extract per topic. 0 yields empty lists;
            a value larger than the vocabulary returns every term.

        Returns:
        --------
        dict : Maps "Topic 1", "Topic 2", ... to a dict holding two parallel
            lists ordered by descending weight:
            'words'   - terms looked up in self.feature_names
            'weights' - the matching entries of lda_model.components_

        Raises:
        -------
        ValueError : If the model has not been fitted, or n_words is negative
        """
        if self.lda_model is None:
            raise ValueError("Model not fitted. Call fit_lda() first.")
        if n_words < 0:
            raise ValueError(f"n_words must be non-negative. Got: {n_words}")

        topics_dict = {}

        for topic_idx, topic in enumerate(self.lda_model.components_):
            # Reverse the ascending argsort first, then truncate. Slicing with
            # `[-n_words:]` would turn into `[0:]` for n_words=0 and hand back
            # the entire vocabulary instead of nothing.
            top_indices = topic.argsort()[::-1][:n_words]

            topics_dict[f"Topic {topic_idx + 1}"] = {
                'words': [self.feature_names[i] for i in top_indices],
                'weights': [topic[i] for i in top_indices]
            }

        return topics_dict

    def get_document_topic_distribution(self, articles):
        """
        Project articles onto the fitted topics

        Parameters:
        -----------
        articles : list
            List of preprocessed article texts

        Returns:
        --------
        The result of lda_model.transform() applied to the vectorized
        articles (one row per article, one column per topic)

        Raises:
        -------
        ValueError : If either the LDA model or the vectorizer is missing
        """
        fitted = self.lda_model is not None and self.vectorizer is not None
        if not fitted:
            raise ValueError("Model not fitted. Call fit_lda() first.")

        # Reuse the vocabulary learned during fit_lda(); no refitting happens here
        vectorized = self.vectorizer.transform(articles)
        return self.lda_model.transform(vectorized)

    def display_results(self, articles, n_words=10):
        """
        Print a two-part report: top words per topic, then per-article topic mix

        Parameters:
        -----------
        articles : list
            List of preprocessed article texts
        n_words : int, default=10
            Number of top words to display per topic
        """
        banner = "=" * 80

        # --- Part 1: every topic with its top words and their weights ---
        print("\n" + banner)
        print("üì∞ LDA TOPIC EXTRACTION RESULTS")
        print(banner)

        for label, info in self.get_top_words_per_topic(n_words).items():
            print(f"\n{label}:")
            print("-" * 40)
            for term, score in zip(info['words'], info['weights']):
                print(f"  {term:20s} {score:8.4f}")

        # --- Part 2: topic shares of every article plus its strongest topic ---
        print("\n" + banner)
        print("üìÑ ARTICLE-TOPIC DISTRIBUTIONS")
        print(banner)

        distributions = self.get_document_topic_distribution(articles)

        for article_no, row in enumerate(distributions):
            print(f"\nArticle {article_no}:")
            # Topics are numbered from 1 in the printed report
            for topic_no, share in enumerate(row, start=1):
                print(f"  Topic {topic_no}: {share:6.2%}")
            print(f"  ‚Üí Dominant Topic: Topic {np.argmax(row) + 1}")

        print("\n" + banner)


# =============================================================================
# USAGE EXAMPLE
# =============================================================================

def analyze_stock_articles(articles_for_nlp, n_topics=3, n_articles=5):
    """
    Main function to analyze stock articles using LDA

    Runs the whole pipeline: preprocess the articles, fit the LDA model on
    them, and print the topic report.

    Parameters:
    -----------
    articles_for_nlp : dict
        Dictionary containing articles with structure articles_for_nlp[i]['full_text']
    n_topics : int, default=3
        Number of topics to extract
    n_articles : int, default=5
        How many articles to request from extract_articles(), which reads
        indices 0 .. n_articles-1. Previously fixed at 5, which silently
        ignored any further articles.

    Returns:
    --------
    StockArticleLDA or None : Fitted LDA analyzer object, or None when fewer
        than 2 articles could be preprocessed (an error line is printed)
    """

    # Initialize analyzer
    analyzer = StockArticleLDA(n_topics=n_topics)

    # Extract and preprocess articles
    print("üîÑ Extracting and preprocessing articles...")
    print("-" * 80)
    preprocessed_articles = analyzer.extract_articles(articles_for_nlp, n_articles=n_articles)

    # fit_lda() raises on fewer than 2 documents; report and bail out instead
    if len(preprocessed_articles) < 2:
        print("‚ùå Error: Need at least 2 valid articles for LDA analysis")
        return None

    # Fit LDA model
    analyzer.fit_lda(preprocessed_articles, use_tfidf=True)

    # Display results
    analyzer.display_results(preprocessed_articles, n_words=10)

    return analyzer


# =============================================================================
# QUICK START - Uncomment to run
# =============================================================================

# Example usage (uncomment when ready to run):
# analyzer = analyze_stock_articles(articles_for_nlp, n_topics=3)

# To get topic keywords programmatically:
# topics = analyzer.get_top_words_per_topic(n_words=10)

# To experiment with different number of topics:
# for k in range(2, 5):
#     print(f"\n{'='*80}\nTesting with {k} topics\n{'='*80}")
#     analyzer = analyze_stock_articles(articles_for_nlp, n_topics=k)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Run the full pipeline (preprocess -> fit LDA -> print report) with 3 topics.
# NOTE: analyze_stock_articles() returns None when fewer than 2 articles could
# be preprocessed, so check the result before calling methods on it.
# Assumes `articles_for_nlp` was built by an earlier cell - TODO confirm.
analyzer = analyze_stock_articles(articles_for_nlp, n_topics=3)

# To get topic keywords programmatically:
# topics = analyzer.get_top_words_per_topic(n_words=10)

# To experiment with different number of topics:
# for k in range(2, 5):
#     print(f"\n{'='*80}\nTesting with {k} topics\n{'='*80}")
#     analyzer = analyze_stock_articles(articles_for_nlp, n_topics=k)

üîÑ Extracting and preprocessing articles...
--------------------------------------------------------------------------------
‚úì Article 0 preprocessed (1212 characters)
‚úì Article 1 preprocessed (3292 characters)
‚úì Article 2 preprocessed (3614 characters)
‚úì Article 3 preprocessed (1079 characters)
‚úì Article 4 preprocessed (2639 characters)

üìä Vectorizing 5 articles...
‚úì Document-term matrix shape: (5, 1000)
‚úì Vocabulary size: 1000

üîç Fitting LDA with 3 topics...
‚úì LDA model fitted successfully
‚úì Log-likelihood: -509.73
‚úì Perplexity: 6776.12

üì∞ LDA TOPIC EXTRACTION RESULTS

Topic 1:
----------------------------------------
  weth                   0.7816
  usdt                   0.4824
  doge                   0.4824
  weeth                  0.4824
  wbtc                   0.4824
  usdc                   0.4824
  usd                    0.4824
  sol                    0.4824
  price                  0.4170
  dot pyusd              0.4077

Topic 2:
-----------