In [None]:
from newsapi import NewsApiClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from datetime import datetime, timedelta
import pandas as pd

class FinBERTSentimentAnalyzer:
    """Fetch news headlines and group them into topics with TF-IDF + K-means.

    Despite the name, nothing in this class runs a sentiment model: it pulls
    article titles from News API, vectorizes them with a keyword-weighted
    TF-IDF tokenizer and clusters the result.
    """

    # Keywords are stored lowercased because tokens are lowercased before
    # comparison; frozensets give O(1) membership and are built only once.
    _FINANCIAL_KEYWORDS = frozenset({
        "stock", "shares", "record", "revenue", "ceo",
        "stake", "sell", "buy", "market", "growth",
    })
    _DEAL_KEYWORDS = frozenset({
        "deal", "discount", "sale", "price", "clearance", "promotion",
    })
    # Multi-word keywords cannot match a single whitespace-delimited token,
    # so they are matched as consecutive token sequences instead.
    _DEAL_PHRASES = (("prime", "day"),)

    # How many times a boosted token is repeated in the tokenizer output.
    _FINANCIAL_WEIGHT = 3
    _DEAL_WEIGHT = 2

    # Sentence punctuation stripped from both ends of each token so that
    # e.g. "stock:" is recognised as "stock". Symbols such as "$" and "%"
    # are deliberately left alone.
    _STRIP_CHARS = ".,:;!?\"'()[]{}\u2018\u2019\u201c\u201d"

    def __init__(self, api_key):
        """Create the News API client used by :meth:`fetch_news`.

        Args:
            api_key: News API key passed straight to ``NewsApiClient``.
        """
        self.newsapi = NewsApiClient(api_key=api_key)

    def fetch_news(self, phrase='Apple stock', days_back=30):
        """Return the titles of articles matching ``phrase``.

        Args:
            phrase: Query string sent to News API.
            days_back: How many days before now the search window starts.

        Returns:
            A ``pandas.DataFrame`` with a single ``"Title"`` column holding
            up to 100 English-language titles, sorted by relevancy.
        """
        date_from = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
        articles = self.newsapi.get_everything(
            q=phrase,
            from_param=date_from,
            language="en",
            sort_by="relevancy",
            page_size=100
        )
        # Skip entries whose title is missing/None/empty: the tokenizer calls
        # ``.lower()`` on every title and would crash on a non-string.
        titles = [
            article['title']
            for article in articles['articles']
            if article.get('title')
        ]
        return pd.DataFrame(titles, columns=["Title"])

    def custom_tokenizer(self, text):
        """Split ``text`` into lowercase tokens, repeating boosted keywords.

        Financial keywords are emitted three times and deal keywords (and the
        words of deal phrases such as "prime day") twice, which raises their
        term frequency in the TF-IDF matrix.

        Args:
            text: A single headline.

        Returns:
            A list of lowercase tokens with surrounding sentence punctuation
            removed; tokens that consist only of such punctuation are dropped.
        """
        stripped = (raw.strip(self._STRIP_CHARS) for raw in text.lower().split())
        tokens = [token for token in stripped if token]

        # Per-token repetition count from the single-word keyword sets.
        weights = []
        for token in tokens:
            if token in self._FINANCIAL_KEYWORDS:
                weights.append(self._FINANCIAL_WEIGHT)
            elif token in self._DEAL_KEYWORDS:
                weights.append(self._DEAL_WEIGHT)
            else:
                weights.append(1)

        # Boost every token that is part of a multi-word deal phrase; ``max``
        # keeps a stronger financial boost if the token already has one.
        for phrase in self._DEAL_PHRASES:
            span = len(phrase)
            for start in range(len(tokens) - span + 1):
                if tuple(tokens[start:start + span]) == phrase:
                    for idx in range(start, start + span):
                        weights[idx] = max(weights[idx], self._DEAL_WEIGHT)

        custom_tokens = []
        for token, weight in zip(tokens, weights):
            custom_tokens.extend([token] * weight)
        return custom_tokens

    def perform_clustering(self, titles_df, num_clusters=5):
        """Cluster the titles with K-means on keyword-weighted TF-IDF vectors.

        Note: ``titles_df`` is modified in place (a ``"Topic"`` column is
        added) and the same object is returned.

        Args:
            titles_df: DataFrame with a ``"Title"`` column of strings.
            num_clusters: Number of K-means clusters.

        Returns:
            A tuple ``(titles_df, sil_score)``. ``sil_score`` is NaN when the
            silhouette score is undefined for the resulting labelling (fewer
            than 2 distinct labels, or as many labels as samples).

        Raises:
            ValueError: If there are no titles, or fewer titles than
                ``num_clusters``.
        """
        n_samples = len(titles_df)
        if n_samples == 0:
            raise ValueError("No titles to cluster.")
        if n_samples < num_clusters:
            raise ValueError(
                f"Cannot form {num_clusters} clusters from {n_samples} titles."
            )

        # token_pattern=None: the default regex is unused once a custom
        # tokenizer is supplied, and leaving it set makes scikit-learn warn.
        vectorizer = TfidfVectorizer(tokenizer=self.custom_tokenizer, token_pattern=None)
        X = vectorizer.fit_transform(titles_df["Title"])

        # Fixed random_state keeps cluster assignments reproducible.
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        labels = kmeans.fit_predict(X)

        # silhouette_score only accepts 2 .. n_samples - 1 distinct labels;
        # report NaN instead of crashing after the clustering already ran.
        n_labels = len(set(labels.tolist()))
        if 2 <= n_labels <= n_samples - 1:
            sil_score = silhouette_score(X, labels)
        else:
            sil_score = float("nan")
        print("Silhouette Score for K-means clustering:", sil_score)

        titles_df['Topic'] = labels
        return titles_df, sil_score

    def display_topics(self, titles_df, num_clusters):
        """Print every title grouped under its topic number.

        Args:
            titles_df: DataFrame with ``"Title"`` and ``"Topic"`` columns, as
                returned by :meth:`perform_clustering`.
            num_clusters: Topics ``0 .. num_clusters - 1`` are printed; a
                topic with no titles prints only its header.
        """
        for topic in range(num_clusters):
            print(f"\nTopic {topic}:")
            topic_titles = titles_df[titles_df['Topic'] == topic]['Title'].tolist()
            for title in topic_titles:
                print(f" - {title}")


# Driver: fetch recent headlines, cluster them and print each topic.
# "API_KEY" is a placeholder - substitute a real News API key before running
# (and avoid committing the real key with the notebook).
api_key = "API_KEY"
analyzer = FinBERTSentimentAnalyzer(api_key)

news_df = analyzer.fetch_news(phrase='Apple stock', days_back=30)

num_clusters = 5
# The score is bound to `sil_score`, not `silhouette_score`: reusing that name
# would overwrite the function imported from sklearn.metrics at module scope,
# and any later perform_clustering() call in the same session would fail with
# "'float' object is not callable".
clustered_df, sil_score = analyzer.perform_clustering(news_df, num_clusters=num_clusters)

analyzer.display_topics(clustered_df, num_clusters)


Silhouette Score for K-means clustering: 0.025271760806722412

Topic 0:
 - Amazon is clearing its stock: Seize the best-selling AirPods Pro 2 at a massive discount before they vanish
 - Apple iPhone 16 Pro Max review: A towering success
 - iOS and Android Security Scare: Two Apps Found Supporting 'Pig Butchering' Scheme
 - How the iPhone 16 Camera Control Button Changes How You Take iPhone Photos
 - Apple Intelligence will drive iPhone 16 sales but isn't a major factor yet
 - Apple severely cuts Vision Pro production, and may stop it completely very soon
 - M4 MacBook Pro production appears well underway, as launch expected soon
 - Month-end iPad sale: $298 10th Gen, M4 iPad Pro $899, iPad mini 7 deals
 - Apple projector could display AR and VR content without a headset
 - Nearly every teenager still has an iPhone, and most have AirPods too
 - Apples Have Never Tasted So Delicious. Here’s Why
 - Designers reimagine iPod Touch in iPhone 16 design language

Topic 1:
 - Best Prime Day Mon

