## 1. Word Count

In [None]:
import re
from collections import Counter

# 1. Stopword vocabulary, assembled from themed groups so each group is easy to audit.
_PRONOUNS = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
}
_DEMONSTRATIVES = {
    "what", "which", "who", "whom", "this", "that", "these", "those",
}
_AUXILIARIES = {
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "can", "will", "should",
}
_CONNECTIVES = {
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
}
_PREPOSITIONS = {
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
}
_ADVERBS_AND_QUANTIFIERS = {
    "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very", "just", "now",
}
# Leftovers of contractions once the tokenizer splits on apostrophes ("don't" -> "don", "t")
_CONTRACTION_FRAGMENTS = {"s", "t", "don"}
# Common scraping noise
_SCRAPING_NOISE = {
    "said", "says", "mr", "mrs", "inc", "ltd", "corp", "image", "video", "market",
}

STOPWORDS = (
    _PRONOUNS
    | _DEMONSTRATIVES
    | _AUXILIARIES
    | _CONNECTIVES
    | _PREPOSITIONS
    | _ADVERBS_AND_QUANTIFIERS
    | _CONTRACTION_FRAGMENTS
    | _SCRAPING_NOISE
)

# Tokens shorter than this many characters are ignored by the word count
MIN_WORD_LEN = 3

def normalize_token(token: str) -> str:
    """Return *token* with a trailing plural ``s`` removed, when it is safe to do so.

    This is a deliberately crude singularizer used to merge counts such as
    "stock"/"stocks". The token is returned unchanged when:

    * it has four characters or fewer (avoids "this" -> "thi", "gas" -> "ga");
    * it does not end in "s";
    * it ends in "ss", "us" or "is" - those endings are almost never plurals
      ("business", "status", "analysis") and stripping them yields non-words.
    """
    if len(token) <= 4 or not token.endswith("s"):
        return token
    if token.endswith(("ss", "us", "is")):
        return token
    return token[:-1]

# 2. Group articles by their normalized (trimmed, lower-cased) keyword
items = _input.all()
groups = {}

for item in items:
    art = item.get("json") or {}

    raw_keyword = art.get("keyword")
    # str(None) would silently create a bogus "none" group, so articles
    # without a usable keyword are skipped instead of being stringified.
    if raw_keyword is None:
        continue

    keyword = str(raw_keyword).strip().lower()
    if not keyword:
        continue

    groups.setdefault(keyword, []).append(art)

# 3. Analyze
output_items = []

for keyword, articles in groups.items():
    counter = Counter()

    for art in articles:
        # Combine title and content
        title = art.get("title") or ""
        content = art.get("content") or ""
        text = (title + " " + content).lower()

        if not text.strip(): continue

        # Tokenize using regex
        tokens = re.findall(r"[a-z]+", text)

        # Filter and normalize
        clean_tokens = []
        for t in tokens:
            if len(t) >= MIN_WORD_LEN and t not in STOPWORDS:
                clean_tokens.append(normalize_token(t))

        counter.update(clean_tokens)

    # Output as list of lists: [["word", count], ...]
    word_count = counter.most_common(20)

    output_items.append({
        "json": {
            "keyword": keyword,
            "article_count": len(articles),
            "word_count": word_count
        }
    })

return output_items

## 2. Scores Ranking

In [None]:
items = _input.all()
output_items = []

for item in items:
    group = item.get("json", {})
    keyword = group.get("keyword")
    articles = group.get("articles", [])

    # 1. First sort all articles by final_score
    ranked = sorted(articles, key=lambda a: a.get("final_score_100", 0), reverse=True)

    # 2. Filter: Only keep articles with final_score_100 >= 50
    ranked = [a for a in ranked if a.get("final_score_100", 0) >= 50]

    # 3. Then take the top 5 after filtering
    cleaned_articles = []
    for art in ranked[:5]:
        cleaned_articles.append({
            "title": art.get("title"),
            "url": art.get("url"),
            "content": art.get("content", ""),
            "final_score": art.get("final_score_100"),
            "sentiment_score": art.get("sentiment_polarization_score")
        })

    output_items.append({
        "json": {
            "keyword": keyword,
            "articles": cleaned_articles
        }
    })

return output_items

## 3. Entities

In [None]:
# Step 0: Flatten articles from all blocks

flat_items = []

for item in _input.all():
    block = item.get("json", item)
    articles = block.get("articles", [])
    for art in articles:
        flat_items.append(art)

# Step 1: Clean & Group by keyword
groups = {}

for data in flat_items:
    keyword = data.get("keyword")
    entities = data.get("mentioned_entities", [])
    score = data.get("final_score_100", 0)

    if not keyword or not entities or score < 40:
        continue

    clean_node = {
        "title": data.get("title"),
        "url": data.get("url"),
        "entities": entities,
        "score": score,
        "sentiment": data.get("sentiment_score", 0)
    }

    k_key = str(keyword).strip().lower()
    groups.setdefault(k_key, []).append(clean_node)

# Step 2: Output formatting
output_items = []

for k, nodes in groups.items():
    output_items.append({
        "json": {
            "keyword": k,
            "graph_data": nodes,
            "node_count": len(nodes)
        }
    })

return output_items


In [None]:
# --------------------------------------------------
# Graph Entity Cleaner & Normalizer
# --------------------------------------------------

# 1. Configuration: Entity Normalization Map
# Each canonical term is listed once together with the lowercase variants that
# should be merged into it; ENTITY_MAP is the flattened variant -> canonical lookup.
_CANONICAL_VARIANTS = {
    "ETF": ("etfs",),
    "Cryptocurrency": ("crypto", "cryptocurrencies", "crypto market"),
    "Fed": ("federal reserve",),
    "US Government": ("us military",),
    "SEC": ("sec", "securities and exchange commission"),
    "Artificial Intelligence": ("ai",),
    "Equities": ("stock market",),
    "Derivatives": ("options market",),
    "BlackRock": ("blackrock",),  # Ensure casing
}

ENTITY_MAP = {
    variant: canonical
    for canonical, variants in _CANONICAL_VARIANTS.items()
    for variant in variants
}

# 2. Configuration: Generic Stopwords to Remove
# These words are too broad to form meaningful clusters
STOPWORDS = {
    "analysts",
    "company",
    "demand",
    "growth",
    "industry",
    "investors",
    "market",
    "news",
    "price",
    "report",
    "sector",
    "supply",
    "today",
}

items = _input.all()
output_items = []

for item in items:
    group = item.get("json", {})
    keyword = str(group.get("keyword", "")).lower().strip()

    # Add the main keyword itself to stopwords for this group
    # (e.g., remove "Bitcoin" node from "Bitcoin" keyword graph)
    current_stopwords = STOPWORDS.copy()
    current_stopwords.add(keyword)

    cleaned_articles = []

    for article in group.get("graph_data", []):
        original_entities = article.get("entities", [])
        clean_entities = set()

        for ent in original_entities:
            # Normalize: Lowercase for comparison
            ent_lower = str(ent).lower().strip()

            # Check 1: Is it the keyword itself?
            if ent_lower == keyword:
                continue

            # Check 2: Is it a generic stopword?
            if ent_lower in current_stopwords:
                continue

            # Check 3: Map to canonical name (e.g., "ETFs" -> "ETF")
            # Uses the map, otherwise Title Cases the original
            canonical = ENTITY_MAP.get(ent_lower)
            if not canonical:
                # Default formatting: Title Case (e.g., "ai stocks" -> "Ai Stocks")
                # Better: Use the original casing if available, or .title()
                canonical = ent

            clean_entities.add(canonical)

        # Update the article with cleaned entities
        # Convert set back to list
        article["entities"] = list(clean_entities)

        # Only keep articles that still have entities left
        if article["entities"]:
            cleaned_articles.append(article)

    # Update group
    group["graph_data"] = cleaned_articles
    group["node_count"] = len(cleaned_articles)

    output_items.append({"json": group})

return output_items

## 4. Alpha Quadrant

In [None]:
items = _input.all()
cleaned_rows = []

for item in items:
    # 1. Get the group data
    group = item.get("json", {})

    # 2. Get the list of articles from this group
    articles = group.get("articles", [])

    # 3. Iterate through articles to extract specific fields
    for art in articles:
        # Validation: Skip if essential plotting axes are missing
        if art.get("source_credibility") is None or art.get("materiality_score") is None:
            continue

        # 4. Create the clean object (Selecting only needed columns)
        # Flattening the structure: One dict per article
        row = {
            "keyword": art.get("keyword") or group.get("keyword"),
            "title": art.get("title"),
            "url": art.get("url"),
            "source_credibility": float(art.get("source_credibility")),
            "materiality_score": float(art.get("materiality_score")),
            "pickup_count": int(art.get("pickup_count") or 0),
            "sentiment_score": float(art.get("sentiment_score") or 0),
            "final_score": float(art.get("final_score_100") or 0)
        }

        cleaned_rows.append({"json": row})

return cleaned_rows

## 5. Sentiment Statistics

In [None]:
items = _input.all()
output_items = []

import math

for it in items:
    group = it.get("json", {})
    keyword = group.get("keyword")
    articles = group.get("articles", [])

    sentiments = []
    weights = []

    for art in articles:
        s = art.get("sentiment_score")
        if s is None:
            continue
        sentiments.append(float(s))
        weights.append(float(art.get("final_score", 0)) or 0.0)

    n = len(sentiments)

    if n == 0:
        summary = {
            "keyword": keyword,
            "n_articles": 0
        }
    else:
        sentiments_sorted = sorted(sentiments)
        mean_senti = sum(sentiments) / n
        median_senti = sentiments_sorted[n // 2] if n % 2 == 1 else \
            0.5 * (sentiments_sorted[n // 2 - 1] + sentiments_sorted[n // 2])

        var = sum((x - mean_senti) ** 2 for x in sentiments) / n
        std_senti = math.sqrt(var)

        total_weight = sum(weights)
        if total_weight > 0:
            weighted_mean_senti = sum(w * s for w, s in zip(weights, sentiments)) / total_weight
        else:
            weighted_mean_senti = mean_senti

        def ratio(cond):
            cnt = sum(1 for x in sentiments if cond(x))
            return cnt / n

        pct_strong_neg = ratio(lambda x: x < -0.5)
        pct_weak_neg   = ratio(lambda x: -0.5 <= x < -0.2)
        pct_neutral    = ratio(lambda x: -0.2 <= x <= 0.2)
        pct_weak_pos   = ratio(lambda x: 0.2 < x <= 0.5)
        pct_strong_pos = ratio(lambda x: x > 0.5)

        summary = {
          "keyword": keyword,
          "n_articles": n,
          "mean_senti": round(mean_senti, 4),
          "median_senti": round(median_senti, 4),
          "std_senti": round(std_senti, 4),
          "weighted_mean_senti": round(weighted_mean_senti, 4),

          "pct_strong_neg": f"{pct_strong_neg * 100:.2f}%",
          "pct_weak_neg": f"{pct_weak_neg * 100:.2f}%",
          "pct_neutral": f"{pct_neutral * 100:.2f}%",
          "pct_weak_pos": f"{pct_weak_pos * 100:.2f}%",
          "pct_strong_pos": f"{pct_strong_pos * 100:.2f}%"
        }

    output_items.append({"json": summary})

return output_items

In [None]:
import statistics

results = []

for item in items:
    data = item.get("json", item)
    keyword = data.get("keyword")
    articles = data.get("articles", [])

    if not articles:
        results.append({
            "json": {
                "keyword": keyword,
                "top_positive_article": None,
                "top_negative_article": None,
                "range_senti": None,
                "range_category": None,
                "range_summary": None
            }
        })
        continue

    # Collect valid float scores
    sentiments = []
    for a in articles:
        val = a.get("sentiment_score")
        if val is None:
            val = a.get("sentiment_score")
        
        if val is not None:
            sentiments.append(float(val))
    
    # Calculate Median for this group (Default to 0.0 if empty)
    if not sentiments:
        sentiments = [0.0]
        senti_median = 0.0
    else:
        senti_median = statistics.median(sentiments)

    # 1. Range calculation
    range_senti = max(sentiments) - min(sentiments)
    range_senti = round(range_senti, 4)

    if range_senti <= 0.3:
        range_category = "coherent"
        range_summary = "Sentiment is highly aligned across articles."
    elif range_senti <= 0.8:
        range_category = "divergent"
        range_summary = "Sentiment shows moderate divergence."
    else:
        range_category = "polarized"
        range_summary = "Sentiment is sharply polarized."

    # Helper: Get score safely, using Median for None values
    def get_senti_score(x):
        val = x.get("sentiment_score")
        if val is None:
            val = x.get("sentiment_score")
        
        # Use median if still None
        if val is None:
            return senti_median
        return float(val)

    # 2. Top Positive (Using Median fallback)
    pos = max(articles, key=get_senti_score)
    
    top_positive_clean = {
        "title": pos.get("title"),
        "url": pos.get("url"),
        "content": pos.get("content"),
        "sentiment_score": pos.get("sentiment_score"),
        "final_score": pos.get("final_score") if pos.get("final_score") is not None else pos.get("final_score_100"),
    }

    # 3. Top Negative (Using Median fallback)
    neg = min(articles, key=get_senti_score)
    
    top_negative_clean = {
        "title": neg.get("title"),
        "url": neg.get("url"),
        "content": neg.get("content") or neg.get("snippet"),
        "sentiment_score": neg.get("sentiment_score"),
        "final_score": neg.get("final_score") if neg.get("final_score") is not None else neg.get("final_score_100"),
    }

    results.append({
        "json": {
            "keyword": keyword,
            "top_positive_article": top_positive_clean,
            "top_negative_article": top_negative_clean,
            "range_senti": range_senti,
            "range_category": range_category,
            "range_summary": range_summary
        }
    })

return results