
# The Futures Edit — Signals Pipeline (GLOBAL · PLUS)
- Clear **status** tiers: *emerging / bubbling / mainstream* (percentile-based)
- **More keywords**: big multilingual seed lists + **YAKE** auto-extraction


In [None]:
# ⬇️ Install dependencies (run once per Colab session)
# The leading `!` is notebook shell syntax, so this cell is not valid plain
# Python and only runs inside an IPython/Colab kernel.
!pip install -q feedparser beautifulsoup4 pandas numpy nltk pytrends praw tqdm python-dateutil pyyaml yake langdetect
# NOTE(review): this confirmation is printed unconditionally; nothing here
# inspects whether the pip command above actually succeeded.
print("✅ Dependencies installed")

In [None]:
# (Optional) Mount Google Drive to save outputs
# Best-effort: every step lives in one try block, so a missing google.colab
# module, a failed mount, or a failed makedirs all end in the same message.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    import os
    # SAVE_DIR is only assigned after drive.mount() has returned; the final
    # cell of the notebook checks `'SAVE_DIR' in globals()` before copying.
    SAVE_DIR = "/content/drive/MyDrive/futures_edit_outputs"
    os.makedirs(SAVE_DIR, exist_ok=True)
    print("Drive mounted. Outputs will also be saved to:", SAVE_DIR)
except Exception as e:
    # The exception object `e` is not used; the cause of the failure is not shown.
    print("Drive not mounted (you can ignore this).")

In [None]:
# 🗂 Create sources_config.yaml (GLOBAL + bigger multilingual seeds)
# The YAML has two top-level sections, each keyed by category
# (fashion / beauty / wellness):
#   rss      -> list of feed URLs fetched by the ingestion cell
#   keywords -> seed phrases matched against item titles and summaries
# Query strings in the Google News URLs are percent-encoded (e.g. %C3%A9 = "é").
yaml_content = """rss:
  fashion:
    - https://news.google.com/rss/search?q=fashion%20innovation%20OR%20runway%20OR%20couture&hl=en&gl=US&ceid=US:en
    - https://news.google.com/rss/search?q=moda%20tendencias%20OR%20pasarela%20OR%20alta%20costura&hl=es&gl=ES&ceid=ES:es
    - https://news.google.com/rss/search?q=mode%20tendances%20OR%20haute%20couture%20OR%20d%C3%A9fil%C3%A9&hl=fr&gl=FR&ceid=FR:fr
    - https://www.hypebeast.com/feed
    - https://www.highsnobiety.com/feed
    - https://www.dazeddigital.com/rss
    - https://i-d.vice.com/en_uk/rss
    - https://theimpression.com/feed/
  beauty:
    - https://news.google.com/rss/search?q=beauty%20skincare%20innovation%20OR%20makeup%20trend&hl=en&gl=US&ceid=US:en
    - https://news.google.com/rss/search?q=belleza%20cuidado%20de%20la%20piel%20tendencia%20maquillaje&hl=es&gl=ES&ceid=ES:es
    - https://news.google.com/rss/search?q=beaut%C3%A9%20soin%20de%20la%20peau%20tendance%20maquillage&hl=fr&gl=FR&ceid=FR:fr
    - https://www.beautymatter.com/rss.xml
    - https://www.glossy.co/feed/
    - https://www.allure.com/feed/all/rss
    - https://www.refinery29.com/en-us/rss.xml
  wellness:
    - https://news.google.com/rss/search?q=wellness%20trend%20OR%20longevity%20OR%20recovery%20studio&hl=en&gl=US&ceid=US:en
    - https://news.google.com/rss/search?q=bienestar%20tendencias%20salud%20longevidad&hl=es&gl=ES&ceid=ES:es
    - https://news.google.com/rss/search?q=bien-%C3%AAtre%20tendances%20sant%C3%A9%20long%C3%A9vit%C3%A9&hl=fr&gl=FR&ceid=FR:fr
    - https://www.wellandgood.com/feed/
    - https://www.mindbodygreen.com/feeds/latest
    - https://examine.com/feed/

keywords:
  fashion:
    - quiet luxury
    - stealth wealth
    - lujo silencioso
    - luxe discret
    - balletcore
    - coquette
    - blokette
    - mob wife aesthetic
    - archival revival
    - gorpcore
    - techwear
    - digital couture
    - haute couture numérique
    - generative knitwear
    - 3D knit
    - crochet runway
    - upcycled denim
    - deadstock
    - seasonless drops
    - drop model
    - rental fashion
    - circular design
    - luxury resale
    - made-to-order
    - on-demand production
    - mary janes
    - ballet flats
    - kitten heels
    - rosette
    - bows
    - ribbon
  beauty:
    - skin cycling
    - skin flooding
    - skin barrier
    - slugging
    - glass skin
    - jello skin
    - mochi skin
    - latte makeup
    - lip oil
    - lip stain
    - peptide serum
    - copper peptides
    - fermented beauty
    - microbiome skincare
    - mushroom skincare
    - adaptogens
    - SPF stick
    - mineral sunscreen
    - LED mask
    - red light mask
    - microcurrent device
    - AI skin analysis
    - scalp care
    - hair cycling
    - fragrance layering
    - perfume oils
  wellness:
    - longevity
    - GLP-1
    - red light therapy
    - cold plunge
    - sauna club
    - heat therapy
    - breathwork
    - vagus nerve
    - magnesium glycinate
    - creatine for women
    - protein water
    - gut health
    - microbiome
    - prebiotics
    - postbiotics
    - electrolytes
    - sleep hygiene
    - mouth taping
    - HRV tracking
    - zone 2
    - rucking
    - pilates
    - mobility
    - lymphatic drainage
"""
# Written as UTF-8 because the keyword list contains accented characters.
with open("sources_config.yaml", "w", encoding="utf-8") as f:
    f.write(yaml_content)
print("✅ sources_config.yaml created (GLOBAL + extended keywords)")

In [None]:
import os, re, time, math, json, yaml, feedparser, pandas as pd, numpy as np
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from urllib.parse import urlparse
from dateutil import parser as dateparser
from tqdm import tqdm

# Optional Trends/Reddit
try:
    from pytrends.request import TrendReq
except Exception:
    TrendReq = None

try:
    import praw
except Exception:
    praw = None

# Keyword extraction
from langdetect import detect
import yake

# Load the feed/keyword configuration from sources_config.yaml in the
# current working directory.
with open("sources_config.yaml", "r", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)

# Local folder that receives every CSV export written by the later cells.
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)
# One timezone-aware UTC timestamp captured here; recency_score() measures
# each item's age against this value rather than calling now() per item.
NOW = datetime.now(timezone.utc)

# Quick sanity check: list the categories found under the `rss` section.
print("Categories:", list(CFG.get("rss", {}).keys()))

## 1) RSS Ingestion

In [None]:
def clean_html(text: str) -> str:
    """Return the visible text of an HTML fragment.

    A falsy *text* (``None`` or ``""``) is parsed as an empty string. Text
    nodes are joined with a single space and each one is stripped.
    """
    markup = text if text else ""
    parsed = BeautifulSoup(markup, "html.parser")
    return parsed.get_text(" ", strip=True)

def parse_date(entry):
    """Return the first parseable timestamp carried by a feed entry.

    The ``published``, ``updated`` and ``created`` fields are tried in that
    order. Missing or empty fields are skipped, and a field whose value fails
    to parse is ignored in favour of the next one.

    Returns:
        Whatever ``dateparser.parse`` produces for the first usable field,
        or ``None`` when no field could be parsed.
    """
    candidates = (entry.get(field) for field in ("published", "updated", "created"))
    for raw in candidates:
        if not raw:
            continue
        try:
            return dateparser.parse(raw)
        except Exception:
            # Unparseable value: move on to the next candidate field.
            continue
    return None

def fetch_rss_feeds(feed_urls, category):
    """Download a list of RSS/Atom feeds and flatten their entries into a table.

    Args:
        feed_urls: Iterable of feed URLs; each is passed to ``feedparser.parse``.
        category: Label stored in the ``category`` column of every row and
            shown in the progress-bar description.

    Returns:
        A ``pandas.DataFrame`` with one row per feed entry and the columns
        ``title``, ``link``, ``summary``, ``published``, ``source`` and
        ``category``. The columns are present even when no entry was
        collected, so downstream column access keeps working on an empty
        result.

    Any exception raised while processing a feed is printed and the loop
    continues with the next URL; rows collected before the failure are kept.
    """
    columns = ["title", "link", "summary", "published", "source", "category"]
    rows = []
    for url in tqdm(feed_urls, desc=f"Fetching {category} feeds"):
        try:
            d = feedparser.parse(url)
            for e in d.entries:
                # `or ""` also covers a title that is present but None, which
                # would otherwise raise and abort the rest of this feed.
                title = (e.get("title") or "").strip()
                link = e.get("link")
                summary = clean_html(e.get("summary", ""))
                dt = parse_date(e)
                rows.append({
                    "title": title,
                    "link": link,
                    "summary": summary,
                    "published": dt.isoformat() if dt else None,
                    # The feed's host, not the article's host.
                    "source": urlparse(url).netloc,
                    "category": category,
                })
        except Exception as ex:
            print(f"Error fetching {url}: {ex}")
    return pd.DataFrame(rows, columns=columns)

# Fetch each category in a fixed order, skipping any category that has no
# feeds configured, then stack the per-category tables into one frame.
frames = []
for cat in ("fashion", "beauty", "wellness"):
    feeds = CFG.get("rss", {}).get(cat, [])
    if not feeds:
        continue
    frames.append(fetch_rss_feeds(feeds, cat))

if frames:
    rss_df = pd.concat(frames, ignore_index=True)
else:
    rss_df = pd.DataFrame()

# Keep the raw pull on disk before any scoring happens.
raw_csv_path = os.path.join(OUT_DIR, "rss_raw.csv")
rss_df.to_csv(raw_csv_path, index=False)
print(f"Fetched {len(rss_df)} items")
rss_df.head(8)

## 2) Scoring + Automatic Keyword Extraction (YAKE)

In [None]:
# Seed keyword lists keyed by category, taken from the `keywords` section of
# the config; an empty dict when that section is absent.
SEEDS = CFG.get("keywords", {})

def recency_score(published_iso):
    """Score how recent an item is on a 0.0-1.0 scale.

    The score falls linearly from 1.0 for items less than a day old (or dated
    in the future) to 0.0 for items 30 or more days old, measured against the
    module-level ``NOW``. Timestamps without a timezone are treated as UTC.

    Returns:
        The recency score, or the neutral value 0.5 when *published_iso* is
        empty or cannot be processed.
    """
    neutral = 0.5
    if not published_iso:
        return neutral
    try:
        parsed = dateparser.parse(published_iso)
        if parsed.tzinfo is not None:
            aware = parsed.astimezone(timezone.utc)
        else:
            aware = parsed.replace(tzinfo=timezone.utc)
        age_days = (NOW - aware).days
        if age_days < 0:
            # Future-dated items count as brand new.
            age_days = 0
        return max(0.0, 1.0 - (age_days / 30.0))
    except Exception:
        return neutral

def keyword_hit_score(text, seeds):
    """Count which seed keywords occur in *text*, case-insensitively.

    A seed only matches where it starts at a word boundary, so ``"bows"`` no
    longer fires on ``"elbows"`` and ``"mobility"`` no longer fires on
    ``"immobility"``. The end of the match is left open on purpose so simple
    inflections still count (``"lip oil"`` matches ``"lip oils"``).

    Args:
        text: Text to search; ``None`` or ``""`` yields no hits.
        seeds: Iterable of keyword phrases. Blank seeds are ignored.

    Returns:
        ``(score, hits)`` where ``hits`` lists the matching seeds in their
        original spelling and order, and ``score`` is ``len(hits)``.
    """
    text_l = (text or "").lower()
    hits = []
    for kw in seeds:
        needle = str(kw).strip().lower()
        if not needle:
            # An empty needle would match every text.
            continue
        # (?<!\w) = not preceded by a word character; \w is Unicode-aware for
        # str patterns, so accented letters count as part of a word.
        if re.search(r"(?<!\w)" + re.escape(needle), text_l):
            hits.append(kw)
    return len(hits), hits

# Multilingual YAKE extraction
def extract_auto_keyword(title, summary, topk=1):
    """Pick one automatically extracted keyphrase for an item.

    Title and summary are joined into one text. Its language is detected and
    used for YAKE when it is English, Spanish or French; any other result, or
    a detection failure, falls back to English. YAKE is asked for *topk*
    phrases of up to three words and the lowest-scoring one is returned.

    Returns:
        The chosen phrase, stripped and lower-cased, or ``""`` when there is
        no text, no phrase was produced, or extraction raised.
    """
    raw = " ".join(str(part or "") for part in (title, summary)).strip()
    if not raw:
        return ""

    # Detect language; fallback to 'en'
    try:
        detected = detect(raw)
    except Exception:
        detected = "en"
    lang = detected if detected in ("en", "es", "fr") else "en"

    try:
        extractor = yake.KeywordExtractor(lan=lang, n=3, top=topk)
        candidates = extractor.extract_keywords(raw)
        if not candidates:
            return ""
        # YAKE returns (keyphrase, score) where lower score = more relevant
        winner = min(candidates, key=lambda pair: pair[1])
        return winner[0].strip().lower()
    except Exception:
        return ""

# Score every ingested item.
#   trend_score = 0.7 * (1.5 * title hits + summary hits) + 0.3 * (2 * recency)
# The primary keyword is the first seed hit (title hits before summary hits);
# when no seed matched, the YAKE auto keyword is used instead.
SCORED_COLUMNS = [
    "title", "link", "summary", "published", "source", "category",
    "keyword_hits", "auto_keyword", "primary_keyword", "recency", "trend_score",
]

rows = []
for _, row in rss_df.iterrows():
    # `or []` covers a category that is listed in the YAML with no entries.
    seeds = SEEDS.get(row["category"]) or []
    kscore_title, hits_t = keyword_hit_score(row["title"], seeds)
    kscore_sum, hits_s = keyword_hit_score(row["summary"], seeds)
    rec = recency_score(row["published"])
    auto_kw = extract_auto_keyword(row["title"], row["summary"], topk=1)
    all_hits = hits_t + hits_s
    primary_kw = all_hits[0] if all_hits else auto_kw
    score = 0.7*(kscore_title*1.5 + kscore_sum) + 0.3*(rec*2.0)
    rows.append({
        **row.to_dict(),
        "keyword_hits": "; ".join(sorted(set(all_hits))),
        "auto_keyword": auto_kw,
        "primary_keyword": primary_kw,
        "recency": rec,
        "trend_score": round(score, 3)
    })

if rows:
    scored_df = pd.DataFrame(rows).sort_values("trend_score", ascending=False)
else:
    # Nothing was ingested: sorting a column-less frame by "trend_score" would
    # raise KeyError, so build an empty frame that still has the schema.
    scored_df = pd.DataFrame(columns=SCORED_COLUMNS)
scored_df.to_csv(os.path.join(OUT_DIR, "rss_scored_plus.csv"), index=False)
scored_df.head(12)

## 3) Google Trends Validation (optional)

In [None]:
def trends_score(terms, geo="ES", timeframe="today 3-m"):
    """Compute a Google Trends momentum ratio for each term.

    For every non-empty term a single-term payload is requested and the last
    12 data points of the interest series are used:

        momentum = (last value + 1) / (mean of those points + 1)

    so a value above 1.0 means the latest point is above the recent average.

    Args:
        terms: Iterable of search terms; empty and repeated terms are skipped.
        geo: Region code passed to ``build_payload``.
        timeframe: Timeframe string passed to ``build_payload``.

    Returns:
        Dict mapping term -> momentum rounded to 3 decimals. A term whose
        lookup raised maps to ``None``; a term with an empty response is
        omitted. Returns ``{}`` when pytrends is unavailable or the client
        cannot be created.
    """
    if TrendReq is None:
        print("pytrends not available; skipping Trends.")
        return {}
    try:
        pytrends = TrendReq(hl='en-US', tz=360)
    except Exception as e:
        # This step is optional, so a client that cannot be created must not
        # abort the notebook.
        print(f"Could not start a Trends session; skipping Trends. ({e})")
        return {}
    results = {}
    for term in terms:
        if not term or term in results:
            continue
        try:
            pytrends.build_payload([term], timeframe=timeframe, geo=geo)
            df = pytrends.interest_over_time()
            if not df.empty:
                tail = df[term].tail(12)
                # +1 on both sides keeps the ratio defined when interest is 0.
                score = (tail.iloc[-1] + 1) / (tail.mean() + 1)
                results[term] = round(float(score), 3)
        except Exception as e:
            # Report the failure instead of dropping it silently; the None
            # marker for a failed term is kept.
            print(f"Trends lookup failed for {term!r}: {e}")
            results[term] = None
    return results

# The ten most frequent primary keywords become the terms validated against
# Google Trends and Reddit. Blank keywords (items where neither a seed nor
# YAKE produced anything) are dropped first so they cannot occupy one of the
# ten slots only to be skipped by the validators.
top_terms = (
    scored_df["primary_keyword"]
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda s: s != ""]
    .value_counts()
    .head(10)
    .index
    .tolist()
)

trends = trends_score(top_terms, geo="ES", timeframe="today 3-m")
trends

## 4) Reddit Mentions (optional)

In [None]:
def reddit_client():
    """Build a Reddit API client from environment variables.

    Reads ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET`` and
    ``REDDIT_USER_AGENT`` (default ``futures-edit-scout/0.1``).

    Returns:
        A ``praw.Reddit`` instance, or ``None`` (after printing the reason)
        when praw is not importable or the id/secret are not set.
    """
    if praw is None:
        print("praw not available; skipping Reddit.")
        return None
    import os
    credentials = {
        "client_id": os.getenv("REDDIT_CLIENT_ID", ""),
        "client_secret": os.getenv("REDDIT_CLIENT_SECRET", ""),
        "user_agent": os.getenv("REDDIT_USER_AGENT", "futures-edit-scout/0.1"),
    }
    if not (credentials["client_id"] and credentials["client_secret"]):
        print("Missing Reddit credentials; skipping Reddit mentions.")
        return None
    return praw.Reddit(**credentials)

def count_reddit_mentions(terms, subreddits, limit=80):
    """Count how many recent posts mention each term.

    For every subreddit the newest *limit* posts are read, and a term scores
    one point per post whose title or self-text contains it
    (case-insensitive substring match).

    Args:
        terms: Iterable of terms; falsy entries are dropped.
        subreddits: Iterable of subreddit names.
        limit: Number of newest posts to read per subreddit.

    Returns:
        Dict mapping term -> post count, or ``{}`` when no Reddit client is
        available. An error on one subreddit is printed and the remaining
        subreddits are still processed.
    """
    reddit = reddit_client()
    if reddit is None:
        return {}
    counts = dict.fromkeys((t for t in terms if t), 0)
    for sub in subreddits:
        try:
            for post in reddit.subreddit(sub).new(limit=limit):
                haystack = f"{post.title} {post.selftext or ''}".lower()
                for term in counts:
                    if term and term.lower() in haystack:
                        counts[term] += 1
        except Exception as e:
            print("Reddit error on r/%s: %s" % (sub, e))
    return counts

# Subreddits to scan: the de-duplicated union of every list configured under
# reddit.subreddits, or the built-in defaults below when none are configured.
subs_cfg = CFG.get("reddit",{}).get("subreddits",{})
if subs_cfg:
    default_subs = list(set(sum(subs_cfg.values(), [])))
else:
    default_subs = [
        "SkincareAddiction","AsianBeauty","MakeUpAddiction","fragrance",
        "femalefashionadvice","malefashionadvice","techwearclothing",
        "Supplements","Biohackers","meditation"
    ]
reddit_counts = count_reddit_mentions(top_terms, default_subs, limit=60)
reddit_counts

## 5) Merge Validation + Export (with status tiers)

In [None]:
def as_float(x):
    """Coerce *x* to a float, returning 0.0 when that is not possible.

    ``None``, empty strings, non-numeric strings, unsupported types and
    integers too large for a float all yield 0.0. NaN also yields 0.0 so a
    missing value cannot propagate into the rank and percentile arithmetic.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return 0.0
    return 0.0 if math.isnan(value) else value

# Attach the optional validation signals to every scored item. Keywords that
# were not validated (or whose lookup returned nothing) get "" in both columns.
final = scored_df.copy()
final["trends_momentum"] = final["primary_keyword"].map(trends).fillna("")
final["reddit_mentions"] = final["primary_keyword"].map(reddit_counts).fillna("")

if len(final):
    # Composite rank
    #   signal_rank = trend_score + 0.5 * reddit_norm + 0.3 * momentum_norm
    # reddit_norm is the mention count divided by the largest count seen
    # (`rmax or 1.0` avoids dividing by zero); momentum_norm is the raw
    # Trends ratio, with blanks counted as 0.0.
    rmax = max([as_float(x) for x in final["reddit_mentions"] if str(x)!=""], default=1.0)
    final["reddit_norm"] = final["reddit_mentions"].apply(lambda x: as_float(x) / (rmax or 1.0))
    final["momentum_norm"] = final["trends_momentum"].apply(as_float)
    final["signal_rank"] = (final["trend_score"] + 0.5*final["reddit_norm"] + 0.3*final["momentum_norm"]).round(3)
    final = final.sort_values("signal_rank", ascending=False)

    # Percentile thresholds for status: the bottom third of ranks is
    # "emerging", the middle third "bubbling", the top third "mainstream".
    k = final["signal_rank"].astype(float)
    p33 = np.percentile(k, 33)
    p66 = np.percentile(k, 66)
    def status_for(v):
        if v >= p66: return "mainstream"
        if v >= p33: return "bubbling"
        return "emerging"
    final["status"] = final["signal_rank"].apply(status_for)
else:
    # No items: still create the derived columns, otherwise the column
    # selection below raises KeyError on "signal_rank".
    for col in ("reddit_norm", "momentum_norm", "signal_rank"):
        final[col] = pd.Series(dtype=float)
    p33 = p66 = 0.0
    final["status"] = ""

final_out = final[["title","link","summary","published","source","category","keyword_hits","auto_keyword","primary_keyword","trend_score","trends_momentum","reddit_mentions","signal_rank","status"]]
final_out.to_csv(os.path.join(OUT_DIR, "signals_ranked_plus.csv"), index=False)
print("✅ Exported → outputs/signals_ranked_plus.csv")
print("Status thresholds → emerging < %.3f ≤ bubbling < %.3f ≤ mainstream" % (p33, p66))
final_out.head(15)

## 6) Newsletter Capsules (prefer primary_keyword, fall back to auto_keyword)

In [None]:
def to_capsule(row):
    """Turn one ranked item into a newsletter capsule.

    The capsule is named after the item's primary keyword, then its auto
    keyword; when both are empty the title is used, cut to 60 characters with
    a trailing "…" if it was longer. The same keyword feeds the
    "Why it matters" sentence, which falls back to a generic phrase.

    Args:
        row: Mapping-like item (dict or DataFrame row) with the keys
            ``primary_keyword``, ``auto_keyword``, ``title``, ``category`` and
            ``link``; ``signal_rank`` and ``status`` are optional.

    Returns:
        Dict with the keys "Signal Name", "Category", "Example Link",
        "Why it matters", "Score" and "Status".
    """
    keyword = row["primary_keyword"] or row["auto_keyword"]
    if keyword:
        name = keyword
        theme = keyword
    else:
        title = row["title"]
        suffix = "…" if len(title) > 60 else ""
        name = title[:60] + suffix
        theme = "an emerging consumer desire"

    capsule = {}
    capsule["Signal Name"] = name
    capsule["Category"] = row["category"]
    capsule["Example Link"] = row["link"]
    capsule["Why it matters"] = "This hints at a shift toward " + theme
    capsule["Score"] = row.get("signal_rank", "")
    capsule["Status"] = row.get("status", "")
    return capsule

# Build one capsule per ranked item, in ranked order, and export them.
capsules = []
for _, ranked_row in final_out.iterrows():
    capsules.append(to_capsule(ranked_row))
caps_df = pd.DataFrame(capsules)
capsules_csv_path = os.path.join(OUT_DIR, "newsletter_capsules_plus.csv")
caps_df.to_csv(capsules_csv_path, index=False)
print("✅ Exported → outputs/newsletter_capsules_plus.csv")
caps_df.head(10)

## 7) Copy outputs to Google Drive (if mounted)

In [None]:
import glob, shutil
# SAVE_DIR only exists when the Drive-mount cell got past the mount call, so
# its presence in globals() doubles as the "Drive is mounted" flag.
if 'SAVE_DIR' not in globals() or not os.path.isdir(OUT_DIR):
    print("Drive not mounted or no outputs yet.")
else:
    csv_pattern = os.path.join(OUT_DIR, "*.csv")
    for f in glob.glob(csv_pattern):
        shutil.copy(f, SAVE_DIR)
    print("✅ Copied CSVs to", SAVE_DIR)