# Clip-Searcher Pipeline & App

This notebook runs the complete Clip-Searcher pipeline:
1. Scrape news articles using DuckDuckGo
2. Process with NLP (entity extraction + coreference)
3. Build relationship graph
4. Visualize with PyVis or launch Gradio app

Intermediate data is saved to `../data/raw/` (relative to the notebook's working directory) between steps.

In [None]:
# Configuration
# Search phrase handed to search_news() by the scraping cell.
SEARCH_QUERY: str = "artificial intelligence"
# Number of news results requested from the search (passed as `count`).
ARTICLE_COUNT: int = 20

## Step 1: Scraping

In [None]:
from ddgs import DDGS
from newspaper import Article
from typing import List, Optional, Dict, Set, Tuple, Any
from pathlib import Path
import pandas as pd
import hashlib
import logging

# Configure logging at INFO level and create the logger used by the helpers below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory where the pipeline stores its intermediate files. It is created
# here (including missing parents) so the save helpers can write into it.
RAW_DATA_DIR: Path = Path('../data/raw')
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
def search_news(query: str, count: int = 50) -> List[str]:
    """Query DuckDuckGo News and return the result URLs.

    Args:
        query: Search phrase.
        count: Maximum number of results to request.

    Returns:
        The non-empty ``url`` values of the results. Any exception raised
        while searching is logged and an empty list is returned instead.
    """
    found: List[str] = []
    try:
        with DDGS() as client:
            hits = client.news(query, max_results=count, backend='auto')
            collected = []
            for hit in hits:
                link = hit.get('url')
                if link:
                    collected.append(link)
            # Assign only once the whole result set was consumed successfully.
            found = collected
        logger.info(f"Found {len(found)} URLs for query: {query}")
    except Exception as exc:
        logger.error(f"Search failed: {exc}")
    return found

def download_article(url: str) -> Optional[dict]:
    """Fetch one article and parse it.

    Args:
        url: Address of the article.

    Returns:
        A dict with ``url``, ``title`` and ``text`` keys, or ``None`` when
        downloading or parsing raised (the failure is logged as a warning).
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        record = {'url': url, 'title': page.title, 'text': page.text}
    except Exception as exc:
        logger.warning(f"Failed to download {url}: {exc}")
        record = None
    return record

def download_articles(urls: List[str]) -> List[dict]:
    """Fetch every URL and keep the articles that have text.

    Args:
        urls: Article addresses to download.

    Returns:
        The article dicts whose download succeeded and whose ``text`` is
        non-empty, in input order.
    """
    fetched = []
    for link in urls:
        item = download_article(link)
        # Skip failed downloads (None) and articles with no text.
        if item and item.get('text'):
            fetched.append(item)
    logger.info(f"Downloaded {len(fetched)}/{len(urls)} articles")
    return fetched

def save_urls_to_raw(urls: List[str], query: str) -> Path:
    """Write the URL list to ``urls.parquet`` inside ``RAW_DATA_DIR``.

    Args:
        urls: URLs to store, one row each.
        query: Search phrase stored alongside every URL.

    Returns:
        Path of the parquet file that was written.
    """
    target = RAW_DATA_DIR / 'urls.parquet'
    pd.DataFrame({'url': urls, 'query': query}).to_parquet(target, index=False)
    logger.info(f"Saved {len(urls)} URLs to {target}")
    return target

def save_articles_to_raw(articles: List[dict]) -> Path:
    """Store article bodies as text files plus a parquet metadata index.

    Each article text goes to ``<RAW_DATA_DIR>/articles/<hash>.txt``, where
    ``<hash>`` is the first 12 hex digits of the MD5 of the article URL. The
    URL, title and file name of every article are written to
    ``articles_metadata.parquet``.

    Args:
        articles: Dicts with ``url``, ``title`` and ``text`` keys.

    Returns:
        The directory holding the text files.
    """
    out_dir = RAW_DATA_DIR / 'articles'
    out_dir.mkdir(exist_ok=True)

    rows = []
    for item in articles:
        link = item['url']
        digest = hashlib.md5(link.encode()).hexdigest()[:12]
        filename = f"{digest}.txt"
        (out_dir / filename).write_text(item['text'], encoding='utf-8')
        rows.append({'url': link, 'title': item['title'], 'filename': filename})

    index_frame = pd.DataFrame(rows)
    index_frame.to_parquet(RAW_DATA_DIR / 'articles_metadata.parquet', index=False)
    logger.info(f"Saved {len(articles)} articles to {out_dir}")
    return out_dir

In [None]:
# Run scraping
# Search for news URLs with the configured query and result count.
urls = search_news(SEARCH_QUERY, count=ARTICLE_COUNT)
# Persist the URL list before downloading.
save_urls_to_raw(urls, SEARCH_QUERY)
# Download the articles; failed or text-less downloads are dropped.
articles = download_articles(urls)
# Write article texts and their metadata index under RAW_DATA_DIR.
save_articles_to_raw(articles)
print(f"Scraped and saved {len(articles)} articles")

## Step 2: NLP Processing

In [None]:
import spacy
import coreferee
from nltk.tokenize import sent_tokenize
import nltk
from dataclasses import dataclass

# Fetch the NLTK 'punkt' and 'punkt_tab' resources; quiet=True is passed to
# keep the download output out of the notebook.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

@dataclass
class Entity:
    """A named-entity mention found in one sentence of one article."""

    # Cleaned entity text.
    name: str
    # Entity label assigned by the NER model.
    label: str
    # Index of the sentence the mention was found in.
    sent_idx: int
    # Span start/end as reported for the processed sentence.
    start: int
    end: int
    # URL of the article the mention came from.
    url: str

    def to_dict(self) -> dict:
        """Return the entity as a plain dict.

        The article URL is stored under the key ``'urls'`` (not ``'url'``);
        the graph-building code looks entities up by that key.
        """
        payload = dict(
            name=self.name,
            label=self.label,
            sent_idx=self.sent_idx,
            start=self.start,
            end=self.end,
        )
        payload['urls'] = self.url
        return payload

class NLPProcessor:
    """spaCy pipeline wrapper for entity extraction and coreference chains."""

    def __init__(self, model: str = 'en_core_web_lg'):
        """Load the spaCy model and append the ``coreferee`` pipe.

        Args:
            model: Name of the spaCy model to load.
        """
        self.nlp = spacy.load(model)
        self.nlp.add_pipe('coreferee')
        # Only entities with these labels are kept by extract_entities().
        self.target_labels = {'PERSON', 'ORG'}

    def clean_entity_name(self, text: str) -> str:
        """Normalize an entity string.

        Newlines become spaces, possessive markers are removed and the
        result is stripped. Both the ASCII (``'s``) and the typographic
        (``\u2019s``) possessive are handled, so the same entity does not
        end up as two differently spelled nodes.
        """
        cleaned = text.replace('\n', ' ')
        for possessive in ("'s", "\u2019s"):
            cleaned = cleaned.replace(possessive, "")
        return cleaned.strip()

    def extract_entities(self, sentence: str, sent_idx: int, url: str) -> List["Entity"]:
        """Run the pipeline on one sentence and collect target entities.

        Args:
            sentence: Sentence text to analyse.
            sent_idx: Index recorded on every returned entity.
            url: Article URL recorded on every returned entity.

        Returns:
            One ``Entity`` per detected entity whose label is in
            ``self.target_labels`` and whose cleaned name is non-empty.
        """
        entities = []
        for ent in self.nlp(sentence).ents:
            if ent.label_ not in self.target_labels:
                continue
            name = self.clean_entity_name(ent.text)
            if name:
                entities.append(Entity(name, ent.label_, sent_idx, ent.start, ent.end, url))
        return entities

    def get_coreference_chains(self, doc) -> List[List[int]]:
        """Flatten each coreference chain of ``doc`` into token indices.

        Returns:
            One list per chain holding the ``token_indexes`` of all its
            mentions, or an empty list when ``doc`` has no chains.
        """
        if not doc._.coref_chains:
            return []
        return [
            [token_idx for mention in chain for token_idx in mention.token_indexes]
            for chain in doc._.coref_chains
        ]

    def get_sentence_for_token(self, doc, token_idx: int) -> int:
        """Return the index in ``doc.sents`` of the sentence containing a token.

        Returns ``-1`` when no sentence span covers ``token_idx``.
        """
        for sent_idx, sent in enumerate(doc.sents):
            if sent.start <= token_idx < sent.end:
                return sent_idx
        return -1

def load_articles_from_raw() -> List[dict]:
    """Read back the articles stored under ``RAW_DATA_DIR``.

    Returns:
        One dict (``url``, ``title``, ``text``) per metadata row whose text
        file still exists. An empty list is returned when the metadata
        parquet file is missing.
    """
    index_path = RAW_DATA_DIR / 'articles_metadata.parquet'
    text_dir = RAW_DATA_DIR / 'articles'

    if not index_path.exists():
        return []

    loaded = []
    for _, record in pd.read_parquet(index_path).iterrows():
        body_path = text_dir / record['filename']
        if not body_path.exists():
            # Metadata row without a matching text file: skip it.
            continue
        body = body_path.read_text(encoding='utf-8')
        loaded.append({'url': record['url'], 'title': record['title'], 'text': body})
    return loaded

In [None]:
# Run NLP processing
articles = load_articles_from_raw()
processor = NLPProcessor()

all_entities = []
all_coref_chains = []

for article in articles:
    if not article.get('text'):
        continue

    url = article['url']
    text = article['text']
    doc = processor.nlp(text)

    # Use spaCy's own sentence segmentation for entity extraction. The
    # coreference chains below are mapped to sentences through doc.sents,
    # so entity sentence indices must come from the same segmentation.
    # Indices from a different tokenizer can drift, and the coreference
    # edges would then join entities from unrelated sentences.
    for sent_idx, sent in enumerate(doc.sents):
        for ent in processor.extract_entities(sent.text, sent_idx, url):
            all_entities.append(ent.to_dict())

    chains = processor.get_coreference_chains(doc)
    for chain_idx, chain in enumerate(chains):
        # Sentences touched by any mention token of this chain.
        chain_sents = set()
        for token_idx in chain:
            sent_idx = processor.get_sentence_for_token(doc, token_idx)
            if sent_idx >= 0:
                chain_sents.add(sent_idx)
        # Only chains spanning more than one sentence add information
        # beyond the same-sentence edges.
        if len(chain_sents) > 1:
            all_coref_chains.append({'chain_idx': chain_idx, 'sentences': list(chain_sents), 'url': url})

print(f"Extracted {len(all_entities)} entities and {len(all_coref_chains)} coreference chains")

## Step 3: Graph Building

In [None]:
from itertools import combinations

@dataclass
class Edge:
    """An undirected relationship between two entity names.

    Equality and hashing consider only the unordered pair of endpoints, so
    ``Edge(a, b, ...)`` and ``Edge(b, a, ...)`` are the same element in a
    set regardless of ``edge_type``.
    """

    source: str
    target: str
    # "<source label>-<target label>", e.g. "PERSON-ORG".
    edge_type: str

    def _endpoints(self) -> frozenset:
        """Return the unordered endpoint pair used for equality and hashing."""
        return frozenset((self.source, self.target))

    def to_dict(self) -> dict:
        """Return the edge as a dict with ``source``, ``target`` and ``type`` keys."""
        return {'source': self.source, 'target': self.target, 'type': self.edge_type}

    def __hash__(self):
        return hash(self._endpoints())

    def __eq__(self, other):
        if not isinstance(other, Edge):
            # Let Python try the reflected comparison instead of forcing
            # False; the final result for unrelated types is still False.
            return NotImplemented
        return self._endpoints() == other._endpoints()

def is_valid_pair(e1: dict, e2: dict) -> bool:
    """Decide whether two entity dicts may be linked by an edge.

    A pair is rejected when both names are equal, when either name is
    empty, or when both entities are labelled ``ORG``.
    """
    first, second = e1['name'], e2['name']
    if first == second:
        return False
    if not first or not second:
        return False
    both_orgs = e1['label'] == 'ORG' and e2['label'] == 'ORG'
    return not both_orgs

def extract_edges_from_entities(entities: List[dict], coref_chains: List[dict]) -> Set[Edge]:
    """Derive relationship edges from entity mentions and coreference chains.

    Two kinds of edges are produced:

    * same-sentence edges, between every valid pair of entities that share
      an article URL and sentence index;
    * coreference edges, between every valid pair of entities found in any
      of the sentences a coreference chain spans.

    Args:
        entities: Entity dicts carrying ``name``, ``label``, ``sent_idx``
            and ``urls`` keys.
        coref_chains: Chain dicts carrying ``url`` and ``sentences`` keys.

    Returns:
        The set of edges; duplicates collapse according to ``Edge`` equality.
    """
    # Group entities by (article URL, sentence index).
    grouped: Dict[Tuple[str, int], List[dict]] = {}
    for item in entities:
        grouped.setdefault((item['urls'], item['sent_idx']), []).append(item)

    found: Set[Edge] = set()

    def link_all(group: List[dict]) -> None:
        # Add an edge for every valid pair; fewer than two members yields no pairs.
        for left, right in combinations(group, 2):
            if is_valid_pair(left, right):
                found.add(Edge(left['name'], right['name'], f"{left['label']}-{right['label']}"))

    # Same-sentence edges
    for group in grouped.values():
        link_all(group)

    # Coreference edges
    for chain in coref_chains:
        pooled = []
        for idx in chain['sentences']:
            pooled.extend(grouped.get((chain['url'], idx), []))
        link_all(pooled)

    return found

In [None]:
# Build graph
# Derive the edge set from the entities and coreference chains collected above.
edges = extract_edges_from_entities(all_entities, all_coref_chains)
# An empty edge set still yields a frame with the expected columns.
edges_df = pd.DataFrame([e.to_dict() for e in edges]) if edges else pd.DataFrame(columns=['source', 'target', 'type'])
print(f"Built graph with {len(edges_df)} edges")
# Last expression of the cell: preview of the first rows.
edges_df.head(10)

## Step 4: Visualization

In [None]:
import networkx as nx
from pyvis.network import Network

# Colour lookup used by the graph renderers: node colours are looked up by
# entity label ('PERSON', 'ORG'), edge colours by edge type ('<label>-<label>').
# Keys that are missing here fall back to a default chosen at the call site.
ENTITY_COLORS: Dict[str, str] = {
    'PERSON': '#4CAF50', 'ORG': '#2196F3',
    'PERSON-ORG': '#FF9800', 'ORG-PERSON': '#FF9800', 'PERSON-PERSON': '#9C27B0'
}

def infer_node_types(edges_df: pd.DataFrame) -> Dict[str, str]:
    """Infer each node's entity label from the edge types.

    An edge type has the form ``"<source label>-<target label>"``. The first
    label seen for a node wins; later edges do not override it.

    Rows whose ``type`` is not a string or contains no hyphen are skipped.
    Only the first hyphen separates the two labels, so a type with extra
    hyphens no longer raises.

    Args:
        edges_df: Frame with ``source``, ``target`` and ``type`` columns.

    Returns:
        Mapping of node name to inferred label.
    """
    node_types: Dict[str, str] = {}
    for _, row in edges_df.iterrows():
        edge_type = row.get('type', '')
        if not isinstance(edge_type, str):
            continue
        source_type, separator, target_type = edge_type.partition('-')
        if not separator:
            continue
        node_types.setdefault(row['source'], source_type)
        node_types.setdefault(row['target'], target_type)
    return node_types

def create_visualization(edges_df: pd.DataFrame) -> Network:
    """Build a notebook-mode PyVis network from the edge table.

    Nodes are coloured by their inferred entity label (``'PERSON'`` when
    none was inferred) and edges by their ``type``, using ``ENTITY_COLORS``.

    Args:
        edges_df: Frame with ``source``, ``target`` and ``type`` columns.

    Returns:
        The configured ``Network``; the caller is responsible for showing it.
    """
    graph = nx.from_pandas_edgelist(edges_df, 'source', 'target', edge_attr='type')
    kinds = infer_node_types(edges_df)

    view = Network(
        height='600px',
        width='100%',
        bgcolor='#222222',
        font_color='white',
        notebook=True,
        cdn_resources='in_line',
    )

    for name in graph.nodes():
        kind = kinds.get(name, 'PERSON')
        fill = ENTITY_COLORS.get(kind, '#FFFFFF')
        view.add_node(name, label=name, color=fill, title=f"{name} ({kind})")

    for src, dst, attrs in graph.edges(data=True):
        stroke = ENTITY_COLORS.get(attrs.get('type', ''), '#888888')
        view.add_edge(src, dst, color=stroke)

    view.repulsion(
        node_distance=420,
        central_gravity=0.33,
        spring_length=110,
        spring_strength=0.10,
        damping=0.95,
    )
    return view

In [None]:
# Create and display visualization
# Only render when at least one edge exists; otherwise print a notice.
if not edges_df.empty:
    net = create_visualization(edges_df)
    net.show('graph.html')
else:
    print("No edges to visualize")

In [None]:
# Save edges for app
# load_existing_graph() reads this file back from the same location.
edges_df.to_parquet(RAW_DATA_DIR / 'edges.parquet', index=False)
print(f"Saved edges to {RAW_DATA_DIR / 'edges.parquet'}")

## Gradio App (Optional)

Launch an interactive web interface.

In [None]:
import gradio as gr
import tempfile

def create_graph_html(edges_df: pd.DataFrame) -> str:
    """Create a PyVis graph from the edge table and return its HTML.

    Args:
        edges_df: Frame with ``source``, ``target`` and ``type`` columns.

    Returns:
        The HTML document produced by PyVis, or a short ``<h3>`` notice when
        ``edges_df`` is empty.

    NOTE(review): the return value is a complete HTML document including
    scripts; confirm that the component displaying it executes them.
    """
    if edges_df.empty:
        return "<h3>No relationships found</h3>"

    G = nx.from_pandas_edgelist(edges_df, 'source', 'target', edge_attr='type')
    node_types = infer_node_types(edges_df)

    net = Network(height='600px', width='100%', bgcolor='#222222', font_color='white',
                  notebook=False, cdn_resources='in_line')

    for node in G.nodes():
        # Nodes without an inferred label are treated as PERSON.
        ntype = node_types.get(node, 'PERSON')
        net.add_node(node, label=node, color=ENTITY_COLORS.get(ntype, '#FFFFFF'),
                     title=f"{node} ({ntype})")

    for s, t, data in G.edges(data=True):
        etype = data.get('type', '')
        net.add_edge(s, t, color=ENTITY_COLORS.get(etype, '#888888'))

    net.repulsion(node_distance=420, central_gravity=0.33, spring_length=110,
                  spring_strength=0.10, damping=0.95)

    # Render into a throwaway directory that is removed on exit, so no temp
    # file is left behind per call. This also avoids handing PyVis the name
    # of a file that this process still holds open.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = Path(tmp_dir) / 'graph.html'
        net.save_graph(str(html_path))
        return html_path.read_text()

In [None]:
# Module-level cache holding the single NLPProcessor shared by the Gradio callbacks.
_nlp_processor = None

def get_processor():
    """Return the shared ``NLPProcessor``, creating it on first use."""
    global _nlp_processor
    if _nlp_processor is not None:
        return _nlp_processor
    _nlp_processor = NLPProcessor()
    return _nlp_processor

def run_pipeline(query: str, count: int, progress=gr.Progress()) -> str:
    """Run the full pipeline and return graph HTML.

    Searches for news, downloads the articles, extracts entities and
    coreference chains, builds the edge table, saves it to
    ``edges.parquet`` under ``RAW_DATA_DIR`` and renders it.

    Args:
        query: Search phrase.
        count: Number of articles to request; coerced to ``int`` because a
            UI slider may deliver the value as a float.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        HTML of the rendered graph, or a short ``<h3>`` notice when no
        article could be downloaded.
    """
    progress(0.1, desc="Searching DuckDuckGo News...")
    urls = search_news(query, count=int(count))

    progress(0.3, desc=f"Downloading {len(urls)} articles...")
    articles = download_articles(urls)

    if not articles:
        return "<h3>No articles could be downloaded</h3>"

    progress(0.5, desc="Processing with NLP...")
    proc = get_processor()
    all_entities, all_coref_chains = [], []

    for article in articles:
        if not article.get('text'):
            continue
        url, text = article['url'], article['text']
        doc = proc.nlp(text)

        # Entities are indexed by spaCy sentence (doc.sents), the same
        # segmentation get_sentence_for_token() uses for coreference
        # mentions. A second tokenizer would make the two index spaces
        # disagree and link entities from the wrong sentences.
        for sent_idx, sent in enumerate(doc.sents):
            for ent in proc.extract_entities(sent.text, sent_idx, url):
                all_entities.append(ent.to_dict())

        for chain_idx, chain in enumerate(proc.get_coreference_chains(doc)):
            chain_sents = {proc.get_sentence_for_token(doc, t) for t in chain}
            # -1 marks tokens that no sentence span covers.
            chain_sents.discard(-1)
            if len(chain_sents) > 1:
                all_coref_chains.append({'chain_idx': chain_idx, 'sentences': list(chain_sents), 'url': url})

    progress(0.8, desc="Building graph...")
    edges = extract_edges_from_entities(all_entities, all_coref_chains)
    edges_df = pd.DataFrame([e.to_dict() for e in edges]) if edges else pd.DataFrame(columns=['source', 'target', 'type'])
    edges_df.to_parquet(RAW_DATA_DIR / 'edges.parquet', index=False)

    progress(0.9, desc="Creating visualization...")
    return create_graph_html(edges_df)

def load_existing_graph() -> str:
    """Render the edge table previously saved as ``edges.parquet``.

    Returns:
        Graph HTML built from the stored edges, or a short ``<h3>`` notice
        when the parquet file does not exist.
    """
    stored = RAW_DATA_DIR / 'edges.parquet'
    if stored.exists():
        return create_graph_html(pd.read_parquet(stored))
    return "<h3>No pre-computed graph found. Run the pipeline first.</h3>"

In [None]:
# Launch Gradio app
# Two tabs: one runs the whole pipeline for a new query, the other renders
# the edge table saved by an earlier run.
with gr.Blocks(title="clip-search.ai", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🔍 clip-search.ai")
    gr.Markdown("Entity Relationship Graph Generator from News Articles")
    
    with gr.Tab("Search & Generate"):
        with gr.Row():
            query_input = gr.Textbox(label="Search Query", placeholder="e.g., artificial intelligence")
            count_input = gr.Slider(minimum=10, maximum=100, value=20, step=10, label="Number of Articles")
        search_btn = gr.Button("🚀 Search and Build Graph", variant="primary")
        graph_output = gr.HTML(label="Entity Relationship Graph")
        # The button passes the query and article count to run_pipeline and
        # shows the returned HTML.
        search_btn.click(fn=run_pipeline, inputs=[query_input, count_input], outputs=graph_output)
    
    with gr.Tab("Load Existing"):
        gr.Markdown("Load a previously generated graph from `data/raw/edges.parquet`")
        load_btn = gr.Button("📂 Load Graph", variant="secondary")
        existing_graph = gr.HTML(label="Loaded Graph")
        # No inputs: load_existing_graph reads the saved parquet file itself.
        load_btn.click(fn=load_existing_graph, outputs=existing_graph)

app.launch()