# NLP Processing Module

This notebook contains NLP utilities for entity extraction and coreference resolution using spaCy and Coreferee.
It reads article metadata and text files from `data/raw/` (referenced in the code as `../data/raw`, relative to the working directory).

In [None]:
import logging
from bisect import bisect_right
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Any, Optional

import coreferee
import nltk
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize

# Download required NLTK data
# Fetches the 'punkt' and 'punkt_tab' resources each time this cell runs;
# quiet=True suppresses the downloader's console output.
# NOTE(review): presumably these back the sent_tokenize call used when
# processing articles -- confirm which resource the installed NLTK needs.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Configure the root logger at INFO level and create the module-level logger
# used by the loading and processing functions in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data directories
# Relative path, so it is resolved against the current working directory.
# NOTE(review): the notebook text refers to `data/raw/`; the leading `..`
# presumably assumes the notebook runs from a sibling directory of `data/`
# -- confirm.
RAW_DATA_DIR = Path('../data/raw')

In [None]:
def load_articles_from_raw() -> List[dict]:
    """
    Load the articles listed in the raw-data metadata file.

    Reads ``articles_metadata.parquet`` from ``RAW_DATA_DIR`` and, for every
    row, the UTF-8 text file ``articles/<filename>`` beside it. Rows whose
    filename is missing/invalid or whose text file does not exist are skipped;
    the number of skipped rows is reported in a single warning.

    Returns:
        List of article dictionaries with 'url', 'title' and 'text' keys.
        The list is empty when the metadata file does not exist.
    """
    metadata_path = RAW_DATA_DIR / 'articles_metadata.parquet'
    articles_dir = RAW_DATA_DIR / 'articles'

    if not metadata_path.exists():
        logger.warning(f"No metadata found at {metadata_path}")
        return []

    meta_df = pd.read_parquet(metadata_path)
    articles = []
    skipped = 0

    for _, row in meta_df.iterrows():
        filename = row['filename']
        # A null/empty filename would either fail path joining (NaN) or
        # resolve to the articles directory itself (empty string).
        if not isinstance(filename, str) or not filename:
            skipped += 1
            continue

        txt_path = articles_dir / filename
        # is_file() rather than exists(): a directory cannot be read as text.
        if not txt_path.is_file():
            skipped += 1
            continue

        articles.append({
            'url': row['url'],
            'title': row['title'],
            'text': txt_path.read_text(encoding='utf-8')
        })

    if skipped:
        logger.warning(
            f"Skipped {skipped} metadata rows without a readable text file in {articles_dir}"
        )
    logger.info(f"Loaded {len(articles)} articles from {articles_dir}")
    return articles

In [None]:
@dataclass
class Entity:
    """A named-entity mention found in one sentence of an article.

    Attributes:
        name: Cleaned surface text of the entity.
        label: Entity label assigned by the NER model (e.g. 'PERSON', 'ORG').
        sent_idx: Index of the sentence the mention was extracted from.
        start: Start offset of the mention (spaCy span ``start``).
        end: End offset of the mention (spaCy span ``end``).
        url: URL of the article the mention came from.
    """
    name: str
    label: str
    sent_idx: int
    start: int
    end: int
    url: str

    def to_dict(self) -> dict:
        """Return the entity as a plain dictionary.

        Keys match the field names, except that ``url`` is emitted under the
        key 'urls'.
        """
        record = dict(
            name=self.name,
            label=self.label,
            sent_idx=self.sent_idx,
            start=self.start,
            end=self.end,
        )
        # NOTE(review): plural key for a single URL -- looks like a naming
        # mismatch with the `url` field, but consumers may rely on it; confirm
        # before renaming.
        record['urls'] = self.url
        return record


@dataclass
class CoreferenceChain:
    """One coreference chain found in an article.

    Attributes:
        chain_idx: Index of the chain.
        mentions: Mentions in the chain, each a (token_start, sent_idx) pair.
        url: URL of the article the chain belongs to.
    """
    chain_idx: int
    mentions: List[Tuple[int, int]]  # one (token_start, sent_idx) pair per mention
    url: str

In [None]:
class NLPProcessor:
    """
    NLP processor using spaCy and Coreferee for entity extraction
    and coreference resolution.

    Attributes:
        nlp: Loaded spaCy pipeline with the 'coreferee' component attached.
        target_labels: Entity labels kept during extraction.
    """

    # Possessive markers removed from entity names. The typographic
    # apostrophe (U+2019) is included because published article text often
    # uses it instead of the ASCII one.
    _POSSESSIVE_MARKERS = ("'s", "\u2019s")

    def __init__(self, model: str = 'en_core_web_lg'):
        """
        Initialize NLP processor.

        Args:
            model: spaCy model name (use 'lg' or 'trf' for better accuracy)
        """
        self.nlp = spacy.load(model)
        # add_pipe raises when a component of that name is already present,
        # so only attach Coreferee if the loaded pipeline lacks it.
        if 'coreferee' not in self.nlp.pipe_names:
            self.nlp.add_pipe('coreferee')
        self.target_labels = {'PERSON', 'ORG'}

    def clean_entity_name(self, text: str) -> str:
        """
        Normalize entity text into a canonical name.

        Removes possessive markers (ASCII and typographic apostrophe) and
        collapses every run of whitespace, including newlines, into a single
        space so that variants of the same name compare equal.

        Args:
            text: Raw entity text

        Returns:
            Cleaned name; may be empty if nothing but markers/whitespace remained
        """
        for marker in self._POSSESSIVE_MARKERS:
            text = text.replace(marker, "")
        # split() with no argument also trims leading/trailing whitespace.
        return " ".join(text.split())

    def extract_entities_from_sentence(
        self,
        sentence: str,
        sent_idx: int,
        url: str
    ) -> "List[Entity]":
        """
        Extract PERSON and ORG entities from a sentence.

        Args:
            sentence: Text of the sentence
            sent_idx: Index of the sentence in the document
            url: Source URL

        Returns:
            List of Entity objects; ``start``/``end`` are the spaCy span
            offsets within the Doc built from ``sentence``
        """
        # Coreference output is not used here, so skip that component for
        # the per-sentence pass.
        doc = self.nlp(sentence, disable=['coreferee'])
        entities = []

        for ent in doc.ents:
            if ent.label_ not in self.target_labels:
                continue
            name = self.clean_entity_name(ent.text)
            if not name:  # Skip names that cleaned down to nothing
                continue
            entities.append(Entity(
                name=name,
                label=ent.label_,
                sent_idx=sent_idx,
                start=ent.start,
                end=ent.end,
                url=url
            ))

        return entities

    def get_coreference_chains(self, doc) -> List[List[int]]:
        """
        Extract coreference chains from a spaCy doc.

        Args:
            doc: spaCy Doc object processed with the 'coreferee' component

        Returns:
            List of chains, where each chain is the flattened list of token
            indices of all its mentions. Empty when the doc has no chains.
        """
        chain_holder = doc._.coref_chains
        if not chain_holder:
            return []

        return [
            [token_idx for mention in chain for token_idx in mention.token_indexes]
            for chain in chain_holder
        ]

    def get_sentence_index_for_token(self, doc, token_idx: int) -> int:
        """
        Get the sentence index for a given token index.

        Args:
            doc: spaCy Doc object
            token_idx: Index of the token

        Returns:
            Index of the sentence in ``doc.sents`` whose [start, end) range
            contains the token, or -1 if no sentence contains it
        """
        for sent_idx, sent in enumerate(doc.sents):
            if sent.start <= token_idx < sent.end:
                return sent_idx
        return -1

In [None]:
def _sentence_char_spans(text: str, sentences: List[str]) -> Tuple[List[int], List[int]]:
    """Return parallel lists of start/end character offsets of each sentence in text."""
    starts: List[int] = []
    ends: List[int] = []
    cursor = 0
    for sentence in sentences:
        found = text.find(sentence, cursor)
        if found < 0:
            # Not a verbatim slice of the text: record an empty span so list
            # positions still line up with sentence indices.
            found = end = cursor
        else:
            end = found + len(sentence)
        starts.append(found)
        ends.append(end)
        cursor = end
    return starts, ends


def _sentence_index_at(char_offset: int, starts: List[int], ends: List[int]) -> int:
    """Return the index of the sentence span containing char_offset, or -1."""
    candidate = bisect_right(starts, char_offset) - 1
    if candidate >= 0 and char_offset < ends[candidate]:
        return candidate
    return -1


def process_article(processor: NLPProcessor, article: dict) -> Dict[str, Any]:
    """
    Process a single article to extract entities and coreferences.

    Sentences are produced by NLTK's ``sent_tokenize``. Entities are extracted
    per sentence, and coreference mentions (found on the full-document spaCy
    Doc) are mapped onto those same sentences by character offset, so that
    entity 'sent_idx' values and chain 'sentences' values share one numbering.

    Args:
        processor: NLPProcessor instance
        article: Article dictionary with 'url' and 'text' keys

    Returns:
        Dictionary with:
            'url': the article URL
            'entities': list of entity dicts (see ``Entity.to_dict``)
            'coreference_chains': list of dicts with 'chain_idx', 'sentences'
                (sorted sentence indices) and 'url', one per chain that
                spans more than one sentence
    """
    url = article['url']
    text = article['text']

    # Process full document for coreference
    doc = processor.nlp(text)

    # Get sentences
    sentences = sent_tokenize(text)

    # Extract entities from each sentence
    all_entities = []
    for sent_idx, sentence in enumerate(sentences):
        all_entities.extend(
            processor.extract_entities_from_sentence(sentence, sent_idx, url)
        )

    # Get coreference chains
    coref_chains = processor.get_coreference_chains(doc)

    # spaCy's own sentence boundaries can differ from NLTK's, so resolve each
    # mention token to an NLTK sentence through its character offset instead
    # of through doc.sents.
    sent_starts, sent_ends = _sentence_char_spans(text, sentences)

    coref_sentence_mapping = []
    for chain_idx, chain in enumerate(coref_chains):
        chain_sentences = set()
        for token_idx in chain:
            sent_idx = _sentence_index_at(doc[token_idx].idx, sent_starts, sent_ends)
            if sent_idx >= 0:
                chain_sentences.add(sent_idx)

        if len(chain_sentences) > 1:  # Only interested in cross-sentence chains
            coref_sentence_mapping.append({
                'chain_idx': chain_idx,
                # Sorted so the output does not depend on set iteration order.
                'sentences': sorted(chain_sentences),
                'url': url
            })

    return {
        'url': url,
        'entities': [e.to_dict() for e in all_entities],
        'coreference_chains': coref_sentence_mapping
    }

In [None]:
def process_articles(
    articles: List[dict],
    processor: Optional["NLPProcessor"] = None,
) -> List[Dict[str, Any]]:
    """
    Process multiple articles for NLP analysis.

    Articles without text are skipped. An article whose processing raises is
    logged (with traceback) and skipped, so one bad article does not abort the
    batch.

    Args:
        articles: List of article dictionaries
        processor: Optional NLPProcessor to reuse. When omitted, one is
            created on demand the first time an article with text is seen,
            so the model is not loaded for an empty batch.

    Returns:
        List of processed article data
    """
    results = []

    for article in articles:
        if not article.get('text'):
            continue

        # Model loading stays outside the try block: a missing model is a
        # setup error and should propagate, not be logged per article.
        if processor is None:
            processor = NLPProcessor()

        try:
            results.append(process_article(processor, article))
        except Exception as e:
            logger.warning(
                f"Failed to process article {article.get('url')}: {e}",
                exc_info=True,
            )

    logger.info(f"Successfully processed {len(results)}/{len(articles)} articles")
    return results

## Example Usage

In [None]:
# Example: Process sample text
# processor = NLPProcessor()
# sample_article = {
#     'url': 'https://example.com/article',
#     'text': 'Elon Musk announced that Tesla will expand operations. He said the company plans to hire more engineers.'
# }
# result = process_article(processor, sample_article)
# print(result)