# In [None]:
# 02_dependency_aspect_extractor

# 1. Imports 
import sys
from pathlib import Path

# Add project root so that src/ module is importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel session.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
print("Project root:", PROJECT_ROOT)

import spacy
import pandas as pd
from collections import Counter

# Load spaCy model
# `nlp` is the shared pipeline object used further down to turn each cleaned
# sentence into a parsed doc; the extractor reads token.pos_, token.head and
# token.children from those docs.
nlp = spacy.load("en_core_web_sm")


# 2. Load the processed sentences written by Notebook 01
from src.config import PROCESSED_DIR

proc_file = PROCESSED_DIR / 'semeval_processed.parquet'
if not proc_file.exists():
    # Fall back to an empty frame with the expected columns so that the
    # remaining cells can still execute.
    print('Warning: processed file not found; ensure notebook 1 ran successfully.')
    df = pd.DataFrame(columns=['id', 'sentence', 'aspects', 'sentence_clean'])
else:
    df = pd.read_parquet(proc_file)

# 3. Dependency-based extractor 
def extract_opinion_words(doc, aspect_term):
    """
    Find opinion-bearing words for a given aspect term using dependency parsing.

    A token is reported as an opinion word when it is tagged ``ADJ`` or
    ``VERB`` and is directly attached to the aspect in the dependency tree,
    i.e. the aspect is either its head or one of its children.

    The aspect is located by comparing its whitespace-separated words,
    case-insensitively, against consecutive tokens of ``doc``. This lets
    multi-word aspects such as "battery life" match; every occurrence in the
    sentence is considered. Tokens that are part of the aspect itself are
    never returned as opinions (e.g. "hard" in the aspect "hard drive").

    Args:
        doc: Parsed sentence; an iterable of tokens exposing ``text``,
            ``pos_``, ``head``, ``children`` and ``i`` (e.g. a spaCy ``Doc``).
        aspect_term: Aspect surface form, one or more whitespace-separated
            words.

    Returns:
        List of unique opinion word texts, ordered by first appearance in the
        sentence (deterministic across runs). Empty when the aspect does not
        occur in ``doc`` or nothing attaches to it.
    """
    aspect_words = aspect_term.lower().split()
    if not aspect_words:
        return []

    tokens = list(doc)
    lowered = [tok.text.lower() for tok in tokens]
    width = len(aspect_words)

    # Indices (token.i) of every token that belongs to an occurrence of the
    # aspect term in the sentence.
    aspect_positions = set()
    for start in range(len(tokens) - width + 1):
        if lowered[start:start + width] == aspect_words:
            aspect_positions.update(tok.i for tok in tokens[start:start + width])
    if not aspect_positions:
        return []

    # A dict is used as an insertion-ordered set so the output order follows
    # the sentence instead of depending on string hashing.
    opinions = {}
    for tok in tokens:
        if tok.pos_ not in ("ADJ", "VERB"):
            continue
        if tok.i in aspect_positions:
            # Part of the aspect itself, not an opinion about it.
            continue
        attached = tok.head.i in aspect_positions or any(
            child.i in aspect_positions for child in tok.children
        )
        if attached:
            opinions.setdefault(tok.text, None)

    return list(opinions)

# 4. Apply extractor in batch and store in new column
from tqdm import tqdm

parsed_relations = []

# nlp.pipe processes the sentences as a stream in batches and yields the docs
# in input order, which avoids the per-call overhead of nlp(text) on each row.
docs = nlp.pipe(df["sentence_clean"].tolist())

for doc, aspect_list in tqdm(zip(docs, df["aspects"]), total=len(df)):
    extracted = []

    for asp in aspect_list:
        asp_type = asp["type"]

        if asp_type == "term":
            # Explicit aspect term: look up its opinion words in the parse.
            term = asp["term"]
            extracted.append({
                "term": term,
                "polarity": asp["polarity"],
                "opinions": extract_opinion_words(doc, term),
            })

        elif asp_type == "category":
            # Category-level aspects have no surface form in the sentence,
            # so there is nothing to look up in the dependency tree.
            extracted.append({
                "category": asp["category"],
                "polarity": asp["polarity"],
                "opinions": [],
            })
        # Aspects of any other type are skipped.

    parsed_relations.append(extracted)

# One result list per input row, in row order.
assert len(parsed_relations) == len(df)

df["dep_aspects"] = parsed_relations
df[["sentence_clean", "aspects", "dep_aspects"]].head(10)


# 5. Sanity check
num_sentences = len(df)
print("Total sentences:", num_sentences)

# Sum the number of opinion words over every aspect of every sentence.
total_opinions = df["dep_aspects"].apply(
    lambda x: sum(len(a.get("opinions", [])) for a in x)
).sum()

print("Total extracted opinion words:", total_opinions)
# df is empty when the processed file was missing (the fallback in step 2);
# guard the division so the notebook does not fail with a division by zero.
if num_sentences:
    print("Average opinions per sentence:", total_opinions / num_sentences)
else:
    print("Average opinions per sentence: n/a (no sentences loaded)")


# 6. Persist the enriched frame (skipped when nothing was loaded)
output_file = PROCESSED_DIR / 'semeval_with_dep_aspects.parquet'
if len(df) != 0:
    df.to_parquet(output_file, index=False)

# Preview of the first rows with the new column
df[["sentence_clean", "aspects", "dep_aspects"]].head(10)


