# In [None]:
#  01_data_loading_and_cleanup.ipynb

# 1. Environment setup
# (install requirements first using terminal)

# 2. Imports
import os
import re
import sys
import unicodedata
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd

# 3. Add project root to Python path
# NOTE(review): this takes the parent of the current working directory as the
# project root, i.e. it assumes the notebook is launched from a direct
# sub-directory of the project — confirm against the repo layout.
PROJECT_ROOT = Path(os.getcwd()).parent
# Guard against piling up duplicate entries when the cell is re-run.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("Project root added:", PROJECT_ROOT)

# `src` can only be resolved once the project root is on sys.path, so this
# import must come after the path setup above rather than with the other imports.
from src.config import RAW_DIR, PROCESSED_DIR  # noqa: E402


# 4. Data loader function for SemEval XML

def load_semeval(path):
    """Parse a SemEval-style XML file into one DataFrame row per <sentence>.

    Args:
        path: Anything ``xml.etree.ElementTree.parse`` accepts (a file name
            or a file object).

    Returns:
        pandas.DataFrame with the columns ``id``, ``sentence`` and
        ``aspects``. ``sentence`` is always a string (``""`` when the
        ``<text>`` element is missing or empty). ``aspects`` is a list of
        dicts: ``{"type": "term", "term", "polarity"}`` for every
        ``<aspectTerm>`` and ``{"type": "category", "category", "polarity"}``
        for every ``<aspectCategory>``. The three columns are present even
        when the file contains no sentences.
    """
    columns = ["id", "sentence", "aspects"]
    root = ET.parse(path).getroot()

    rows = []
    for sentence in root.iter("sentence"):
        text_node = sentence.find("text")
        # Element.text is None for an empty <text/> element; normalise both
        # "missing" and "empty" to "" so the column holds strings only.
        text = (text_node.text or "") if text_node is not None else ""

        aspects = []

        # aspectTerms
        terms_node = sentence.find("aspectTerms")
        if terms_node is not None:
            aspects.extend(
                {
                    "type": "term",
                    "term": term.attrib.get("term"),
                    "polarity": term.attrib.get("polarity"),
                }
                for term in terms_node.findall("aspectTerm")
            )

        # aspectCategories
        categories_node = sentence.find("aspectCategories")
        if categories_node is not None:
            aspects.extend(
                {
                    "type": "category",
                    "category": cat.attrib.get("category"),
                    "polarity": cat.attrib.get("polarity"),
                }
                for cat in categories_node.findall("aspectCategory")
            )

        rows.append({
            "id": sentence.attrib.get("id"),
            "sentence": text,
            "aspects": aspects,
        })

    # Passing the column list keeps the schema stable for an empty file.
    return pd.DataFrame(rows, columns=columns)


# 5. Example: load dataset
semeval_path = RAW_DIR / 'Restaurants_Train.xml'
if not semeval_path.exists():
    # File is missing: fall back to an empty frame with the expected columns
    # so the rest of the notebook can still run.
    df = pd.DataFrame(columns=['id', 'sentence', 'aspects'])
    print(f"Warning: {semeval_path} not found. Edit RAW_DIR and add file.")
else:
    df = load_semeval(semeval_path)

# 6. Quick inspection
print('Total sentences:', len(df))
if len(df):
    # Only show a preview when at least one sentence was loaded.
    display(df.head())

# 7. Basic validation 

def count_aspect_stats(df):
    """Summarise how many aspects the loaded sentences carry.

    Args:
        df: DataFrame with an ``aspects`` column. Cells that are not a
            ``list`` are counted as having zero aspects.

    Returns:
        dict of plain ``int`` values with the keys ``total_sentences``
        (number of rows), ``aspect_mentions`` (total number of aspect
        entries over all rows) and ``sentences_with_aspects`` (rows with at
        least one aspect).
    """
    # Compute the per-row aspect count once and derive both aggregates from it.
    aspects_per_sentence = [
        len(cell) if isinstance(cell, list) else 0 for cell in df['aspects']
    ]
    return {
        'total_sentences': len(df),
        'aspect_mentions': sum(aspects_per_sentence),
        'sentences_with_aspects': sum(1 for n in aspects_per_sentence if n > 0),
    }

print(count_aspect_stats(df))

# 8. Save a copy of raw and a lightweight CSV for quick use
raw_out = RAW_DIR / 'semeval_loaded_raw.parquet'
proc_out = PROCESSED_DIR / 'semeval_sentences.csv'

if len(df) > 0:
    # The processed directory may not exist yet on a fresh checkout; create
    # it so the CSV write below does not fail.
    proc_out.parent.mkdir(parents=True, exist_ok=True)

    df.to_parquet(raw_out, index=False)
    # Save a small CSV with aspects as string for quick preview
    # (the list-of-dict column is stringified so it fits in a flat CSV cell).
    df_preview = df.copy()
    df_preview['aspects_str'] = df_preview['aspects'].apply(str)
    df_preview[['id','sentence','aspects_str']].to_csv(proc_out, index=False)
    print('Saved raw parquet ->', raw_out)
    print('Saved preview csv ->', proc_out)

# 9. Basic cleaning utilities

def normalize_unicode(text):
    """Return *text* converted to Unicode normalization form NFKC."""
    normalized = unicodedata.normalize("NFKC", text)
    return normalized

def fix_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def normalize_elongation(text):
    """Shorten runs of three or more identical characters to two.

    Example: ``"soooo"`` -> ``"soo"``.

    Digits are deliberately left alone: squeezing them would change numeric
    values (``"$1000"`` would otherwise become ``"$100"``). Newlines are
    excluded as well, matching the behaviour of ``.`` in the previous pattern.

    Args:
        text: String to process.

    Returns:
        The string with elongated non-digit runs reduced to two characters.
    """
    # Reduce characters repeated more than twice: "soooo" → "soo"
    return re.sub(r"([^\d\n])\1{2,}", r"\1\1", text)

def normalize_punctuation(text):
    """Cap runs of the same ``!``, ``?``, ``.`` or ``,`` at two characters.

    Example: ``"!!!"`` -> ``"!!"``. A run of exactly two is left as it is.
    """
    pattern = r"([!?.,])\1{1,}"
    doubled = r"\1\1"
    return re.sub(pattern, doubled, text)

def _case_preserving(replacement):
    """Build a ``re.sub`` callback that inserts *replacement* in the matched word's casing."""
    def _replace(match):
        found = match.group(0)
        if found.isupper():
            return replacement.upper()
        if found[:1].isupper():
            return replacement.capitalize()
        return replacement
    return _replace


# Known misspellings, compiled once at import time. Each entry pairs a
# whole-word, case-insensitive pattern with its case-preserving replacement.
_SPELLING_RULES = tuple(
    (re.compile(rf"\b{re.escape(wrong)}\b", flags=re.IGNORECASE), _case_preserving(right))
    for wrong, right in {
        "personaly": "personally",
        "definately": "definitely",
        "amazng": "amazing",
        "restarant": "restaurant",
    }.items()
)


def clean_spelling_simple(text):
    """Fix a small fixed list of common misspellings.

    Matching is case-insensitive and limited to whole words. The casing of
    the misspelled word is carried over to the correction: all-uppercase
    stays uppercase, a leading capital stays capitalised, anything else
    becomes lowercase (``"Definately"`` -> ``"Definitely"``).

    Args:
        text: String to correct.

    Returns:
        The string with the known misspellings replaced.
    """
    for pattern, replace in _SPELLING_RULES:
        text = pattern.sub(replace, text)
    return text

def clean_text(text):
    """Run the full cleaning pipeline on one sentence.

    Non-string input yields ``""``. Strings go through, in this order:
    Unicode normalisation, whitespace fixing, elongation reduction,
    punctuation reduction and the simple spelling corrections.
    """
    if isinstance(text, str):
        # Order matters: each step works on the output of the previous one.
        for step in (
            normalize_unicode,
            fix_whitespace,
            normalize_elongation,
            normalize_punctuation,
            clean_spelling_simple,
        ):
            text = step(text)
        return text

    return ""


# Apply cleaning to a new column
if len(df) > 0:
    df['sentence_clean'] = df['sentence'].apply(clean_text)
    display(df[['id','sentence','sentence_clean']].head())

    # Save processed df
    # The processed directory may not exist yet on a fresh checkout; create
    # it so the parquet write does not fail.
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    df.to_parquet(PROCESSED_DIR / 'semeval_processed.parquet', index=False)

