In [11]:
import pandas as pd

# Load the preprocessed Vietnamese text corpus (titles + contents) into memory.
df = pd.read_csv(filepath_or_buffer='processed_vietnamese_texts_combined.csv')

In [12]:
# Inspect the dataset structure: column names, overall shape, and a short
# preview of the leading rows.
print("Dataset columns:")
print(df.columns.tolist())
print("\nDataset shape:", df.shape)

# The preview holds the first five rows (not a single row), so the label is
# derived from the slice itself to keep the message and the data in agreement.
# The variable keeps its historical name in case other cells refer to it.
sample_row = df.iloc[0:5]
print(f"\nFirst {len(sample_row)} rows sample:")
print(sample_row)

# Optional: persist a single reference row to disk (left disabled).
# df[0:1].to_csv('sample_ref.csv',index=False)

Dataset columns:
['url', 'title', 'content', 'title_processed', 'title_normalized', 'title_token_count', 'content_processed', 'content_normalized', 'content_token_count']

Dataset shape: (1163, 9)

First row sample:
                                                 url  \
0  https://dsvh.gov.vn/le-cung-ban-vuong-cua-nguo...   
1  https://dsvh.gov.vn/le-cau-mua-cua-nguoi-co-la...   
2  https://dsvh.gov.vn/nghe-thuat-che-bien-mon-an...   
3  https://dsvh.gov.vn/nghe-thuat-mua-khen-cua-ng...   
4     https://dsvh.gov.vn/le-hoi-dinh-thay-thim-3454   

                                        title  \
0          Lễ cúng Bàn vương của người Dao đỏ   
1              Lễ cầu mùa của người Cờ Lao đỏ   
2  Nghệ thuật chế biến món ăn chay ở Tây Ninh   
3          Nghệ thuật múa Khèn của người Mông   
4                       Lễ hội dinh Thầy Thím   

                                             content        title_processed  \
0  Người Dao đỏ (xã Hồ Thầu, huyện Hoàng Su Phì, ...             Lễ cúng 

In [None]:
# Fixed Vietnamese NER with underthesea - handling 4-tuple format
import pandas as pd
from collections import defaultdict

def _flush_entity(entities, tokens, entity_type):
    """Append the entity assembled from ``tokens`` to ``entities`` when usable.

    Nothing is appended when there are no tokens, when the type is empty, or
    when the joined text is a single character (treated as noise).
    """
    if not tokens or not entity_type:
        return
    entity_text = ' '.join(tokens).strip()
    if len(entity_text) > 1:  # Skip single characters
        entities.append({
            'text': entity_text,
            'type': entity_type
        })


def extract_entities_fixed(text, max_chars=800):
    """Extract named entities from ``text`` by decoding BIO tags from ``ner``.

    ``ner`` is looked up in the notebook's global namespace at call time, so
    it must have been imported (``from underthesea import ner``) before this
    function is called.

    Args:
        text: Text to analyse. ``None``, non-string values, and empty or
            whitespace-only strings yield an empty list.
        max_chars: Maximum number of leading characters passed to the tagger
            (default 800). Pass ``None`` to analyse the full text.

    Returns:
        A list of ``{'text': str, 'type': str}`` dicts in order of
        appearance, where ``type`` is the tag without its ``B-``/``I-``
        prefix. Entities whose joined text is a single character are dropped.
        If tagging or decoding raises, the error is printed and an empty
        list is returned (best-effort behaviour).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    try:
        ner_results = ner(text[:max_chars])

        entities = []
        current_entity = []
        current_type = None

        for result in ner_results:
            # Accept (word, pos_tag, chunk_tag, ner_tag), (word, pos_tag,
            # ner_tag) and (word, ner_tag): the word is always first and the
            # NER tag always last. Anything else is ignored.
            if not isinstance(result, tuple) or not 2 <= len(result) <= 4:
                continue
            word, ner_tag = result[0], result[-1]

            is_begin = ner_tag.startswith('B-')
            is_inside = ner_tag.startswith('I-')

            if is_inside and current_entity and current_type == ner_tag[2:]:
                # Continuation of the entity that is currently open.
                current_entity.append(word)
            elif is_begin or is_inside:
                # 'B-X' always opens a new entity. An 'I-X' that does not
                # continue the open entity (it follows 'O' or an entity of a
                # different type) is treated as an implicit beginning rather
                # than being discarded, so its token is not lost.
                _flush_entity(entities, current_entity, current_type)
                current_entity = [word]
                current_type = ner_tag[2:]
            else:
                # 'O' (or any unrecognised tag) closes the open entity.
                _flush_entity(entities, current_entity, current_type)
                current_entity = []
                current_type = None

        # Emit the entity still open when the input ends.
        _flush_entity(entities, current_entity, current_type)

        return entities

    except Exception as e:
        # Deliberate best-effort: report the problem and keep the pipeline going.
        print(f"Error in NER: {e}")
        return []

def process_vietnamese_heritage_texts(df, num_docs=10):
    """Run entity extraction over the first ``num_docs`` rows and print a report.

    For each row the ``title`` and ``content`` columns (missing values are
    treated as empty strings) are joined and passed to
    ``extract_entities_fixed``. A per-document summary is printed, followed by
    a corpus-level summary listing the most frequent entity texts per type.

    Args:
        df: DataFrame with ``title`` and ``content`` columns.
        num_docs: Number of leading rows to process (default 10).

    Returns:
        A tuple ``(all_entities, doc_entities)``:
        * ``all_entities`` - ``defaultdict(list)`` mapping entity type to a
          list of ``{'text', 'doc_title', 'doc_index'}`` mentions.
        * ``doc_entities`` - one dict per processed row with the keys
          ``doc_index`` (the row's index label), ``title``,
          ``total_entities``, ``locations``, ``people``, ``organizations``
          and ``all_entities``.
    """
    # Local import keeps this definition self-contained within its cell.
    from collections import Counter

    subset = df.head(num_docs)
    # Report the number of rows actually processed, which may be smaller than
    # the requested num_docs when the frame is short.
    print(f"🏛️ Processing {len(subset)} Vietnamese heritage documents...")

    all_entities = defaultdict(list)
    doc_entities = []

    # Number documents by position: the index label is not guaranteed to be a
    # consecutive integer (or an integer at all).
    for doc_number, (idx, row) in enumerate(subset.iterrows(), start=1):
        title = str(row['title']) if pd.notna(row['title']) else ""
        content = str(row['content']) if pd.notna(row['content']) else ""
        full_text = f"{title} {content}"

        print(f"\n📜 Document {doc_number}: {title[:60]}...")

        # Extract all entities
        entities = extract_entities_fixed(full_text)

        # Categorize entities (both short and long tag spellings are accepted)
        locations = [e for e in entities if e['type'] in ['LOC', 'LOCATION']]
        people = [e for e in entities if e['type'] in ['PER', 'PERSON']]
        organizations = [e for e in entities if e['type'] in ['ORG', 'ORGANIZATION']]

        doc_entities.append({
            'doc_index': idx,
            'title': title,
            'total_entities': len(entities),
            'locations': locations,
            'people': people,
            'organizations': organizations,
            'all_entities': entities
        })

        # Add to global collection
        for entity in entities:
            all_entities[entity['type']].append({
                'text': entity['text'],
                'doc_title': title,
                'doc_index': idx
            })

        # Print quick summary for this doc, one line per entity type in order
        # of first appearance so the output is stable between runs.
        print(f"   ✅ Found {len(entities)} entities:")
        if entities:
            texts_by_type = defaultdict(list)
            for entity in entities:
                texts_by_type[entity['type']].append(entity['text'])
            for et, texts in texts_by_type.items():
                print(f"      {et}: {len(texts)} entities - {', '.join(texts[:3])}")
        else:
            print("      No entities found")

    # Final summary
    print(f"\n🎯 HERITAGE ENTITY SUMMARY:")
    print("=" * 50)
    for entity_type, mentions in all_entities.items():
        # One counting pass per type; most_common() yields the genuinely most
        # frequent texts instead of an arbitrary slice of a set.
        text_counts = Counter(mention['text'] for mention in mentions)
        print(f"{entity_type}: {len(mentions)} total, {len(text_counts)} unique")

        # Show top examples
        for entity_text, count in text_counts.most_common(5):
            print(f"  • {entity_text} (appears {count} times)")
        if len(text_counts) > 5:
            print(f"  ... and {len(text_counts)-5} more unique entities")
        print()

    return all_entities, doc_entities

# Import NER function
from underthesea import ner

# Smoke-test the extraction pipeline on the first three documents only,
# keeping both the per-type mentions and the per-document details.
print("🚀 Testing FIXED underthesea NER on Vietnamese heritage texts...")
entities, doc_details = process_vietnamese_heritage_texts(df=df, num_docs=3)
