In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

# Define the file paths
xml_data = {
    'portal': {
        'Cleaned': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0909.xml',
        'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_deduplicated 2024-12-02_Time0909.xml'
    },
    'search': {
        'cleaned': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0911.xml',
        'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml'
    }
}

def extract_record_data(record):
    """Extract relevant fields from a record element"""
    data = {}
    for child in record:
        if len(child) == 0:
            data[child.tag] = child.text
        else:
            data[child.tag] = [subchild.text for subchild in child]
    return data

# Dictionary to store final combined DataFrames
combined_data = {}

# Process each dataset (portal and search)
for dataset in ['portal', 'search']:
    # Lists to store records from both deduped and duplicate files
    all_records = []
    
    # Process each file type (deduped and duplicates)
    for file_type, file_path in xml_data[dataset].items():
        try:
            # Parse XML file
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            # Extract records
            records = root.findall('.//record')
            
            # Convert records to dictionaries and add status
            for record in records:
                record_data = extract_record_data(record)
                record_data['status'] = 'deduped' if file_type == 'deduped' else 'cleaned'
                all_records.append(record_data)
                
            print(f"Loaded {dataset} {file_type} file: {len(records)} records")
            
        except Exception as e:
            print(f"Error loading {dataset} {file_type} file: {e}")
    
    # Create combined DataFrame for this dataset
    combined_df = pd.DataFrame(all_records)
    combined_data[dataset] = combined_df
    
    # Print summary
    print(f"\n{dataset.capitalize()} Dataset Summary:")
    print(f"Total records: {len(combined_df)}")
    print("Records by status:")
    print(combined_df['status'].value_counts())
    print("Columns:", list(combined_df.columns), "\n")

# Now you can access:
# combined_data['portal'] - for portal dataset
# combined_data['search'] - for search dataset

Loaded portal Cleaned file: 575 records
Loaded portal duplicates file: 1709 records

Portal Dataset Summary:
Total records: 2284
Records by status:
status
cleaned    2284
Name: count, dtype: int64
Columns: ['database', 'source-app', 'rec-number', 'foreign-keys', 'ref-type', 'contributors', 'titles', 'abstract', 'electronic-resource-num', 'isbn', 'number', 'pages', 'volume', 'dates', 'urls', 'status', 'keywords', 'research-notes', 'custom3', 'custom1', 'custom5'] 

Loaded search cleaned file: 1 records
Loaded search duplicates file: 765 records

Search Dataset Summary:
Total records: 766
Records by status:
status
cleaned    766
Name: count, dtype: int64
Columns: ['database', 'source-app', 'rec-number', 'foreign-keys', 'ref-type', 'contributors', 'titles', 'abstract', 'electronic-resource-num', 'isbn', 'number', 'pages', 'research-notes', 'volume', 'dates', 'keywords', 'status', 'custom3', 'urls'] 

