In [23]:
from pyzotero import zotero
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import os
import logging
from time import sleep
from datetime import datetime
from tqdm import tqdm  # For progress bars

class ZoteroGroupAnalyzer:
    def __init__(self):
        """Initialize the analyzer with specific Nechako group libraries using the local Zotero API."""
        self.libraries = {
            'portal': zotero.Zotero('364018', 'group', local=True),
            'search': zotero.Zotero('5494504', 'group', local=True)
        }
        
        # Set up logging with timestamps
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)
        
        # Verify connections and get total counts
        self.total_counts = {}
        for name, zot in self.libraries.items():
            try:
                items = zot.items(limit=1)
                total = zot.count_items()
                self.total_counts[name] = total
                self.logger.info(f"Connected to {name} library. Total items: {total}")
            except Exception as e:
                self.logger.error(f"Failed to connect to {name} library: {str(e)}")
                raise

    def get_all_items(self, zot, library_name):
        """Safely retrieve all items with detailed progress tracking."""
        all_items = []
        start = 0
        limit = 50
        total = self.total_counts[library_name]
        
        # Create progress bar
        with tqdm(total=total, desc=f"Retrieving {library_name} items") as pbar:
            while True:
                try:
                    self.logger.info(f"Fetching items {start}-{start+limit} of {total} from {library_name}")
                    items = zot.items(start=start, limit=limit)
                    if not items:
                        break
                    
                    all_items.extend(items)
                    items_added = len(items)
                    pbar.update(items_added)
                    
                    self.logger.info(f"Successfully retrieved {items_added} items from {library_name}")
                    start += limit
                    
                    if start >= total:
                        break
                    
                    sleep(0.5)  # Prevent overloading
                    
                except Exception as e:
                    self.logger.warning(f"Error retrieving items at offset {start}: {str(e)}")
                    if start > 0:  # If we've already gotten some items, return those
                        break
                    else:
                        raise
        
        self.logger.info(f"Completed retrieval for {library_name}. Got {len(all_items)} items")
        return all_items

    def extract_library_structure(self, library_name):
        """Extract complete structure with progress tracking."""
        zot = self.libraries[library_name]
        structure = {
            'collections': [],
            'items': [],
            'collection_items': []
        }
        
        try:
            # Get collections with progress tracking
            self.logger.info(f"Getting collections for {library_name}...")
            collections = zot.collections()
            self.logger.info(f"Found {len(collections)} collections in {library_name}")
            
            # Process collections with progress bar
            with tqdm(collections, desc=f"Processing {library_name} collections") as pbar:
                for coll in pbar:
                    structure['collections'].append({
                        'key': coll['key'],
                        'name': coll['data']['name'],
                        'parent': coll['data'].get('parentCollection', None)
                    })
                    
                    try:
                        items = zot.collection_items(coll['key'])
                        self.logger.info(f"Retrieved {len(items)} items from collection: {coll['data']['name']}")
                        for item in items:
                            structure['collection_items'].append({
                                'collection_key': coll['key'],
                                'item_key': item['key'],
                                'collection_name': coll['data']['name'],
                                'title': item['data'].get('title', ''),
                                'item_type': item['data'].get('itemType', '')
                            })
                    except Exception as e:
                        self.logger.warning(f"Error getting items for collection {coll['data']['name']}: {str(e)}")
            
            # Get all items using progress tracking
            self.logger.info(f"Starting retrieval of all items from {library_name}...")
            all_items = self.get_all_items(zot, library_name)
            structure['items'] = all_items
            
            return structure
            
        except Exception as e:
            self.logger.error(f"Error extracting structure from {library_name}: {str(e)}")
            raise

    def analyze_duplicates(self):
        """Compare libraries with progress tracking."""
        try:
            self.logger.info("Starting duplicate analysis...")
            
            # Get items from both libraries
            portal_items = {}
            search_items = {}
            
            for lib_name, items_dict in [('portal', portal_items), ('search', search_items)]:
                self.logger.info(f"Processing {lib_name} library for duplicates...")
                all_items = self.get_all_items(self.libraries[lib_name], lib_name)
                
                # Process items with progress bar
                with tqdm(all_items, desc=f"Processing {lib_name} titles") as pbar:
                    for item in pbar:
                        if 'title' in item['data']:
                            items_dict[item['data']['title'].lower()] = item
            
            # Find duplicates
            duplicate_titles = set(portal_items.keys()) & set(search_items.keys())
            self.logger.info(f"Found {len(duplicate_titles)} duplicate titles")
            
            # Create visualization
            self.logger.info("Creating Venn diagram...")
            plt.figure(figsize=(10, 10))
            venn2([set(portal_items.keys()), set(search_items.keys())],
                  set_labels=('Portal Library', 'Search Library'))
            plt.title('Library Content Overlap')
            plt.savefig('library_overlap.png')
            plt.close()
            
            # Process duplicates with progress bar
            duplicates_data = []
            with tqdm(duplicate_titles, desc="Processing duplicates") as pbar:
                for title in pbar:
                    duplicates_data.append({
                        'title': title,
                        'portal_id': portal_items[title]['key'] if title in portal_items else '',
                        'search_id': search_items[title]['key'] if title in search_items else '',
                        'item_type': portal_items[title]['data'].get('itemType', '') if title in portal_items else search_items[title]['data'].get('itemType', '')
                    })
            
            duplicates_df = pd.DataFrame(duplicates_data)
            
            return {
                'duplicates': duplicate_titles,
                'duplicates_df': duplicates_df,
                'stats': {
                    'portal_total': len(portal_items),
                    'search_total': len(search_items),
                    'duplicate_count': len(duplicate_titles)
                }
            }
            
        except Exception as e:
            self.logger.error(f"Error analyzing duplicates: {str(e)}")
            raise

    def export_analysis(self, output_dir='nechako_analysis'):
        """Export analysis with progress tracking."""
        try:
            os.makedirs(output_dir, exist_ok=True)
            self.logger.info(f"Created output directory: {output_dir}")
            
            # Process each library
            for lib_name in self.libraries:
                self.logger.info(f"\nStarting analysis of {lib_name} library...")
                structure = self.extract_library_structure(lib_name)
                
                # Export to Excel with progress updates
                excel_path = os.path.join(output_dir, f'{lib_name}_library_analysis.xlsx')
                self.logger.info(f"Exporting {lib_name} analysis to {excel_path}")
                
                with pd.ExcelWriter(excel_path) as writer:
                    # Collections
                    self.logger.info(f"Writing {len(structure['collections'])} collections...")
                    pd.DataFrame(structure['collections']).to_excel(writer, sheet_name='Collections', index=False)
                    
                    # Items
                    self.logger.info(f"Writing {len(structure['items'])} items...")
                    try:
                        items_df = pd.json_normalize([item['data'] for item in structure['items']])
                    except Exception as e:
                        self.logger.warning(f"Error normalizing items data: {str(e)}")
                        items_df = pd.DataFrame(structure['items'])
                    items_df.to_excel(writer, sheet_name='Items', index=False)
                    
                    # Collection items
                    self.logger.info(f"Writing {len(structure['collection_items'])} collection items...")
                    pd.DataFrame(structure['collection_items']).to_excel(writer, sheet_name='Collection_Items', index=False)
                
                self.logger.info(f"Completed export for {lib_name}")
            
            # Export duplicate analysis
            self.logger.info("\nPerforming duplicate analysis...")
            duplicate_analysis = self.analyze_duplicates()
            duplicate_path = os.path.join(output_dir, 'duplicate_analysis.xlsx')
            
            self.logger.info("Exporting duplicate analysis...")
            with pd.ExcelWriter(duplicate_path) as writer:
                pd.DataFrame([duplicate_analysis['stats']]).to_excel(writer, sheet_name='Summary', index=False)
                if 'duplicates_df' in duplicate_analysis:
                    duplicate_analysis['duplicates_df'].to_excel(writer, sheet_name='Detailed_Duplicates', index=False)
            
            self.logger.info(f"Analysis export completed. Files saved in: {output_dir}")
            
        except Exception as e:
            self.logger.error(f"Error exporting analysis: {str(e)}")
            raise

def main():
    start_time = datetime.now()
    print(f"Analysis started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    try:
        # Initialize analyzer
        analyzer = ZoteroGroupAnalyzer()
        
        # Export full analysis
        analyzer.export_analysis()
        
        # Print summary
        duplicate_analysis = analyzer.analyze_duplicates()
        print("\nAnalysis Summary:")
        print(f"Portal Library Items: {duplicate_analysis['stats']['portal_total']}")
        print(f"Search Library Items: {duplicate_analysis['stats']['search_total']}")
        print(f"Duplicate Items: {duplicate_analysis['stats']['duplicate_count']}")
        
        end_time = datetime.now()
        duration = end_time - start_time
        print(f"\nAnalysis completed at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Total duration: {duration}")
        
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise

if __name__ == "__main__":
    main()

INFO:__main__:Connected to portal library. Total items: 4789
INFO:__main__:Connected to search library. Total items: 1316
INFO:__main__:Created output directory: nechako_analysis
INFO:__main__:
Starting analysis of portal library...
INFO:__main__:Getting collections for portal...
INFO:__main__:Found 31 collections in portal


Analysis started at: 2024-12-03 12:21:13


Processing portal collections:   0%|          | 0/31 [00:00<?, ?it/s]INFO:__main__:Retrieved 0 items from collection: 1. Data Sources
INFO:__main__:Retrieved 0 items from collection: Barry's Collection
INFO:__main__:Retrieved 0 items from collection: Cheslatta
INFO:__main__:Retrieved 0 items from collection: EDI
INFO:__main__:Retrieved 100 items from collection: Large Lakes Monitoring
INFO:__main__:Retrieved 0 items from collection: NWR
INFO:__main__:Retrieved 0 items from collection: Saturation Search
INFO:__main__:Retrieved 0 items from collection: 2. Team Work Folder
INFO:__main__:Retrieved 0 items from collection: Christiana
INFO:__main__:Retrieved 13 items from collection: Isla
INFO:__main__:Retrieved 0 items from collection: Jonathan
Processing portal collections:  35%|███▌      | 11/31 [00:00<00:00, 106.19it/s]INFO:__main__:Retrieved 79 items from collection: Sample 1 (Jan 10th, 2022)
INFO:__main__:Retrieved 0 items from collection: Lisa
INFO:__main__:Retrieved 9 items from coll


Analysis Summary:
Portal Library Items: 100
Search Library Items: 70
Duplicate Items: 1

Analysis completed at: 2024-12-03 12:24:44
Total duration: 0:03:31.633023





In [4]:
from pyzotero import zotero
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import os
import logging
from time import sleep
from datetime import datetime
from tqdm import tqdm
import xml.etree.ElementTree as ET

class ZoteroGroupAnalyzer:
    def __init__(self, xml_paths=None):
        """Initialize the analyzer with specific Nechako group libraries using the local Zotero API."""
        self.libraries = {
            'portal': zotero.Zotero('364018', 'group', local=True),
            'search': zotero.Zotero('5494504', 'group', local=True)
        }
        
        self.xml_paths = xml_paths or {}
        self.xml_data = {}
        
        # Set up logging with timestamps
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)
        
        # Verify connections and get total counts
        self.total_counts = {}
        for name, zot in self.libraries.items():
            try:
                items = zot.items(limit=1)
                total = zot.count_items()
                self.total_counts[name] = total
                self.logger.info(f"Connected to {name} library. Total items: {total}")
            except Exception as e:
                self.logger.error(f"Failed to connect to {name} library: {str(e)}")
                raise

        if self.xml_paths:
            self._load_xml_data()

    def get_all_items(self, zot, library_name):
        """Safely retrieve all items with detailed progress tracking."""
        all_items = []
        start = 0
        limit = 50
        total = self.total_counts[library_name]
        
        # Create progress bar
        with tqdm(total=total, desc=f"Retrieving {library_name} items") as pbar:
            while True:
                try:
                    self.logger.info(f"Fetching items {start}-{start+limit} of {total} from {library_name}")
                    items = zot.items(start=start, limit=limit)
                    if not items:
                        break
                    
                    all_items.extend(items)
                    items_added = len(items)
                    pbar.update(items_added)
                    
                    self.logger.info(f"Successfully retrieved {items_added} items from {library_name}")
                    start += limit
                    
                    if start >= total:
                        break
                    
                    sleep(0.5)  # Prevent overloading
                    
                except Exception as e:
                    self.logger.warning(f"Error retrieving items at offset {start}: {str(e)}")
                    if start > 0:  # If we've already gotten some items, return those
                        break
                    else:
                        raise
        
        self.logger.info(f"Completed retrieval for {library_name}. Got {len(all_items)} items")
        return all_items

    def _load_xml_data(self):
        """Load and parse XML files."""
        for library_name, paths in self.xml_paths.items():
            self.xml_data[library_name] = {
                'deduplicated': self._parse_xml(paths['deduplicated']),
                'duplicates': self._parse_xml(paths['duplicates'])
            }
            self.logger.info(f"Loaded XML data for {library_name}")

    def _parse_xml(self, file_path):
        """Parse XML file and return structured data."""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            items = []
            for entry in root.findall('.//entry'):
                item = {
                    'title': entry.find('title').text if entry.find('title') is not None else '',
                    'authors': [author.text for author in entry.findall('.//author')],
                    'doi': entry.find('doi').text if entry.find('doi') is not None else '',
                    'year': entry.find('year').text if entry.find('year') is not None else '',
                }
                items.append(item)
            
            return items
            
        except Exception as e:
            self.logger.error(f"Error parsing XML file {file_path}: {str(e)}")
            return []

    def extract_library_structure(self, library_name):
        """Extract complete structure with progress tracking."""
        zot = self.libraries[library_name]
        structure = {
            'collections': [],
            'items': [],
            'collection_items': []
        }
        
        try:
            # Get collections with progress tracking
            self.logger.info(f"Getting collections for {library_name}...")
            collections = zot.collections()
            self.logger.info(f"Found {len(collections)} collections in {library_name}")
            
            # Process collections with progress bar
            with tqdm(collections, desc=f"Processing {library_name} collections") as pbar:
                for coll in pbar:
                    structure['collections'].append({
                        'key': coll['key'],
                        'name': coll['data']['name'],
                        'parent': coll['data'].get('parentCollection', None)
                    })
                    
                    try:
                        items = zot.collection_items(coll['key'])
                        self.logger.info(f"Retrieved {len(items)} items from collection: {coll['data']['name']}")
                        for item in items:
                            structure['collection_items'].append({
                                'collection_key': coll['key'],
                                'item_key': item['key'],
                                'collection_name': coll['data']['name'],
                                'title': item['data'].get('title', ''),
                                'item_type': item['data'].get('itemType', '')
                            })
                    except Exception as e:
                        self.logger.warning(f"Error getting items for collection {coll['data']['name']}: {str(e)}")
            
            # Get all items using progress tracking
            self.logger.info(f"Starting retrieval of all items from {library_name}...")
            all_items = self.get_all_items(zot, library_name)
            structure['items'] = all_items
            
            return structure
            
        except Exception as e:
            self.logger.error(f"Error extracting structure from {library_name}: {str(e)}")
            raise

    def analyze_duplicates(self):
        """Compare libraries with progress tracking."""
        try:
            self.logger.info("Starting duplicate analysis...")
            
            # Get items from both libraries
            portal_items = {}
            search_items = {}
            
            for lib_name, items_dict in [('portal', portal_items), ('search', search_items)]:
                self.logger.info(f"Processing {lib_name} library for duplicates...")
                all_items = self.get_all_items(self.libraries[lib_name], lib_name)
                
                # Process items with progress bar
                with tqdm(all_items, desc=f"Processing {lib_name} titles") as pbar:
                    for item in pbar:
                        if 'title' in item['data']:
                            items_dict[item['data']['title'].lower()] = item
            
            # Find duplicates
            duplicate_titles = set(portal_items.keys()) & set(search_items.keys())
            self.logger.info(f"Found {len(duplicate_titles)} duplicate titles")
            
            # Create visualization
            self.logger.info("Creating Venn diagram...")
            plt.figure(figsize=(10, 10))
            venn2([set(portal_items.keys()), set(search_items.keys())],
                  set_labels=('Portal Library', 'Search Library'))
            plt.title('Library Content Overlap')
            plt.savefig('library_overlap.png')
            plt.close()
            
            # Process duplicates with progress bar
            duplicates_data = []
            with tqdm(duplicate_titles, desc="Processing duplicates") as pbar:
                for title in pbar:
                    duplicates_data.append({
                        'title': title,
                        'portal_id': portal_items[title]['key'] if title in portal_items else '',
                        'search_id': search_items[title]['key'] if title in search_items else '',
                        'item_type': portal_items[title]['data'].get('itemType', '') if title in portal_items else search_items[title]['data'].get('itemType', '')
                    })
            
            duplicates_df = pd.DataFrame(duplicates_data)
            
            return {
                'duplicates': duplicate_titles,
                'duplicates_df': duplicates_df,
                'stats': {
                    'portal_total': len(portal_items),
                    'search_total': len(search_items),
                    'duplicate_count': len(duplicate_titles)
                }
            }
            
        except Exception as e:
            self.logger.error(f"Error analyzing duplicates: {str(e)}")
            raise

    def export_analysis(self, output_dir='nechako_analysis'):
        """Export analysis with progress tracking."""
        try:
            os.makedirs(output_dir, exist_ok=True)
            self.logger.info(f"Created output directory: {output_dir}")
            
            # Process each library
            for lib_name in self.libraries:
                self.logger.info(f"\nStarting analysis of {lib_name} library...")
                structure = self.extract_library_structure(lib_name)
                
                # Export to Excel with progress updates
                excel_path = os.path.join(output_dir, f'{lib_name}_library_analysis.xlsx')
                self.logger.info(f"Exporting {lib_name} analysis to {excel_path}")
                
                with pd.ExcelWriter(excel_path) as writer:
                    # Collections
                    self.logger.info(f"Writing {len(structure['collections'])} collections...")
                    pd.DataFrame(structure['collections']).to_excel(writer, sheet_name='Collections', index=False)
                    
                    # Items
                    self.logger.info(f"Writing {len(structure['items'])} items...")
                    try:
                        items_df = pd.json_normalize([item['data'] for item in structure['items']])
                    except Exception as e:
                        self.logger.warning(f"Error normalizing items data: {str(e)}")
                        items_df = pd.DataFrame(structure['items'])
                    items_df.to_excel(writer, sheet_name='Items', index=False)
                    
                    # Collection items
                    self.logger.info(f"Writing {len(structure['collection_items'])} collection items...")
                    pd.DataFrame(structure['collection_items']).to_excel(writer, sheet_name='Collection_Items', index=False)
                
                self.logger.info(f"Completed export for {lib_name}")
            
            # Export duplicate analysis
            self.logger.info("\nPerforming duplicate analysis...")
            duplicate_analysis = self.analyze_duplicates()
            
            if self.xml_paths:
                xml_analysis = self.analyze_with_xml()
                self.export_xml_analysis(xml_analysis, output_dir)
                
            duplicate_path = os.path.join(output_dir, 'duplicate_analysis.xlsx')
            
            self.logger.info("Exporting duplicate analysis...")
            with pd.ExcelWriter(duplicate_path) as writer:
                pd.DataFrame([duplicate_analysis['stats']]).to_excel(writer, sheet_name='Summary', index=False)
                if 'duplicates_df' in duplicate_analysis:
                    duplicate_analysis['duplicates_df'].to_excel(writer, sheet_name='Detailed_Duplicates', index=False)
            
            self.logger.info(f"Analysis export completed. Files saved in: {output_dir}")
            
        except Exception as e:
            self.logger.error(f"Error exporting analysis: {str(e)}")
            raise

    def analyze_with_xml(self):
        """Analyze libraries with XML integration."""
        try:
            self.logger.info("Starting integrated analysis...")
            results = {}
            
            for lib_name in self.libraries:
                if lib_name not in self.xml_data:
                    continue
                    
                self.logger.info(f"Processing {lib_name} library...")
                zotero_items = self.get_all_items(self.libraries[lib_name], lib_name)
                xml_items = self.xml_data[lib_name]
                
                # Match items
                matches = self._match_items(zotero_items, xml_items)
                
                results[lib_name] = {
                    'total_zotero': len(zotero_items),
                    'total_deduplicated': len(xml_items.get('deduplicated', [])),
                    'total_duplicates': len(xml_items.get('duplicates', [])),
                    'matches': matches
                }
            
            return results
            
        except Exception as e:
            self.logger.error(f"Error in XML analysis: {str(e)}")
            raise

    def _match_items(self, zotero_items, xml_items):
        """Match Zotero items with XML items."""
        matches = []
        
        for zot_item in tqdm(zotero_items, desc="Matching items"):
            zot_title = zot_item['data'].get('title', '').lower()
            if not zot_title:
                continue
                
            # Check in deduplicated items
            for xml_item in xml_items.get('deduplicated', []):
                if xml_item['title'].lower() == zot_title:
                    matches.append({
                        'zotero_item': zot_item,
                        'xml_item': xml_item,
                        'status': 'deduplicated'
                    })
                    break
                    
            # Check in duplicate items
            for xml_item in xml_items.get('duplicates', []):
                if xml_item['title'].lower() == zot_title:
                    matches.append({
                        'zotero_item': zot_item,
                        'xml_item': xml_item,
                        'status': 'duplicate'
                    })
                    break
        
        return matches

    def export_xml_analysis(self, results, output_dir):
        """Export XML analysis results."""
        for lib_name, lib_results in results.items():
            excel_path = os.path.join(output_dir, f'{lib_name}_xml_analysis.xlsx')
            
            with pd.ExcelWriter(excel_path) as writer:
                # Summary
                summary_data = {
                    'Metric': ['Total Zotero Items', 'Total Deduplicated XML Items', 'Total Duplicate XML Items', 'Matched Items'],
                    'Count': [
                        lib_results['total_zotero'],
                        lib_results['total_deduplicated'],
                        lib_results['total_duplicates'],
                        len(lib_results['matches'])
                    ]
                }
                pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
                
                # Detailed matches
                matches_data = []
                for match in lib_results['matches']:
                    matches_data.append({
                        'Zotero Title': match['zotero_item']['data'].get('title', ''),
                        'XML Title': match['xml_item']['title'],
                        'Status': match['status'],
                        'DOI': match['zotero_item']['data'].get('DOI', ''),
                        'XML DOI': match['xml_item']['doi'],
                        'Year': match['zotero_item']['data'].get('date', '')[:4],
                        'XML Year': match['xml_item']['year'],
                        'Item Type': match['zotero_item']['data'].get('itemType', ''),
                        'Collection Key': match['zotero_item']['key']
                    })
                
                matches_df = pd.DataFrame(matches_data)
                matches_df.to_excel(writer, sheet_name='Matched_Items', index=False)

def main():
    start_time = datetime.now()
    print(f"Analysis started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    try:
        # Define XML file paths
        xml_paths = {
            'search': {
                'deduplicated': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml',
                'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0911.xml'
            }
            # Add portal paths when available
        }
        
        # Initialize analyzer with XML paths
        analyzer = ZoteroGroupAnalyzer(xml_paths)
        
        # Export both standard and XML analysis
        analyzer.export_analysis()
        
        # Print summary
        duplicate_analysis = analyzer.analyze_duplicates()
        print("\nAnalysis Summary:")
        print(f"Portal Library Items: {duplicate_analysis['stats']['portal_total']}")
        print(f"Search Library Items: {duplicate_analysis['stats']['search_total']}")
        print(f"Duplicate Items: {duplicate_analysis['stats']['duplicate_count']}")
        
        if analyzer.xml_paths:
            xml_analysis = analyzer.analyze_with_xml()
            print("\nXML Analysis Summary:")
            for lib_name, results in xml_analysis.items():
                print(f"\n{lib_name.title()} Library:")
                print(f"Total Zotero Items: {results['total_zotero']}")
                print(f"Total Deduplicated XML Items: {results['total_deduplicated']}")
                print(f"Total Duplicate XML Items: {results['total_duplicates']}")
                print(f"Matched Items: {len(results['matches'])}")
        
        end_time = datetime.now()
        duration = end_time - start_time
        print(f"\nAnalysis completed at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Total duration: {duration}")
        
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise

if __name__ == "__main__":
    main()

2024-12-04 11:14:21 - Connected to portal library. Total items: 4789
2024-12-04 11:14:21 - Connected to search library. Total items: 1316
2024-12-04 11:14:21 - Loaded XML data for search
2024-12-04 11:14:21 - Created output directory: nechako_analysis
2024-12-04 11:14:21 - 
Starting analysis of portal library...
2024-12-04 11:14:21 - Getting collections for portal...
2024-12-04 11:14:21 - Found 31 collections in portal


Analysis started at: 2024-12-04 11:14:21


Processing portal collections:   0%|          | 0/31 [00:00<?, ?it/s]2024-12-04 11:14:21 - Retrieved 0 items from collection: 1. Data Sources
2024-12-04 11:14:21 - Retrieved 0 items from collection: Barry's Collection
2024-12-04 11:14:21 - Retrieved 0 items from collection: Cheslatta
2024-12-04 11:14:21 - Retrieved 0 items from collection: EDI
2024-12-04 11:14:21 - Retrieved 100 items from collection: Large Lakes Monitoring
2024-12-04 11:14:21 - Retrieved 0 items from collection: NWR
2024-12-04 11:14:21 - Retrieved 0 items from collection: Saturation Search
2024-12-04 11:14:21 - Retrieved 0 items from collection: 2. Team Work Folder
2024-12-04 11:14:21 - Retrieved 0 items from collection: Christiana
2024-12-04 11:14:21 - Retrieved 13 items from collection: Isla
2024-12-04 11:14:21 - Retrieved 0 items from collection: Jonathan
Processing portal collections:  35%|███▌      | 11/31 [00:00<00:00, 108.28it/s]2024-12-04 11:14:21 - Retrieved 79 items from collection: Sample 1 (Jan 10th, 2022)


Analysis Summary:
Portal Library Items: 100
Search Library Items: 70
Duplicate Items: 1


Retrieving search items:   0%|          | 0/1316 [00:00<?, ?it/s]2024-12-04 11:18:03 - Fetching items 0-50 of 1316 from search
2024-12-04 11:18:03 - Successfully retrieved 100 items from search
2024-12-04 11:18:03 - Fetching items 50-100 of 1316 from search
Retrieving search items:  15%|█▌        | 200/1316 [00:00<00:03, 339.83it/s]2024-12-04 11:18:03 - Successfully retrieved 100 items from search
2024-12-04 11:18:04 - Fetching items 100-150 of 1316 from search
Retrieving search items:  23%|██▎       | 300/1316 [00:01<00:04, 247.47it/s]2024-12-04 11:18:04 - Successfully retrieved 100 items from search
2024-12-04 11:18:04 - Fetching items 150-200 of 1316 from search
Retrieving search items:  30%|███       | 400/1316 [00:01<00:04, 219.70it/s]2024-12-04 11:18:04 - Successfully retrieved 100 items from search
2024-12-04 11:18:05 - Fetching items 200-250 of 1316 from search
Retrieving search items:  38%|███▊      | 500/1316 [00:02<00:03, 205.55it/s]2024-12-04 11:18:05 - Successfully retriev


XML Analysis Summary:

Search Library:
Total Zotero Items: 2700
Total Deduplicated XML Items: 0
Total Duplicate XML Items: 0
Matched Items: 0

Analysis completed at: 2024-12-04 11:18:17
Total duration: 0:03:56.278361



