In [3]:
from pyzotero import zotero
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import networkx as nx
from collections import defaultdict
import logging
import os

class ZoteroGroupAnalyzer:
    def __init__(self, api_key):
        """
        Initialize the analyzer with specific Nechako group libraries using API.
        """
        # Initialize with group IDs and API key
        self.libraries = {
            'portal': zotero.Zotero('364018', 'group', api_key),
            'search': zotero.Zotero('5494504', 'group', api_key)
        }
        
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        # Verify connections
        for name, zot in self.libraries.items():
            try:
                # Get first page of items to verify connection
                items = zot.items(limit=1)
                total = zot.count_items()
                self.logger.info(f"Successfully connected to {name} library. Total items: {total}")
            except Exception as e:
                self.logger.error(f"Failed to connect to {name} library: {str(e)}")
                raise

    def extract_library_structure(self, library_name):
        """
        Extract complete structure of a group library including collections and items.
        """
        zot = self.libraries[library_name]
        structure = {
            'collections': [],
            'items': [],
            'collection_items': []
        }
        
        try:
            # Get all collections
            collections = zot.collections()
            self.logger.info(f"Found {len(collections)} collections in {library_name}")
            
            for coll in collections:
                structure['collections'].append({
                    'key': coll['key'],
                    'name': coll['data']['name'],
                    'parent': coll['data'].get('parentCollection', None)
                })
                
                # Get items in this collection
                try:
                    items = zot.collection_items(coll['key'])
                    for item in items:
                        structure['collection_items'].append({
                            'collection_key': coll['key'],
                            'item_key': item['key'],
                            'collection_name': coll['data']['name'],
                            'title': item['data'].get('title', ''),
                            'item_type': item['data'].get('itemType', '')
                        })
                except Exception as e:
                    self.logger.warning(f"Error getting items for collection {coll['data']['name']}: {str(e)}")
            
            # Get all items
            self.logger.info(f"Retrieving all items from {library_name}...")
            all_items = zot.everything(zot.items())
            structure['items'] = all_items
            self.logger.info(f"Retrieved {len(all_items)} items from {library_name}")
            
            return structure
            
        except Exception as e:
            self.logger.error(f"Error extracting structure from {library_name}: {str(e)}")
            raise

    def analyze_duplicates(self):
        """
        Compare the two group libraries to find duplicate entries.
        """
        try:
            # Get items from both libraries
            self.logger.info("Retrieving items for duplicate analysis...")
            portal_items = {
                item['data']['title'].lower(): item 
                for item in self.libraries['portal'].everything(self.libraries['portal'].items())
                if 'title' in item['data']
            }
            
            search_items = {
                item['data']['title'].lower(): item 
                for item in self.libraries['search'].everything(self.libraries['search'].items())
                if 'title' in item['data']
            }
            
            # Find duplicates
            duplicate_titles = set(portal_items.keys()) & set(search_items.keys())
            
            # Create detailed duplicate information
            duplicates = []
            for title in duplicate_titles:
                duplicates.append({
                    'title': title,
                    'portal_type': portal_items[title]['data'].get('itemType', ''),
                    'search_type': search_items[title]['data'].get('itemType', ''),
                    'portal_creators': '; '.join(
                        f"{c.get('lastName', '')}, {c.get('firstName', '')}"
                        for c in portal_items[title]['data'].get('creators', [])
                    ),
                    'search_creators': '; '.join(
                        f"{c.get('lastName', '')}, {c.get('firstName', '')}"
                        for c in search_items[title]['data'].get('creators', [])
                    )
                })
            
            # Create visualization
            plt.figure(figsize=(10, 10))
            venn2([set(portal_items.keys()), set(search_items.keys())],
                  set_labels=('Portal Library', 'Search Library'))
            plt.title('Library Content Overlap')
            plt.savefig('library_overlap.png')
            plt.close()
            
            return {
                'duplicates': duplicates,
                'stats': {
                    'portal_total': len(portal_items),
                    'search_total': len(search_items),
                    'duplicate_count': len(duplicate_titles)
                }
            }
            
        except Exception as e:
            self.logger.error(f"Error analyzing duplicates: {str(e)}")
            raise

    def export_analysis(self, output_dir='nechako_analysis'):
        """
        Export complete analysis to Excel files and visualizations.
        """
        try:
            # Create output directory if it doesn't exist
            os.makedirs(output_dir, exist_ok=True)
            
            # Analyze each library
            for lib_name in self.libraries:
                structure = self.extract_library_structure(lib_name)
                
                # Create Excel writer for this library
                excel_path = os.path.join(output_dir, f'{lib_name}_library_analysis.xlsx')
                with pd.ExcelWriter(excel_path) as writer:
                    # Collections sheet
                    collections_df = pd.DataFrame(structure['collections'])
                    collections_df.to_excel(writer, sheet_name='Collections', index=False)
                    
                    # Items sheet - extract key metadata
                    items_data = []
                    for item in structure['items']:
                        item_data = item['data']
                        items_data.append({
                            'key': item['key'],
                            'title': item_data.get('title', ''),
                            'type': item_data.get('itemType', ''),
                            'date': item_data.get('date', ''),
                            'creators': '; '.join(
                                f"{c.get('lastName', '')}, {c.get('firstName', '')}"
                                for c in item_data.get('creators', [])
                            ),
                            'tags': '; '.join(
                                t['tag'] for t in item_data.get('tags', [])
                            )
                        })
                    items_df = pd.DataFrame(items_data)
                    items_df.to_excel(writer, sheet_name='Items', index=False)
                    
                    # Collection-Items relationships
                    collection_items_df = pd.DataFrame(structure['collection_items'])
                    collection_items_df.to_excel(writer, sheet_name='Collection_Items', index=False)
            
            # Export duplicate analysis
            duplicate_analysis = self.analyze_duplicates()
            duplicate_path = os.path.join(output_dir, 'duplicate_analysis.xlsx')
            with pd.ExcelWriter(duplicate_path) as writer:
                # Duplicate items sheet
                duplicates_df = pd.DataFrame(duplicate_analysis['duplicates'])
                duplicates_df.to_excel(writer, sheet_name='Duplicates', index=False)
                
                # Statistics sheet
                stats_df = pd.DataFrame([duplicate_analysis['stats']])
                stats_df.to_excel(writer, sheet_name='Statistics', index=False)
            
            self.logger.info(f"Analysis exported to {output_dir}")
            
        except Exception as e:
            self.logger.error(f"Error exporting analysis: {str(e)}")
            raise

def main():
    # Replace this with your API key
    API_KEY = "lN8i6KgGqryvsXTHWohCiLVz"
    
    # Initialize analyzer
    analyzer = ZoteroGroupAnalyzer(API_KEY)
    
    # Export full analysis
    analyzer.export_analysis()
    
    # Print summary
    duplicate_analysis = analyzer.analyze_duplicates()
    print("\nAnalysis Summary:")
    print(f"Portal Library Items: {duplicate_analysis['stats']['portal_total']}")
    print(f"Search Library Items: {duplicate_analysis['stats']['search_total']}")
    print(f"Duplicate Items: {duplicate_analysis['stats']['duplicate_count']}")

if __name__ == "__main__":
    main()

INFO:__main__:Successfully connected to portal library. Total items: 5504
INFO:__main__:Successfully connected to search library. Total items: 1456
INFO:__main__:Found 31 collections in portal
INFO:__main__:Retrieving all items from portal...
INFO:__main__:Retrieved 5504 items from portal
INFO:__main__:Found 4 collections in search
INFO:__main__:Retrieving all items from search...
INFO:__main__:Retrieved 1456 items from search
INFO:__main__:Retrieving items for duplicate analysis...
INFO:__main__:Analysis exported to nechako_analysis
INFO:__main__:Retrieving items for duplicate analysis...



Analysis Summary:
Portal Library Items: 2166
Search Library Items: 768
Duplicate Items: 683
