In [1]:
# Cell 1: Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from pyzotero import zotero
import logging
from time import sleep
import os
import ast
import xml.etree.ElementTree as ET
from datetime import datetime
from tqdm import tqdm

In [2]:
# Cell 2: Setup Logging
def setup_logging():
    """Set up logging configuration"""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    return logging.getLogger(__name__)

logger = setup_logging()


In [3]:
# Cell 3: Data Fetching
def fetch_initial_data():
    """Fetch data from Zotero API and save to CSV files"""
    # Create output directory
    output_dir = 'zotero_data'
    os.makedirs(output_dir, exist_ok=True)
    
    # Initialize Zotero connections
    libraries = {
        'portal': zotero.Zotero('364018', 'group', local=True),
        'search': zotero.Zotero('5494504', 'group', local=True)
    }
    
    for lib_name, zot in libraries.items():
        logger.info(f"Processing {lib_name} library...")
        
        try:
            total = zot.count_items()
            all_items = []
            start = 0
            limit = 50
            
            with tqdm(total=total, desc=f"Fetching {lib_name} items") as pbar:
                while start < total:
                    items = zot.items(start=start, limit=limit)
                    if not items:
                        break
                    
                    for item in items:
                        data = item['data']
                        processed_item = {
                            'key': item['key'],
                            'title': data.get('title', ''),
                            'itemType': data.get('itemType', ''),
                            'creators': str(data.get('creators', [])),
                            'date': data.get('date', ''),
                            'DOI': data.get('DOI', ''),
                            'url': data.get('url', ''),
                            'collections': str(data.get('collections', [])),
                            'tags': str(data.get('tags', [])),
                            'abstractNote': data.get('abstractNote', '')
                        }
                        all_items.append(processed_item)
                    
                    pbar.update(len(items))
                    start += limit
                    sleep(0.5)
            
            df = pd.DataFrame(all_items)
            df.to_pickle(os.path.join(output_dir, f'{lib_name}_library.pkl'))
            logger.info(f"Saved {lib_name} library")
            
        except Exception as e:
            logger.error(f"Error processing {lib_name} library: {str(e)}")
            raise


In [4]:
# Cell 4: Analysis Class
class ZoteroAnalyzer:
    def __init__(self, data_dir='zotero_data'):
        self.data_dir = data_dir
        self.libraries = {}
        
        for lib_name in ['portal', 'search']:
            pickle_path = os.path.join(data_dir, f'{lib_name}_library.pkl')
            if os.path.exists(pickle_path):
                self.libraries[lib_name] = pd.read_pickle(pickle_path)
            else:
                raise FileNotFoundError(f"No saved data found for {lib_name} library")
    
    def analyze_duplicates(self):
        portal_titles = set(self.libraries['portal']['title'].str.lower())
        search_titles = set(self.libraries['search']['title'].str.lower())
        duplicate_titles = portal_titles & search_titles
        
        duplicates_data = []
        for title in duplicate_titles:
            portal_item = self.libraries['portal'][self.libraries['portal']['title'].str.lower() == title].iloc[0]
            search_item = self.libraries['search'][self.libraries['search']['title'].str.lower() == title].iloc[0]
            
            duplicates_data.append({
                'title': title,
                'portal_key': portal_item['key'],
                'search_key': search_item['key'],
                'item_type': portal_item['itemType']
            })
        
        duplicates_df = pd.DataFrame(duplicates_data)
        
        # Create visualization
        plt.figure(figsize=(10, 10))
        venn2([portal_titles, search_titles],
              set_labels=('Portal Library', 'Search Library'))
        plt.title('Library Content Overlap')
        plt.savefig('library_overlap.png')
        plt.close()
        
        return {
            'duplicates': duplicate_titles,
            'duplicates_df': duplicates_df,
            'stats': {
                'portal_total': len(portal_titles),
                'search_total': len(search_titles),
                'duplicate_count': len(duplicate_titles)
            }
        }
    
    def analyze_collections(self):
        collection_stats = {}
        
        for lib_name, df in self.libraries.items():
            df['collections'] = df['collections'].apply(ast.literal_eval)
            collection_counts = df.explode('collections')['collections'].value_counts()
            collection_stats[lib_name] = collection_counts
        
        return collection_stats

In [6]:
# Cell 5: Run Analysis
if __name__ == "__main__":
    # First fetch the data
    fetch_initial_data()
    
    # Then analyze
    analyzer = ZoteroAnalyzer()
    
    # Analyze duplicates
    duplicate_analysis = analyzer.analyze_duplicates()
    print("\nDuplicate Analysis:")
    print(f"Portal Library Items: {duplicate_analysis['stats']['portal_total']}")
    print(f"Search Library Items: {duplicate_analysis['stats']['search_total']}")
    print(f"Duplicate Items: {duplicate_analysis['stats']['duplicate_count']}")
    
    # Analyze collections
    collection_stats = analyzer.analyze_collections()
    print("\nCollection Statistics:")
    for lib_name, stats in collection_stats.items():
        print(f"\n{lib_name.title()} Library Collections:")
        print(stats.head())

2024-12-16 10:08:30 - Processing portal library...
Fetching portal items: 9600it [00:53, 177.88it/s]                          
2024-12-16 10:09:24 - Saved portal library
2024-12-16 10:09:24 - Processing search library...
Fetching search items: 2700it [00:14, 181.80it/s]                          
2024-12-16 10:09:39 - Saved search library



Duplicate Analysis:
Portal Library Items: 100
Search Library Items: 71
Duplicate Items: 1

Collection Statistics:

Portal Library Collections:
collections
WTI7SWR5    8352
Name: count, dtype: int64

Search Library Collections:
collections
MRW66E3Y    1134
4EFHUEJI     702
5JRIR9ZQ      54
EG5BYG2X      27
Name: count, dtype: int64
