In [1]:
from pyzotero import zotero
import pandas as pd
from tqdm import tqdm
import os
import logging
from time import sleep

def setup_logging():
    """Set up logging configuration"""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    return logging.getLogger(__name__)

def fetch_initial_data():
    """Fetch data from Zotero API and save to CSV files"""
    logger = setup_logging()
    
    # Create output directory
    output_dir = 'zotero_data'
    os.makedirs(output_dir, exist_ok=True)
    
    # Initialize Zotero connections
    libraries = {
        'portal': zotero.Zotero('364018', 'group', local=True),
        'search': zotero.Zotero('5494504', 'group', local=True)
    }
    
    for lib_name, zot in libraries.items():
        logger.info(f"Processing {lib_name} library...")
        
        try:
            # Get total count
            total = zot.count_items()
            
            # Fetch all items
            all_items = []
            start = 0
            limit = 50
            
            with tqdm(total=total, desc=f"Fetching {lib_name} items") as pbar:
                while start < total:
                    items = zot.items(start=start, limit=limit)
                    if not items:
                        break
                    
                    # Extract relevant fields from each item
                    for item in items:
                        data = item['data']
                        processed_item = {
                            'key': item['key'],
                            'title': data.get('title', ''),
                            'itemType': data.get('itemType', ''),
                            'creators': str(data.get('creators', [])),  # Convert list to string
                            'date': data.get('date', ''),
                            'DOI': data.get('DOI', ''),
                            'url': data.get('url', ''),
                            'collections': str(data.get('collections', [])),  # Convert list to string
                            'tags': str(data.get('tags', [])),  # Convert list to string
                            'abstractNote': data.get('abstractNote', '')
                        }
                        all_items.append(processed_item)
                    
                    pbar.update(len(items))
                    start += limit
                    sleep(0.5)  # Prevent overloading
            
            # Convert to DataFrame and save
            df = pd.DataFrame(all_items)
            csv_path = os.path.join(output_dir, f'{lib_name}_library.csv')
            df.to_csv(csv_path, index=False)
            logger.info(f"Saved {lib_name} library to {csv_path}")
            
            # Also save as pickle for preserving data types
            pickle_path = os.path.join(output_dir, f'{lib_name}_library.pkl')
            df.to_pickle(pickle_path)
            logger.info(f"Saved {lib_name} library to {pickle_path}")
            
        except Exception as e:
            logger.error(f"Error processing {lib_name} library: {str(e)}")
            raise

if __name__ == "__main__":
    fetch_initial_data()

2024-12-04 12:28:27 - Processing portal library...
Fetching portal items:  54%|█████▍    | 2600/4789 [00:14<00:12, 180.10it/s]


KeyboardInterrupt: 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import ast
import os

class ZoteroAnalyzer:
    def __init__(self, data_dir='zotero_data'):
        """Initialize analyzer with saved data"""
        self.data_dir = data_dir
        self.libraries = {}
        
        # Load saved data
        for lib_name in ['portal', 'search']:
            pickle_path = os.path.join(data_dir, f'{lib_name}_library.pkl')
            if os.path.exists(pickle_path):
                self.libraries[lib_name] = pd.read_pickle(pickle_path)
            else:
                raise FileNotFoundError(f"No saved data found for {lib_name} library")
    
    def analyze_duplicates(self):
        """Find duplicates between libraries"""
        # Get sets of titles from each library
        portal_titles = set(self.libraries['portal']['title'].str.lower())
        search_titles = set(self.libraries['search']['title'].str.lower())
        
        # Find duplicates
        duplicate_titles = portal_titles & search_titles
        
        # Create detailed duplicates DataFrame
        duplicates_data = []
        for title in duplicate_titles:
            portal_item = self.libraries['portal'][self.libraries['portal']['title'].str.lower() == title].iloc[0]
            search_item = self.libraries['search'][self.libraries['search']['title'].str.lower() == title].iloc[0]
            
            duplicates_data.append({
                'title': title,
                'portal_key': portal_item['key'],
                'search_key': search_item['key'],
                'item_type': portal_item['itemType']
            })
        
        duplicates_df = pd.DataFrame(duplicates_data)
        
        # Create visualization
        plt.figure(figsize=(10, 10))
        venn2([portal_titles, search_titles],
              set_labels=('Portal Library', 'Search Library'))
        plt.title('Library Content Overlap')
        plt.savefig('library_overlap.png')
        plt.close()
        
        return {
            'duplicates': duplicate_titles,
            'duplicates_df': duplicates_df,
            'stats': {
                'portal_total': len(portal_titles),
                'search_total': len(search_titles),
                'duplicate_count': len(duplicate_titles)
            }
        }
    
    def analyze_collections(self):
        """Analyze collection distribution"""
        collection_stats = {}
        
        for lib_name, df in self.libraries.items():
            # Convert string representation of collections back to list
            df['collections'] = df['collections'].apply(ast.literal_eval)
            
            # Count items per collection
            collection_counts = df.explode('collections')['collections'].value_counts()
            collection_stats[lib_name] = collection_counts
        
        return collection_stats

# Example usage
if __name__ == "__main__":
    analyzer = ZoteroAnalyzer()
    
    # Analyze duplicates
    duplicate_analysis = analyzer.analyze_duplicates()
    print("\nDuplicate Analysis:")
    print(f"Portal Library Items: {duplicate_analysis['stats']['portal_total']}")
    print(f"Search Library Items: {duplicate_analysis['stats']['search_total']}")
    print(f"Duplicate Items: {duplicate_analysis['stats']['duplicate_count']}")
    
    # Analyze collections
    collection_stats = analyzer.analyze_collections()
    print("\nCollection Statistics:")
    for lib_name, stats in collection_stats.items():
        print(f"\n{lib_name.title()} Library Collections:")
        print(stats.head())


Duplicate Analysis:
Portal Library Items: 100
Search Library Items: 71
Duplicate Items: 1

Collection Statistics:

Portal Library Collections:
collections
WTI7SWR5    8352
Name: count, dtype: int64

Search Library Collections:
collections
MRW66E3Y    1134
4EFHUEJI     702
5JRIR9ZQ      54
EG5BYG2X      27
Name: count, dtype: int64


In [4]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

class XMLProcessor:
    def __init__(self):
        """Initialize the XML processor"""
        self.xml_data = {
            'portal': {
                'deduped': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0909.xml',
                'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_deduplicated 2024-12-02_Time0909.xml'
            },
            'search': {
                'deduped': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0911.xml',
                'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml'
            }
        }
        
    def parse_xml_file(self, file_path):
        """Parse XML file and extract relevant information"""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            items = []
            for entry in root.findall('.//entry'):
                item = {}
                
                # Extract basic metadata
                item['title'] = entry.findtext('title', '')
                item['id'] = entry.findtext('id', '')
                item['updated'] = entry.findtext('updated', '')
                
                # Extract authors
                authors = []
                for author in entry.findall('.//author'):
                    authors.append(author.findtext('name', ''))
                item['authors'] = authors
                
                # Extract DOI if available
                doi = entry.find(".//zapi:DOI", namespaces={'zapi': 'http://zotero.org/ns/api'})
                item['DOI'] = doi.text if doi is not None else ''
                
                items.append(item)
            
            return pd.DataFrame(items)
        except Exception as e:
            print(f"Error parsing {file_path}: {str(e)}")
            return pd.DataFrame()

    def process_all_files(self):
        """Process all XML files and return consolidated data"""
        results = {}
        for lib_name, paths in self.xml_data.items():
            results[lib_name] = {
                'deduped': self.parse_xml_file(paths['deduped']),
                'duplicates': self.parse_xml_file(paths['duplicates'])
            }
        return results

    def match_with_zotero(self, xml_data, zotero_df):
        """Match XML entries with Zotero library entries"""
        matched_data = []
        
        for _, xml_row in xml_data.iterrows():
            # Try matching by DOI first
            if xml_row['DOI']:
                zotero_match = zotero_df[zotero_df['DOI'] == xml_row['DOI']]
                if not zotero_match.empty:
                    matched_data.append({
                        'xml_title': xml_row['title'],
                        'zotero_title': zotero_match.iloc[0]['title'],
                        'match_method': 'DOI',
                        'xml_id': xml_row['id'],
                        'zotero_key': zotero_match.iloc[0]['key']
                    })
                    continue
            
            # Try matching by title (case-insensitive)
            zotero_match = zotero_df[zotero_df['title'].str.lower() == xml_row['title'].lower()]
            if not zotero_match.empty:
                matched_data.append({
                    'xml_title': xml_row['title'],
                    'zotero_title': zotero_match.iloc[0]['title'],
                    'match_method': 'title',
                    'xml_id': xml_row['id'],
                    'zotero_key': zotero_match.iloc[0]['key']
                })
        
        return pd.DataFrame(matched_data)

def analyze_xml_and_zotero():
    """Main function to analyze XML files and match with Zotero data"""
    # Initialize processors
    xml_processor = XMLProcessor()
    zotero_analyzer = ZoteroAnalyzer()
    
    # Process XML files
    xml_results = xml_processor.process_all_files()
    
    # Match with Zotero data
    matches = {}
    for lib_name in ['portal', 'search']:
        matches[lib_name] = {
            'deduped': xml_processor.match_with_zotero(
                xml_results[lib_name]['deduped'],
                zotero_analyzer.libraries[lib_name]
            ),
            'duplicates': xml_processor.match_with_zotero(
                xml_results[lib_name]['duplicates'],
                zotero_analyzer.libraries[lib_name]
            )
        }
    
    return matches, xml_results

In [8]:
# Import all required libraries
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import ast
import os
import xml.etree.ElementTree as ET
import logging
from datetime import datetime

class XMLProcessor:
    def __init__(self):
        """Initialize the XML processor with logging"""
        self.logger = logging.getLogger('XMLProcessor')
        self.logger.setLevel(logging.DEBUG)
        
        # Add console handler if not already present
        if not self.logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)
            
        self.xml_data = {
            'portal': {
                'deduped': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0909.xml',
                'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_deduplicated 2024-12-02_Time0909.xml'
            },
            'search': {
                'deduped': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0911.xml',
                'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml'
            }
        }
        
    def parse_xml_file(self, file_path):
        """Parse XML file with detailed logging"""
        self.logger.info(f"Attempting to parse: {file_path}")
        
        try:
            if not os.path.exists(file_path):
                self.logger.error(f"File not found: {file_path}")
                return pd.DataFrame()
                
            tree = ET.parse(file_path)
            root = tree.getroot()
            
            # Log namespace information
            self.logger.debug(f"XML namespaces: {root.nsmap if hasattr(root, 'nsmap') else 'No namespaces found'}")
            
            items = []
            for entry in root.findall('.//entry'):
                try:
                    item = {}
                    
                    # Extract and log each field
                    item['title'] = entry.findtext('title', '')
                    item['id'] = entry.findtext('id', '')
                    item['updated'] = entry.findtext('updated', '')
                    
                    # Extract authors
                    authors = []
                    for author in entry.findall('.//author'):
                        authors.append(author.findtext('name', ''))
                    item['authors'] = '; '.join(authors)
                    
                    # Look for DOI in multiple possible locations
                    doi = None
                    doi_paths = [
                        ".//zapi:DOI",
                        ".//DOI",
                        ".//doi",
                        ".//identifier[@type='doi']"
                    ]
                    
                    for path in doi_paths:
                        try:
                            doi_elem = entry.find(path)
                            if doi_elem is not None:
                                doi = doi_elem.text
                                break
                        except Exception:
                            continue
                    
                    item['DOI'] = doi if doi else ''
                    
                    items.append(item)
                    
                except Exception as e:
                    self.logger.error(f"Error processing entry: {str(e)}")
                    continue
            
            df = pd.DataFrame(items)
            self.logger.info(f"Successfully parsed {len(df)} items from {file_path}")
            self.logger.debug(f"Columns found: {df.columns.tolist()}")
            return df
            
        except Exception as e:
            self.logger.error(f"Error parsing {file_path}: {str(e)}")
            return pd.DataFrame()

    def process_all_files(self):
        """Process all XML files with error handling"""
        results = {}
        for lib_name, paths in self.xml_data.items():
            self.logger.info(f"Processing {lib_name} library files")
            results[lib_name] = {}
            
            for data_type, file_path in paths.items():
                self.logger.info(f"Processing {data_type} file for {lib_name}")
                df = self.parse_xml_file(file_path)
                
                if not df.empty:
                    self.logger.info(f"Successfully processed {data_type} file for {lib_name}")
                    self.logger.debug(f"Sample data:\n{df.head()}")
                else:
                    self.logger.warning(f"No data processed for {data_type} file in {lib_name}")
                    
                results[lib_name][data_type] = df
                
        return results

def debug_analyze_xml_and_zotero():
    """Debug version with comprehensive logging"""
    # Set up logging
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler(f'zotero_debug_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
        ]
    )
    logger = logging.getLogger('ZoteroAnalysis')
    
    logger.info("Starting analysis")
    
    try:
        # Initialize processors
        logger.info("Initializing processors")
        xml_processor = XMLProcessor()
        zotero_analyzer = ZoteroAnalyzer()
        
        # Process XML files
        logger.info("Processing XML files")
        xml_results = xml_processor.process_all_files()
        
        # Debug output for XML results
        for lib_name in ['portal', 'search']:
            for data_type in ['deduped', 'duplicates']:
                df = xml_results[lib_name][data_type]
                if not df.empty:
                    logger.info(f"{lib_name} {data_type} data summary:")
                    logger.info(f"Shape: {df.shape}")
                    logger.info(f"Columns: {df.columns.tolist()}")
                    logger.info(f"Sample data:\n{df.head()}")
                else:
                    logger.warning(f"No data for {lib_name} {data_type}")
        
        # Debug output for Zotero data
        for lib_name, df in zotero_analyzer.libraries.items():
            logger.info(f"{lib_name} Zotero data summary:")
            logger.info(f"Shape: {df.shape}")
            logger.info(f"Columns: {df.columns.tolist()}")
            logger.info(f"Sample data:\n{df.head()}")
        
        return xml_results, zotero_analyzer.libraries
        
    except Exception as e:
        logger.error(f"Error in analysis: {str(e)}", exc_info=True)
        return None, None

# Run the debug analysis
xml_results, zotero_data = debug_analyze_xml_and_zotero()

# Print summary of results
if xml_results and zotero_data:
    print("\nAnalysis Summary:")
    for lib_name in ['portal', 'search']:
        print(f"\n{lib_name.title()} Library:")
        for data_type in ['deduped', 'duplicates']:
            if lib_name in xml_results and data_type in xml_results[lib_name]:
                df = xml_results[lib_name][data_type]
                print(f"  {data_type}: {len(df)} entries")
            else:
                print(f"  {data_type}: No data")
        
        if lib_name in zotero_data:
            print(f"  Zotero data: {len(zotero_data[lib_name])} entries")
        else:
            print("  Zotero data: No data")

2024-12-04 12:29:55 - Starting analysis
2024-12-04 12:29:55 - Initializing processors
2024-12-04 12:29:55 - Processing XML files
2024-12-04 12:29:55,615 - XMLProcessor - INFO - Processing portal library files
2024-12-04 12:29:55 - Processing portal library files
2024-12-04 12:29:55,615 - XMLProcessor - INFO - Processing deduped file for portal
2024-12-04 12:29:55 - Processing deduped file for portal
2024-12-04 12:29:55,616 - XMLProcessor - INFO - Attempting to parse: /Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0909.xml
2024-12-04 12:29:55 - Attempting to parse: /Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0909.xml
2024-12-04 12:29:55,628 - XMLProcessor - DEBUG - XML namespaces: No namespaces found
2024-12-04 12:29:55 - XML namespaces: No namespaces found
2024-12-04 12:29:55,629 - XMLProcessor - INFO - Successfully parsed 0 items from /Users/ahmadjalil/D


Analysis Summary:

Portal Library:
  deduped: 0 entries
  duplicates: 0 entries
  Zotero data: 9600 entries

Search Library:
  deduped: 0 entries
  duplicates: 0 entries
  Zotero data: 2700 entries


In [10]:
import pandas as pd
import xml.etree.ElementTree as ET

def parse_record(record):
    record_dict = {}
    
    # Extract elements with possible attributes
    elements_with_attrs = ['database', 'source-app', 'ref-type']
    for elem_name in elements_with_attrs:
        elem = record.find(elem_name)
        if elem is not None:
            # Text content
            record_dict[elem_name] = elem.text.strip() if elem.text else None
            # Attributes
            for attr_name, attr_value in elem.attrib.items():
                record_dict[f'{elem_name}_{attr_name}'] = attr_value.strip()
    
    # Extract simple elements
    simple_elements = ['pages', 'volume', 'number', 'issue', 'isbn',
                       'abstract', 'research-notes', 'language',
                       'electronic-resource-num', 'access-date']
    for elem_name in simple_elements:
        elem = record.find(elem_name)
        if elem is not None and elem.text:
            record_dict[elem_name] = elem.text.strip()
    
    # Extract titles
    titles_elem = record.find('titles')
    if titles_elem is not None:
        for title_type in ['title', 'secondary-title', 'short-title']:
            title_elem = titles_elem.find(title_type)
            if title_elem is not None and title_elem.text:
                record_dict[title_type] = title_elem.text.strip()
    
    # Extract periodical information
    periodical_elem = record.find('periodical')
    if periodical_elem is not None:
        for periodical_type in ['full-title', 'abbr-1']:
            periodical_sub_elem = periodical_elem.find(periodical_type)
            if periodical_sub_elem is not None and periodical_sub_elem.text:
                record_dict[periodical_type] = periodical_sub_elem.text.strip()
    
    # Extract authors
    authors = []
    for author_elem in record.findall('contributors/authors/author'):
        if author_elem.text:
            authors.append(author_elem.text.strip())
    record_dict['authors'] = '; '.join(authors) if authors else None
    
    # Extract keywords
    keywords = []
    for keyword_elem in record.findall('keywords/keyword'):
        if keyword_elem.text:
            keywords.append(keyword_elem.text.strip())
    record_dict['keywords'] = '; '.join(keywords) if keywords else None
    
    # Extract dates
    year_elem = record.find('dates/year')
    if year_elem is not None and year_elem.text:
        record_dict['year'] = year_elem.text.strip()
    pub_date_elem = record.find('dates/pub-dates/date')
    if pub_date_elem is not None and pub_date_elem.text:
        record_dict['pub-date'] = pub_date_elem.text.strip()
    
    # Extract URL
    url_elem = record.find('urls/web-urls/url')
    if url_elem is not None and url_elem.text:
        record_dict['url'] = url_elem.text.strip()
    
    return record_dict

def parse_xml_to_dataframe(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
    except ET.ParseError as e:
        print(f"Failed to parse XML file {file_path}: {e}")
        return pd.DataFrame()
    
    records = []
    for record in root.findall('.//record'):
        record_dict = parse_record(record)
        records.append(record_dict)
    
    df = pd.DataFrame(records)
    return df
# File paths dictionary
xml_data = {
    'portal': {
        'deduped': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0909.xml',
        'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_deduplicated 2024-12-02_Time0909.xml'
    },
    'search': {
        'deduped': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_removed_duplicates 2024-12-02_Time0911.xml',
        'duplicates': '/Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml'
    }
}

# Parse and display the head of each DataFrame
for category, paths in xml_data.items():
    for status, path in paths.items():
        print(f"\n{category.upper()} - {status.upper()}:")
        try:
            df = parse_xml_to_dataframe(path)
            if not df.empty:
                print(df.head())
            else:
                print(f"No data found in {path}")
        except Exception as e:
            print(f"Failed to process {path}: {e}")



PORTAL - DEDUPED:
      database database_name   database_path source-app source-app_name  \
0  EndNote.enl   EndNote.enl  c:\EndNote.enl    EndNote         EndNote   
1  EndNote.enl   EndNote.enl  c:\EndNote.enl    EndNote         EndNote   
2  EndNote.enl   EndNote.enl  c:\EndNote.enl    EndNote         EndNote   
3  EndNote.enl   EndNote.enl  c:\EndNote.enl    EndNote         EndNote   
4  EndNote.enl   EndNote.enl  c:\EndNote.enl    EndNote         EndNote   

  source-app_version ref-type    ref-type_name authors keywords  
0               16.0       17  Journal Article    None     None  
1               16.0       17  Journal Article    None     None  
2               16.0       17  Journal Article    None     None  
3               16.0       17  Journal Article    None     None  
4               16.0       17  Journal Article    None     None  

PORTAL - DUPLICATES:
      database database_name   database_path source-app source-app_name  \
0  EndNote.enl   EndNote.enl  c:\EndN