In [None]:
import xml.etree.ElementTree as ET
from collections import defaultdict
import pandas as pd
import os
import re
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import networkx as nx
import traceback

class ZoteroIntegrator:
    """Combine an RDF library export with a deduplicated XML export.

    The RDF side supplies the collection structure (``z:Collection`` elements
    and their ``dcterms:hasPart`` references); the XML side supplies flat
    ``<record>`` entries.  Entries are matched across both by normalized title.
    """

    def __init__(self):
        # Namespace URIs used to build qualified tag and attribute names.
        self.namespaces = {
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'z': 'http://www.zotero.org/namespaces/export#',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'bib': 'http://purl.org/net/biblio#',
            'dcterms': 'http://purl.org/dc/terms/',
            'foaf': 'http://xmlns.com/foaf/0.1/'
        }

    def _qname(self, prefix, local):
        """Return the ``{namespace-uri}local`` name for a registered prefix."""
        return f"{{{self.namespaces[prefix]}}}{local}"

    def _find_title(self, element):
        """Return the text of ``element``'s title, or None if it has none.

        The element's own (direct child) ``dc:title`` is preferred.  Only when
        that is missing or empty is the first nested ``dc:title`` used, so a
        title belonging to a nested element does not shadow the entry's own.
        """
        tag = self._qname('dc', 'title')
        node = element.find(tag)
        if node is None or not node.text:
            node = element.find(f".//{tag}")
        if node is None or not node.text:
            return None
        return node.text

    @staticmethod
    def _style_text(element):
        """Return the stripped text of the first ``<style>`` under ``element``.

        Returns None when ``element`` is None, has no ``<style>`` descendant,
        or that descendant has no text.
        """
        if element is None:
            return None
        style = element.find('.//style')
        if style is None or not style.text:
            return None
        return style.text.strip()

    @staticmethod
    def _collections_by_title(items, coll_items):
        """Map each normalized title to the collection titles that contain it.

        Collection titles are listed in ``coll_items`` iteration order and at
        most once per normalized title.
        """
        index = defaultdict(list)
        for coll_title, member_ids in coll_items.items():
            titles_here = {
                items[member].get('normalized_title')
                for member in member_ids if member in items
            }
            titles_here.discard(None)
            for normalized in titles_here:
                index[normalized].append(coll_title)
        return index

    @staticmethod
    def _collection_stats(library, items, coll_items):
        """Return one statistics row per collection title for ``library``."""
        return [
            {
                'Collection': coll_title,
                'Library': library,
                'Total Items': len(member_ids),
                'Unique Items': len({
                    items[member]['normalized_title']
                    for member in member_ids if member in items
                })
            }
            for coll_title, member_ids in coll_items.items()
        ]

    def normalize_title(self, title):
        """Return ``title`` lower-cased, without punctuation, single-spaced.

        A falsy title (None or empty string) yields an empty string.
        """
        if not title:
            return ""
        title = re.sub(r'[^\w\s]', '', title.lower())
        return ' '.join(title.split())

    def extract_from_rdf(self, rdf_path):
        """Extract collection structure and items from an RDF export.

        Args:
            rdf_path: Path of the RDF/XML file to parse.

        Returns:
            A tuple ``(collections, items, collection_items)``:

            * ``collections`` maps a collection's ``rdf:about`` id to a dict
              with ``'title'``, ``'items'`` (set of member ids that are not
              themselves collections) and ``'parent'`` (id of the collection
              that lists it via ``hasPart``, or None).
            * ``items`` maps the ``rdf:about`` id of every titled,
              non-collection element to a dict with ``'title'``,
              ``'normalized_title'`` and ``'collections'`` (titles of the
              collections that list it).
            * ``collection_items`` maps a collection *title* to the union of
              member ids of all collections with that title; collections
              without members get no entry.

            On any error the problem is printed and empty containers are
            returned instead of raising.
        """
        try:
            print(f"\nProcessing RDF: {rdf_path}")
            root = ET.parse(rdf_path).getroot()

            about_attr = self._qname('rdf', 'about')
            resource_attr = self._qname('rdf', 'resource')
            collection_tag = self._qname('z', 'Collection')
            has_part_path = f".//{self._qname('dcterms', 'hasPart')}"

            print("Extracting collections...")
            collections = {}
            collection_items = defaultdict(set)

            # Pass 1: record every titled collection with its raw references.
            for collection in root.findall(f".//{collection_tag}"):
                coll_id = collection.get(about_attr)
                if not coll_id:
                    continue
                coll_title = self._find_title(collection)
                if coll_title is None:
                    continue
                members = {
                    ref.get(resource_attr)
                    for ref in collection.findall(has_part_path)
                }
                collections[coll_id] = {
                    'title': coll_title,
                    'items': {member for member in members if member},
                    'parent': None
                }

            # Pass 2: a reference to another known collection is a
            # parent/child link, not an item.  This needs a second pass
            # because a child may be defined after its parent in the file.
            for coll_id, coll in collections.items():
                children = {m for m in coll['items'] if m in collections}
                for child_id in children:
                    if child_id != coll_id:
                        collections[child_id]['parent'] = coll_id
                coll['items'] -= children
                if coll['items']:
                    collection_items[coll['title']].update(coll['items'])

            print(f"Found {len(collections)} collections")

            print("Extracting items...")
            items = {}
            for element in root.findall(".//*[@rdf:about]", self.namespaces):
                if element.tag == collection_tag:
                    continue  # collections are containers, not library items
                item_id = element.get(about_attr)
                item_title = self._find_title(element)
                if item_id and item_title is not None:
                    items[item_id] = {
                        'title': item_title,
                        'normalized_title': self.normalize_title(item_title),
                        'collections': set()
                    }

            # Back-fill each item's collection membership.
            for coll in collections.values():
                for member in coll['items']:
                    if member in items:
                        items[member]['collections'].add(coll['title'])

            print(f"Found {len(items)} items")
            return collections, items, collection_items

        except Exception as e:
            print(f"Error in extract_from_rdf: {str(e)}")
            traceback.print_exc()
            return {}, {}, defaultdict(set)

    def extract_from_dedup_xml(self, xml_path):
        """Extract titled records from a deduplicated XML export.

        Field text is read from the first ``<style>`` element nested under
        each field element.

        Args:
            xml_path: Path of the XML file to parse.

        Returns:
            A list of dicts, one per ``<record>`` that has a non-blank title.
            Each dict always has ``'title'``, ``'normalized_title'`` and
            ``'authors'`` (possibly empty list); ``'journal'``, ``'abstract'``,
            ``'doi'`` and ``'type'`` are present only when found.  A record
            that fails to parse is reported and skipped; if the file itself
            cannot be parsed an empty list is returned.
        """
        try:
            print(f"\nProcessing deduplicated XML: {xml_path}")
            root = ET.parse(xml_path).getroot()

            records = []
            for record in root.findall('.//record'):
                try:
                    rec_data = {}

                    titles = record.find('.//titles')
                    if titles is not None:
                        # Primary title
                        title = self._style_text(titles.find('title'))
                        if title is not None:
                            rec_data['title'] = title
                            rec_data['normalized_title'] = self.normalize_title(title)

                        # Secondary title (journal)
                        journal = self._style_text(titles.find('secondary-title'))
                        if journal is not None:
                            rec_data['journal'] = journal

                    authors = []
                    contributors = record.find('.//contributors')
                    if contributors is not None:
                        for author in contributors.findall('.//author'):
                            name = self._style_text(author)
                            if name is not None:
                                authors.append(name)
                    rec_data['authors'] = authors

                    abstract = self._style_text(record.find('.//abstract'))
                    if abstract is not None:
                        rec_data['abstract'] = abstract

                    # DOI/URL
                    doi = self._style_text(record.find('.//electronic-resource-num'))
                    if doi is not None:
                        rec_data['doi'] = doi

                    ref_type = record.find('.//ref-type')
                    if ref_type is not None:
                        rec_data['type'] = ref_type.get('name')

                    # Keep only records with a non-blank title; a
                    # whitespace-only title would match every other blank one.
                    if rec_data.get('title'):
                        records.append(rec_data)

                except Exception as e:
                    print(f"Error processing record: {str(e)}")
                    continue

            print(f"Extracted {len(records)} records from XML")
            return records

        except Exception as e:
            print(f"Error in extract_from_dedup_xml: {str(e)}")
            traceback.print_exc()
            return []

    def visualize_hierarchy(self, collections, title, output_path):
        """Draw the collection hierarchy as a directed graph and save it.

        Nodes are collection titles; an edge runs from a parent's title to a
        child's title when the child's ``'parent'`` id is a known collection.

        Args:
            collections: Mapping as returned first by ``extract_from_rdf``.
            title: Figure title.
            output_path: File the figure is saved to.

        Errors are printed rather than raised.
        """
        try:
            print(f"\nCreating hierarchy visualization: {title}")
            graph = nx.DiGraph()

            for coll in collections.values():
                graph.add_node(coll['title'])
                parent_id = coll.get('parent')
                if parent_id and parent_id in collections:
                    graph.add_edge(collections[parent_id]['title'], coll['title'])

            plt.figure(figsize=(15, 10))
            try:
                pos = nx.spring_layout(graph)
                nx.draw(graph, pos, with_labels=True, node_color='lightblue',
                        node_size=2000, font_size=8, font_weight='bold',
                        arrows=True)
                plt.title(title)
                plt.savefig(output_path)
            finally:
                # Release the figure even when drawing or saving fails.
                plt.close()
            print(f"Saved visualization to: {output_path}")

        except Exception as e:
            print(f"Error in visualize_hierarchy: {str(e)}")
            traceback.print_exc()

    def create_analysis(self, portal_data, search_data, output_dir):
        """Write hierarchy figures and an Excel report for both libraries.

        Args:
            portal_data: Dict with ``'rdf'`` (the tuple returned by
                ``extract_from_rdf``) and ``'xml'`` (the list returned by
                ``extract_from_dedup_xml``) for the Portal library.
            search_data: Same structure for the Search library.
            output_dir: Directory receiving ``visualizations/`` and
                ``integrated_analysis.xlsx``.

        The workbook has the sheets 'Collection Stats', 'Duplicates' (titles
        present in both libraries' XML records, with the collections holding
        them) and 'Summary'.  Errors are printed rather than raised.
        """
        try:
            print("\nCreating integrated analysis...")
            collections_p, items_p, coll_items_p = portal_data['rdf']
            collections_s, items_s, coll_items_s = search_data['rdf']
            records_p = portal_data['xml']
            records_s = search_data['xml']

            vis_dir = os.path.join(output_dir, 'visualizations')
            os.makedirs(vis_dir, exist_ok=True)

            print("Creating visualizations...")
            self.visualize_hierarchy(collections_p, 'Portal Collections',
                                     os.path.join(vis_dir, 'portal_hierarchy.png'))
            self.visualize_hierarchy(collections_s, 'Search Collections',
                                     os.path.join(vis_dir, 'search_hierarchy.png'))

            print("Creating Excel report...")
            excel_path = os.path.join(output_dir, 'integrated_analysis.xlsx')
            with pd.ExcelWriter(excel_path) as writer:
                print("- Creating collection statistics...")
                coll_stats = (
                    self._collection_stats('Portal', items_p, coll_items_p)
                    + self._collection_stats('Search', items_s, coll_items_s)
                )
                pd.DataFrame(coll_stats).to_excel(writer, sheet_name='Collection Stats', index=False)

                print("- Analyzing duplicates...")
                portal_titles = {r['normalized_title'] for r in records_p}
                search_titles = {r['normalized_title'] for r in records_s}
                duplicates = portal_titles.intersection(search_titles)

                # Index once instead of rescanning every collection per title.
                portal_index = self._collections_by_title(items_p, coll_items_p)
                search_index = self._collections_by_title(items_s, coll_items_s)
                dupes_data = [
                    {
                        'Title': dup_title,
                        'Portal Collections': '; '.join(portal_index.get(dup_title, [])),
                        'Search Collections': '; '.join(search_index.get(dup_title, []))
                    }
                    # Sorted so the sheet is identical from run to run.
                    for dup_title in sorted(duplicates)
                ]
                pd.DataFrame(dupes_data).to_excel(writer, sheet_name='Duplicates', index=False)

                print("- Creating summary...")
                summary = pd.DataFrame([
                    {'Metric': 'Total Portal Items', 'Value': len(records_p)},
                    {'Metric': 'Total Search Items', 'Value': len(records_s)},
                    {'Metric': 'Duplicate Items', 'Value': len(duplicates)},
                    {'Metric': 'Portal Collections', 'Value': len(collections_p)},
                    {'Metric': 'Search Collections', 'Value': len(collections_s)}
                ])
                summary.to_excel(writer, sheet_name='Summary', index=False)

            print(f"Analysis saved to: {excel_path}")

        except Exception as e:
            print(f"Error in create_analysis: {str(e)}")
            traceback.print_exc()

def main(base_dir='/Users/ahmadjalil/Desktop/Zotero Project'):
    """Run the full Portal/Search integration and write the results.

    Args:
        base_dir: Project directory holding the 'Nechako Portal' and
            'Nechako Saturation Search' export folders.  Defaults to the
            original author's path so calling ``main()`` behaves as before;
            pass another directory to run the analysis elsewhere.

    Results are written to ``<base_dir>/Integrated_Results``.  Any exception
    is printed with its traceback instead of propagating.
    """
    try:
        files = {
            'portal': {
                'rdf': os.path.join(base_dir, 'Nechako Portal/Nechako Portal.rdf'),
                'xml': os.path.join(base_dir, 'Nechako Portal/Deduplicator/Untitled_deduplicated 2024-12-02_Time0909.xml')
            },
            'search': {
                'rdf': os.path.join(base_dir, 'Nechako Saturation Search/Nechako Saturation Search (2024-04).rdf'),
                'xml': os.path.join(base_dir, 'Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml')
            }
        }

        integrator = ZoteroIntegrator()

        # Process Portal library
        print("\nProcessing Portal library...")
        portal_data = {
            'rdf': integrator.extract_from_rdf(files['portal']['rdf']),
            'xml': integrator.extract_from_dedup_xml(files['portal']['xml'])
        }

        # Process Search library
        print("\nProcessing Search library...")
        search_data = {
            'rdf': integrator.extract_from_rdf(files['search']['rdf']),
            'xml': integrator.extract_from_dedup_xml(files['search']['xml'])
        }

        # Generate analysis
        output_dir = os.path.join(base_dir, 'Integrated_Results')
        os.makedirs(output_dir, exist_ok=True)

        print("\nGenerating integrated analysis...")
        integrator.create_analysis(portal_data, search_data, output_dir)

        print("\nAnalysis complete! Results saved to:", output_dir)
        print("\nCreated:")
        print("1. Collection hierarchy visualizations")
        print("2. Integrated analysis Excel file with:")
        print("   - Collection statistics")
        print("   - Duplicate analysis")
        print("   - Summary metrics")

    except Exception as e:
        print(f"Error in main: {str(e)}")
        traceback.print_exc()

# Run the integration only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


Processing Portal library...

Processing RDF: /Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Nechako Portal.rdf

Processing deduplicated XML: /Users/ahmadjalil/Desktop/Zotero Project/Nechako Portal/Deduplicator/Untitled_deduplicated 2024-12-02_Time0909.xml

Processing Search library...

Processing RDF: /Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Nechako Saturation Search (2024-04).rdf

Processing deduplicated XML: /Users/ahmadjalil/Desktop/Zotero Project/Nechako Saturation Search/Deduplicator/Untitled_deduplicated 2024-12-02_Time0911.xml

Generating integrated analysis...
Error in main: 'ZoteroIntegrator' object has no attribute 'create_analysis'


Traceback (most recent call last):
  File "/var/folders/q8/4q55gl35679357bs417130z40000gn/T/ipykernel_94336/541069924.py", line 238, in main
    integrator.create_analysis(portal_data, search_data, output_dir)
    ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'ZoteroIntegrator' object has no attribute 'create_analysis'


In [None]:
!pip install plotly seaborn networkx

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import defaultdict
import plotly.graph_objects as go
import plotly.express as px

def _library_item_rows(records, items, coll_items, library):
    """Return one row dict per (XML record, matching collection) pair.

    A record matches a collection when some member of that collection that is
    present in ``items`` has the same normalized title as the record.
    """
    # Index normalized title -> collection titles once, so each record is a
    # dictionary lookup instead of a scan over every collection member.
    collections_by_title = defaultdict(list)
    for coll_title, member_ids in coll_items.items():
        titles_here = {
            items[member].get('normalized_title')
            for member in member_ids if member in items
        }
        titles_here.discard(None)
        for normalized in titles_here:
            collections_by_title[normalized].append(coll_title)

    rows = []
    for record in records:
        matches = collections_by_title.get(record.get('normalized_title'), ())
        for coll_title in matches:
            rows.append({
                'Title': record.get('title', ''),
                'normalized_title': record.get('normalized_title', ''),
                'Collections': coll_title,
                'library': library,
                'type': record.get('type', ''),
                'authors': record.get('authors', []),
                'doi': record.get('doi', ''),
                'journal': record.get('journal', '')
            })
    return rows


def convert_to_dataframes(portal_data, search_data):
    """Convert the extracted data into pandas DataFrames for analysis.

    Args:
        portal_data: Dict with ``'rdf'`` holding a
            ``(collections, items, collection_items)`` tuple and ``'xml'``
            holding a list of record dicts, for the Portal library.
        search_data: Same structure for the Search library.

    Returns:
        ``(df_items, df_collections)``.  ``df_items`` has one row per XML
        record per collection that contains an item with the same normalized
        title (Portal rows first, then Search).  ``df_collections`` has one
        row per collection title and library with its member count.  Both
        frames always carry their full column set, even when empty, so
        downstream column lookups do not fail on empty input.
    """
    _, items_p, coll_items_p = portal_data['rdf']
    _, items_s, coll_items_s = search_data['rdf']

    item_rows = (
        _library_item_rows(portal_data['xml'], items_p, coll_items_p, 'Portal')
        + _library_item_rows(search_data['xml'], items_s, coll_items_s, 'Search')
    )
    df_items = pd.DataFrame(item_rows, columns=[
        'Title', 'normalized_title', 'Collections', 'library',
        'type', 'authors', 'doi', 'journal'
    ])

    collection_rows = [
        {'Collection': coll_title, 'Library': library, 'Items': len(member_ids)}
        for library, coll_items in (('Portal', coll_items_p), ('Search', coll_items_s))
        for coll_title, member_ids in coll_items.items()
    ]
    df_collections = pd.DataFrame(
        collection_rows, columns=['Collection', 'Library', 'Items']
    )

    return df_items, df_collections

def analyze_duplicate_concentrations(df_items, df_collections):
    """Show which collections hold the most repeated titles.

    A row of ``df_items`` counts as a duplicate when its ``normalized_title``
    occurs in more than one row.  The function draws a bar chart of duplicate
    rows per collection and prints, for each collection with duplicates, its
    total rows, duplicate rows and the duplicate percentage.

    Args:
        df_items: Frame with at least the columns ``'normalized_title'``,
            ``'Collections'`` and ``'Title'``.
        df_collections: Accepted for interface compatibility; not used.

    Returns:
        None.  When no title repeats, a message is printed and nothing is
        plotted.
    """
    print("\nAnalyzing duplicate concentrations...")

    dupes = df_items[df_items.duplicated(subset='normalized_title', keep=False)]
    if dupes.empty:
        # There is nothing to chart or summarise.
        print("No duplicate titles found; nothing to plot.")
        return

    # Duplicate rows per collection, largest first.
    dupe_counts = dupes.groupby('Collections')['Title'].count().sort_values(ascending=False)

    plt.figure(figsize=(15, 8))
    dupe_counts.plot(kind='bar')
    plt.title('Duplicate Concentration by Collection')
    plt.xlabel('Collection')
    plt.ylabel('Number of Duplicates')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    print("\nDuplicate Statistics by Collection:")
    total_items = df_items.groupby('Collections')['Title'].count()
    for coll in dupe_counts.index:
        print(f"\n{coll}:")
        print(f"Total items: {total_items[coll]}")
        print(f"Duplicates: {dupe_counts[coll]}")
        print(f"Duplicate percentage: {(dupe_counts[coll]/total_items[coll]*100):.1f}%")

def visualize_collection_relationships(df_items):
    """Plot a heatmap of how many repeated titles each pair of collections shares.

    Cell (A, B) counts the repeated normalized titles that occur in both
    collection A and collection B; the diagonal is left at zero.
    """
    # Square matrix over every collection name, initialised to zero.
    names = df_items['Collections'].unique()
    overlap_matrix = pd.DataFrame(0, index=names, columns=names)

    # Titles that occur on more than one row.
    repeated_mask = df_items.duplicated(subset='normalized_title', keep=False)
    repeated_titles = df_items.loc[repeated_mask, 'normalized_title'].unique()

    for normalized in repeated_titles:
        same_title = df_items['normalized_title'] == normalized
        sharing = df_items.loc[same_title, 'Collections'].unique()
        # Count every ordered pair of distinct collections holding this title.
        for row_name in sharing:
            for col_name in sharing:
                if row_name == col_name:
                    continue
                overlap_matrix.loc[row_name, col_name] += 1

    plt.figure(figsize=(15, 15))
    sns.heatmap(overlap_matrix, annot=True, fmt='g', cmap='YlOrRd')
    plt.title('Collection Overlap Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def analyze_document_types(df_items):
    """Chart and print the document types of rows with repeated titles.

    Rows whose ``normalized_title`` occurs more than once are grouped by
    ``'type'`` and ``'library'``; the counts are drawn as a stacked bar chart
    and printed per type and library.

    Args:
        df_items: Frame with at least the columns ``'normalized_title'``,
            ``'type'``, ``'library'`` and ``'Title'``.

    Returns:
        None.  When no title repeats, a message is printed and nothing is
        plotted.
    """
    dupes = df_items[df_items.duplicated(subset='normalized_title', keep=False)]
    if dupes.empty:
        print("\nNo duplicate titles found; nothing to plot.")
        return

    type_counts = dupes.groupby(['type', 'library'])['Title'].count().unstack(fill_value=0)

    # DataFrame.plot opens its own figure when no ``ax`` is given, so the size
    # is passed here rather than via a separate plt.figure() call, which
    # would only leave an extra blank figure behind.
    type_counts.plot(kind='bar', stacked=True, figsize=(12, 6))
    plt.title('Document Types in Duplicates')
    plt.xlabel('Document Type')
    plt.ylabel('Number of Items')
    plt.legend(title='Library')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    print("\nDocument Type Statistics in Duplicates:")
    for doc_type in type_counts.index:
        print(f"\n{doc_type}:")
        for library in type_counts.columns:
            print(f"{library}: {type_counts.loc[doc_type, library]} items")


def analyze_categorization_patterns(df_items):
    """Analyze how shared items are categorized in each library.

    For every repeated normalized title that has at least one Portal row and
    one Search row, the first Portal collection and the first Search
    collection form a pattern.  Pattern counts are drawn as a heatmap and the
    ten most frequent patterns are printed.

    Args:
        df_items: Frame with at least the columns ``'normalized_title'``,
            ``'library'`` and ``'Collections'``.

    Returns:
        None.  When no title is shared between the two libraries, a message
        is printed and nothing is plotted.
    """
    dupes = df_items[df_items.duplicated(subset='normalized_title', keep=False)]

    categorization_patterns = defaultdict(int)
    for title in dupes['normalized_title'].unique():
        items = dupes[dupes['normalized_title'] == title]
        portal_cats = items.loc[items['library'] == 'Portal', 'Collections']
        search_cats = items.loc[items['library'] == 'Search', 'Collections']
        # A title can repeat inside a single library (for example the same
        # item filed in two collections); such titles have no cross-library
        # pattern and are skipped.
        if portal_cats.empty or search_cats.empty:
            continue
        categorization_patterns[(portal_cats.iloc[0], search_cats.iloc[0])] += 1

    if not categorization_patterns:
        print("\nNo items shared between the Portal and Search libraries.")
        return

    plt.figure(figsize=(15, 8))
    patterns_df = pd.DataFrame(
        [(p[0], p[1], c) for p, c in categorization_patterns.items()],
        columns=['Portal Category', 'Search Category', 'Count']
    )

    sns.heatmap(
        patterns_df.pivot(
            index='Portal Category',
            columns='Search Category',
            values='Count'
        ),
        annot=True,
        fmt='g',
        cmap='YlOrRd'
    )
    plt.title('Categorization Patterns Between Libraries')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Ten most frequent patterns, most common first.
    print("\nNotable Categorization Patterns:")
    for (portal_cat, search_cat), count in sorted(
        categorization_patterns.items(),
        key=lambda x: x[1],
        reverse=True
    )[:10]:
        print(f"\nItems categorized as:")
        print(f"Portal: {portal_cat}")
        print(f"Search: {search_cat}")
        print(f"Count: {count} items")

# Example usage:
# df_items and df_collections must already exist, e.g. as the two frames
# returned by convert_to_dataframes(portal_data, search_data).
analyze_duplicate_concentrations(df_items, df_collections)
visualize_collection_relationships(df_items)
analyze_document_types(df_items)
analyze_categorization_patterns(df_items)



NameError: name 'portal_data' is not defined