In [5]:
# test_citations.py

"""
Unit Tests for Citation Formatting - ZERO API COST

Run with: python test_citations.py
Or with pytest: pytest test_citations.py -v

These tests validate IEEE/APA citation formatting without making any API calls.
"""

import unittest
import re
from typing import Dict


# ================================================================================
# COPY OF CURRENT FORMATTING FUNCTIONS (for comparison)
# ================================================================================

def format_citation_ieee_original(source: Dict, index: int) -> str:
    """
    Baseline IEEE formatter, kept only so its output can be compared
    against the adapted formatter.

    Reads 'authors', 'title', 'venue' and 'year' from source['metadata']
    and 'url' from source, substituting defaults for anything missing.
    The venue is wrapped in <i> tags and the URL is rendered as an HTML
    anchor after "[Online]. Available:".
    """
    metadata = source.get('metadata', {})
    link = source.get('url', '')
    venue = metadata.get('venue', 'Academic Publication')
    year = metadata.get('year', '2024')

    # Blank or "unknown" authors are attributed to the venue instead.
    byline = metadata.get('authors', 'Research Team')
    if not byline or byline.lower() in ('unknown', 'author unknown'):
        byline = venue + ' Authors'

    # Blank or "unknown" titles fall back to a generic label.
    heading = metadata.get('title', 'Research Article')
    if not heading or heading.lower() == 'unknown':
        heading = 'Research Article'

    reference = f'[{index}] {byline}, "{heading}," <i>{venue}</i>, {year}.'
    anchor = f'<a href="{link}" target="_blank">{link}</a>'
    return f'{reference} [Online]. Available: {anchor}'


# ================================================================================
# IMPROVED IEEE FORMATTING FUNCTION (ADAPTED TO USER'S CUSTOM STYLE)
# ================================================================================

def format_author_ieee(name: str) -> str:
    """
    Return a single author name in the user's preferred form: the full
    name exactly as supplied, with surrounding whitespace removed.

    No conversion to initials is performed.

    Examples:
        "John Smith"       -> "John Smith"
        "  John Smith  "   -> "John Smith"
        "J. Smith"         -> "J. Smith"
        ""                 -> ""
    """
    trimmed = name.strip()
    return trimmed


# Institutional/organizational names that should NOT be converted to initials
INSTITUTIONAL_NAMES = {
    'research team', 'authors', 'contributors', 'editors', 'staff',
    'ieee authors', 'acm authors', 'arxiv contributors', 'nature authors',
    'academic publication authors', 'university', 'institute', 'laboratory',
    'organization', 'consortium', 'group', 'committee', 'department'
}

# Trailing words that mark a name as institutional. The leading \b makes the
# match whole-word, so a surname that merely ends in the same letters
# (e.g. "Wagstaff" ends in "staff") is not mistaken for an organization.
_INSTITUTIONAL_SUFFIX_RE = re.compile(
    r'\b(?:authors|contributors|team|staff|editors|group)$'
)


def is_institutional_name(name: str) -> bool:
    """
    Check if name is an institutional/organizational name that shouldn't be formatted.

    A name is institutional when, after lower-casing and trimming, it either
    equals an entry of INSTITUTIONAL_NAMES or its final word is one of
    'authors', 'contributors', 'team', 'staff', 'editors', 'group'.

    Args:
        name: Candidate author string.

    Returns:
        True for institutional names, False otherwise (including for "").
    """
    name_lower = name.lower().strip()
    # Check direct matches
    if name_lower in INSTITUTIONAL_NAMES:
        return True
    # Check whether the last whole word is a common institutional suffix
    return _INSTITUTIONAL_SUFFIX_RE.search(name_lower) is not None


def format_authors_ieee(authors_str: str) -> str:
    """
    Format multiple authors for user's custom IEEE style.
    Keeps full names, handles et al., and institutional names.

    Args:
        authors_str: Raw author string. Individual authors may be separated
            by commas, by " and ", or by an Oxford comma (", and ").

    Returns:
        "Research Team" when the input is empty or yields no names; the
        input unchanged when it is an institutional name or an "et al."
        string that is not of the simple "<first author> et al." shape;
        otherwise the names joined as "A", "A and B" or "A, B, and C".

    Examples:
        "John Smith" -> "John Smith"
        "John Smith, Jane Doe" -> "John Smith and Jane Doe"
        "A One, B Two, and C Three" -> "A One, B Two, and C Three"
        "IEEE Authors" -> "IEEE Authors" (institutional, not formatted)
        "John Smith et al." -> "John Smith et al."
    """
    if not authors_str:
        return "Research Team"

    # Check if this is an institutional name - don't format these
    if is_institutional_name(authors_str):
        return authors_str

    # Handle "et al." cases - extract first author before "et al".
    # Whole-word search: a plain substring test would also fire on names that
    # merely contain the letters, e.g. "Violet Allen" ("...et Al...").
    if re.search(r'\bet\s+al\b', authors_str, re.IGNORECASE):
        # Match first author, stopping before "et al"
        match = re.match(r'^([^,]+?)(?:\s+et\s+al\.?)$', authors_str, re.IGNORECASE)
        if match:
            first_author = match.group(1).strip()
            # In this custom style the first author's full name is kept
            if not is_institutional_name(first_author):
                first_author = format_author_ieee(first_author)
            return f"{first_author} et al."
        # Anything more complex (e.g. several names before "et al.") is left as-is
        return authors_str

    # Split by comma or "and". A comma may be followed by "and" (Oxford comma);
    # consuming it here keeps "and" from being glued onto the last author.
    authors = re.split(r'\s*,\s*(?:and\s+)?|\s+and\s+', authors_str)
    authors = [a.strip() for a in authors if a.strip()]

    if not authors:
        return "Research Team"

    # Format each author (institutional names are kept verbatim)
    formatted = [
        a if is_institutional_name(a) else format_author_ieee(a)
        for a in authors
    ]

    if len(formatted) == 1:
        return formatted[0]
    if len(formatted) == 2:
        return f"{formatted[0]} and {formatted[1]}"
    return ', '.join(formatted[:-1]) + ', and ' + formatted[-1]


def format_citation_ieee_fixed(source: Dict, index: int) -> str:
    """
    Format citation in user's custom IEEE style - ADAPTED VERSION

    Output shape (the second line appears only when a URL is present):
        [N] Authors, "Title," Venue, Year.
        Link: URL

    Key rules applied:
    1. Authors as full names (human names only, 'et al.' preserved).
    2. Institutional names preserved as-is.
    3. Comma INSIDE closing quotation mark for title (e.g., "Title,").
    4. Journal/venue NOT italicized.
    5. No HTML '[Online]. Available: <a ...>' suffix; the URL, if any, is
       appended as plain text on its own 'Link:' line.

    NOTE(review): the original docstring said this matches the output of
    plos_utils.py - that module is not visible here, so confirm.

    Args:
        source: Dict with an optional 'url' and an optional 'metadata' dict
            holding 'authors', 'title', 'venue' and 'year'.
        index: Reference number shown in the leading brackets.

    Returns:
        The formatted citation string.
    """
    # `or {}` also covers an explicit `'metadata': None`
    meta = source.get('metadata') or {}
    authors = meta.get('authors', 'Research Team')
    title = meta.get('title', 'Research Article')
    venue = meta.get('venue', 'Academic Publication')
    year = meta.get('year', '2024')
    url = source.get('url', '')

    # Ensure no 'unknown' values - use venue-based institutional attribution
    if not authors or authors.lower() in ['unknown', 'author unknown']:
        authors = venue + ' Authors'

    if not title or title.lower() == 'unknown':
        title = 'Research Article'

    # Format authors using the adapted IEEE style (full names)
    formatted_authors = format_authors_ieee(authors)

    # Custom format: [N] Authors, "Title," Venue, Year.
    citation = f'[{index}] {formatted_authors}, "{title}," {venue}, {year}.'

    # Only add the link line when there is a URL, so a missing URL does not
    # leave a dangling "Link: " tail. The separator is kept exactly as before
    # so output for sources that do have a URL is unchanged.
    if url:
        citation += f' \nLink: {url}'

    return citation


# ================================================================================
# TEST CASES (ADAPTED TO USER'S CUSTOM STYLE)
# ================================================================================

class TestAuthorFormatting(unittest.TestCase):
    """Single-name checks: format_author_ieee must hand names back as full names"""

    # Method docstrings are left verbatim: unittest prints them in verbose mode.

    def test_single_name(self):
        """Single word names should pass through unchanged"""
        mononym = "Einstein"
        self.assertEqual(format_author_ieee(mononym), mononym)

    def test_two_part_name(self):
        """First Last -> First Last"""
        full_name = "John Smith"
        self.assertEqual(format_author_ieee(full_name), full_name)

    def test_three_part_name(self):
        """First Middle Last -> First Middle Last"""
        full_name = "John David Smith"
        self.assertEqual(format_author_ieee(full_name), full_name)

    def test_already_formatted(self):
        """Already formatted names should pass through as-is"""
        abbreviated = "J. Smith"
        self.assertEqual(format_author_ieee(abbreviated), abbreviated)

    def test_empty_name(self):
        """Empty string should return empty"""
        blank = ""
        self.assertEqual(format_author_ieee(blank), blank)


class TestMultipleAuthors(unittest.TestCase):
    """Author-list checks: separators, 'et al.', defaults and institutional names"""

    # Method docstrings are left verbatim: unittest prints them in verbose mode.

    def test_single_author(self):
        """Single author formatting (full name)"""
        lone_author = "John Smith"
        self.assertEqual(format_authors_ieee(lone_author), lone_author)

    def test_two_authors(self):
        """Two authors: 'A and B' format (full names)"""
        joined = format_authors_ieee("John Smith, Jane Doe")
        self.assertEqual(joined, "John Smith and Jane Doe")

    def test_three_authors(self):
        """Three authors: 'A, B, and C' format (full names)"""
        joined = format_authors_ieee("John Smith, Jane Doe, Bob Wilson")
        self.assertEqual(joined, "John Smith, Jane Doe, and Bob Wilson")

    def test_et_al(self):
        """et al. should be preserved with first author as full name"""
        abbreviated_list = "John Smith et al."
        self.assertEqual(format_authors_ieee(abbreviated_list), abbreviated_list)

    def test_empty_authors(self):
        """Empty authors should return default"""
        fallback = format_authors_ieee("")
        self.assertEqual(fallback, "Research Team")

    def test_institutional_authors(self):
        """Institutional names should NOT be formatted to initials"""
        # Each institutional label must come back untouched.
        for label in ("Research Team", "IEEE Authors", "ArXiv Contributors"):
            self.assertEqual(format_authors_ieee(label), label)


class TestIEEECitation(unittest.TestCase):
    """End-to-end checks of the complete citation string in the custom style"""

    # Method docstrings are left verbatim: unittest prints them in verbose mode.

    def setUp(self):
        """Sample source data for tests"""
        metadata = {
            'authors': 'John Smith, Jane Doe',
            'title': 'A Study on Testing',
            'venue': 'IEEE Transactions on Testing',
            'year': '2024'
        }
        self.sample_source = {
            'url': 'https://example.com/paper',
            'metadata': metadata
        }

    def test_basic_formatting(self):
        """Test basic custom IEEE citation format"""
        citation = format_citation_ieee_fixed(self.sample_source, 1)

        self.assertTrue(citation.startswith('[1]'))
        # Full names, comma inside the title quotes, venue followed directly
        # by a comma (i.e. no closing </i> tag in between).
        expected_fragments = (
            'John Smith and Jane Doe',
            '"A Study on Testing,"',
            'IEEE Transactions on Testing,',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, citation)
        # The HTML "[Online]. Available:" suffix must be gone.
        self.assertNotIn('[Online]. Available:', citation)

    def test_comma_placement(self):
        """Custom IEEE requires comma INSIDE closing quotation mark for title"""
        citation = format_citation_ieee_fixed(self.sample_source, 1)
        comma_inside, comma_outside = 'Testing,"', 'Testing",'
        self.assertIn(comma_inside, citation)
        self.assertNotIn(comma_outside, citation)

    def test_unknown_author_handling(self):
        """Unknown authors should get venue-based attribution (not formatted) and full names preserved"""
        metadata = {
            'authors': 'unknown',
            'title': 'Test Paper',
            'venue': 'IEEE',
            'year': '2024'
        }
        citation = format_citation_ieee_fixed(
            {'url': 'https://example.com', 'metadata': metadata}, 1
        )
        self.assertIn('IEEE Authors', citation)
        self.assertNotIn('unknown', citation.lower())

    def test_missing_metadata(self):
        """Missing metadata should use defaults (institutional names preserved) and full names"""
        bare_source = {'url': 'https://example.com', 'metadata': {}}
        citation = format_citation_ieee_fixed(bare_source, 1)

        self.assertTrue(citation.startswith('[1]'))
        self.assertIn('Research Team', citation)

    def test_index_numbering(self):
        """Test different index numbers"""
        for number in (1, 5, 10, 99):
            citation = format_citation_ieee_fixed(self.sample_source, number)
            self.assertTrue(citation.startswith(f'[{number}]'))


class TestCompareOriginalVsFixed(unittest.TestCase):
    """Compare original vs adapted IEEE formatting"""

    def test_show_differences(self):
        """Print comparison of original vs adapted output"""
        source = {
            'url': 'https://arxiv.org/abs/2301.00001',
            'metadata': {
                'authors': 'Albert Einstein, Richard Feynman, Stephen Hawking',
                'title': 'A Grand Unified Theory of Everything',
                'venue': 'Physical Review Letters',
                'year': '2024'
            }
        }

        original = format_citation_ieee_original(source, 1)  # baseline formatter (HTML output)
        adapted = format_citation_ieee_fixed(source, 1)  # adapted custom-style formatter

        print("\n" + "="*80)
        print("COMPARISON: Original IEEE vs Adapted Custom IEEE Citation")
        print("="*80)
        print(f"\nORIGINAL (Standard IEEE):\n{original}")
        print(f"\nADAPTED (User's Custom Style):\n{adapted}")
        print("\nKEY DIFFERENCES (Adapted vs Standard IEEE):")
        print("1. Authors are full names (e.g., 'Albert Einstein', 'Richard Feynman', 'Stephen Hawking')")
        # One print per item: the previous single triple-quoted literal leaked
        # stray quote characters and source indentation into the output.
        # NOTE(review): the baseline formatter also places the comma inside the
        # title quotes, so item 2 may not be a real difference - confirm wording.
        print("2. Comma placement: Adapted has comma INSIDE title quotes ('Title,')")
        print("3. Venue not italicized in Adapted style (Original italicizes venue)")
        print("4. Adapted style does NOT include '[Online]. Available: URL' in the main citation string")
        print("="*80)

        # Actual assertions for adapted output
        self.assertIn('Albert Einstein, Richard Feynman, and Stephen Hawking', adapted)
        self.assertIn('"A Grand Unified Theory of Everything," Physical Review Letters, 2024.', adapted)
        self.assertNotIn('<i>Physical Review Letters</i>', adapted)
        self.assertNotIn('[Online]. Available:', adapted)


class TestRealWorldExamples(unittest.TestCase):
    """Citation checks using realistic arXiv- and IEEE-style source records"""

    # Method docstrings are left verbatim: unittest prints them in verbose mode.

    def test_arxiv_source(self):
        """Test arXiv-style source with institutional attribution (full names)"""
        metadata = {
            'authors': 'ArXiv Contributors',
            'title': 'Attention Is All You Need',
            'venue': 'arXiv preprint',
            'year': '2023'
        }
        record = {'url': 'https://arxiv.org/abs/2301.00001', 'metadata': metadata}

        citation = format_citation_ieee_fixed(record, 1)

        self.assertIn('[1]', citation)
        # "ArXiv Contributors" is institutional and must appear verbatim.
        self.assertIn('ArXiv Contributors', citation)
        self.assertIn('"Attention Is All You Need," arXiv preprint, 2023.', citation)
        self.assertNotIn('[Online]. Available:', citation)

    def test_ieee_source(self):
        """Test IEEE.org source (full names)"""
        metadata = {
            'authors': 'Wei Zhang, Li Chen',
            'title': 'Deep Learning for Signal Processing',
            'venue': 'IEEE Signal Processing Magazine',
            'year': '2024'
        }
        record = {
            'url': 'https://ieeexplore.ieee.org/document/123456',
            'metadata': metadata
        }

        citation = format_citation_ieee_fixed(record, 3)

        self.assertIn('[3]', citation)
        # Two human authors are joined with "and", full names kept.
        self.assertIn('Wei Zhang and Li Chen', citation)
        self.assertIn(
            '"Deep Learning for Signal Processing," IEEE Signal Processing Magazine, 2024.',
            citation,
        )
        self.assertNotIn('[Online]. Available:', citation)


# ================================================================================
# MAIN RUNNER
# ================================================================================

if __name__ == '__main__':
    # Banner and usage notes, emitted line by line via sep="\n".
    print(
        "="*80,
        "IEEE Citation Formatting Tests - ZERO API COST (ADAPTED TO USER'S CUSTOM STYLE)",
        "="*80,
        "\nThese tests validate citation formatting without any API calls.",
        "Run the full pipeline only after these tests pass.\n",
        sep="\n",
    )

    # Explicit argv so unittest does not parse the host process's own
    # arguments; exit=False keeps the interpreter alive after the run.
    unittest.main(argv=['first-arg-is-ignored'], exit=False, verbosity=2)


test_already_formatted (__main__.TestAuthorFormatting.test_already_formatted)
Already formatted names should pass through as-is ... ok
test_empty_name (__main__.TestAuthorFormatting.test_empty_name)
Empty string should return empty ... ok
test_single_name (__main__.TestAuthorFormatting.test_single_name)
Single word names should pass through unchanged ... ok
test_three_part_name (__main__.TestAuthorFormatting.test_three_part_name)
First Middle Last -> First Middle Last ... ok
test_two_part_name (__main__.TestAuthorFormatting.test_two_part_name)
First Last -> First Last ... ok
test_show_differences (__main__.TestCompareOriginalVsFixed.test_show_differences)
Print comparison of original vs adapted output ... ok
test_basic_formatting (__main__.TestIEEECitation.test_basic_formatting)
Test basic custom IEEE citation format ... ok
test_comma_placement (__main__.TestIEEECitation.test_comma_placement)
Custom IEEE requires comma INSIDE closing quotation mark for title ... ok
test_index_numbering

IEEE Citation Formatting Tests - ZERO API COST (ADAPTED TO USER'S CUSTOM STYLE)

These tests validate citation formatting without any API calls.
Run the full pipeline only after these tests pass.


COMPARISON: Original IEEE vs Adapted Custom IEEE Citation

ORIGINAL (Standard IEEE):
[1] Albert Einstein, Richard Feynman, Stephen Hawking, "A Grand Unified Theory of Everything," <i>Physical Review Letters</i>, 2024. [Online]. Available: <a href="https://arxiv.org/abs/2301.00001" target="_blank">https://arxiv.org/abs/2301.00001</a>

ADAPTED (User's Custom Style):
[1] Albert Einstein, Richard Feynman, and Stephen Hawking, "A Grand Unified Theory of Everything," Physical Review Letters, 2024. 
Link: https://arxiv.org/abs/2301.00001

KEY DIFFERENCES (Adapted vs Standard IEEE):
1. Authors are full names (e.g., 'Albert Einstein', 'Richard Feynman', 'Stephen Hawking')
2. Comma placement: Adapted has comma INSIDE title quotes ('Title,')"
              "3. Venue not italicized in Adapted style (Ori

In [9]:
# Demo record: two authors already joined with "and".
sample_source_data = dict(
    url='https://www.example.com/sample_article',
    metadata=dict(
        authors='Jane Doe and John Smith',
        title='An Introduction to Citation Formatting',
        venue='Journal of Academic Writing',
        year='2023',
    ),
)

# format_citation_ieee_fixed must already be defined in the current environment
formatted_citation = format_citation_ieee_fixed(sample_source_data, 1)
print(formatted_citation)


[1] Jane Doe and John Smith, "An Introduction to Citation Formatting," Journal of Academic Writing, 2023. 
Link: https://www.example.com/sample_article


In [7]:
# Demo record: three authors using a mix of comma and "and" separators.
new_sample_source_data = dict(
    url='https://www.nature.com/articles/s41586-024-07123-x',
    metadata=dict(
        authors='Alice Wonderland, Bob The Builder and Charlie Chaplin',
        title='The Secret Life of AI Models',
        venue='Nature',
        year='2024',
    ),
)

# Render the record as reference number 2 in the adapted citation format
new_formatted_citation = format_citation_ieee_fixed(new_sample_source_data, 2)
print(new_formatted_citation)


[2] Alice Wonderland, Bob The Builder, and Charlie Chaplin, "The Secret Life of AI Models," Nature, 2024. 
Link: https://www.nature.com/articles/s41586-024-07123-x


In [8]:
# Demo record: a single-author PLOS ONE article.
new_sample_source_data = dict(
    url='https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0298861',
    metadata=dict(
        authors='Shengnan Wu',
        title='Application of multimedia technology to innovative vocational education on learning satisfaction in China',
        venue='PLOS ONE',
        year='2024',
    ),
)

# Render the record as reference number 2 in the adapted citation format
new_formatted_citation = format_citation_ieee_fixed(new_sample_source_data, 2)
print(new_formatted_citation)


[2] Shengnan Wu, "Application of multimedia technology to innovative vocational education on learning satisfaction in China," PLOS ONE, 2024. 
Link: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0298861
