# 📖 Unpaywall Paper Download System

**Unpaywall** is a legal, ethical, and reliable way to download open access papers!

## ✅ **Why Unpaywall is Better:**
- **100% Legal**: Only provides legitimately open access papers
- **Reliable API**: Stable and well-documented
- **High Quality**: Links to the best available open access copy, hosted either by the publisher or by a repository
- **Comprehensive**: Covers millions of open access papers
- **Respectful**: Follows publisher guidelines

## 🎯 **What This Notebook Does:**
1. Uses the cleaned dataset with DOIs
2. Checks Unpaywall API for open access availability
3. Downloads only legally available papers
4. Saves each PDF as `<year>_<title>.pdf` in a single download folder
5. Tracks download status and provides statistics

In [10]:
# Import required libraries
import pandas as pd
import requests
import time
import os
import re
from pathlib import Path
import json
from urllib.parse import urlparse

print("📦 Libraries imported successfully!")
print("🌐 Ready to use Unpaywall API for legal paper downloads!")

📦 Libraries imported successfully!
🌐 Ready to use Unpaywall API for legal paper downloads!


In [11]:
# Load the cleaned dataset
print("📊 Loading cleaned dataset...")
data = pd.read_csv("All details_cleaned.csv")

print(f"✅ Dataset loaded successfully!")
print(f"📚 Total papers: {len(data)}")

# Rows that carry any DOI value at all
has_doi = data['doi'].notna()
print(f"🔍 Papers with DOI: {has_doi.sum()}")

# Keep only rows whose DOI starts with the "10." prefix
# (na=False makes missing values count as "no match")
looks_like_doi = data['doi'].str.startswith('10.', na=False)
papers_with_doi = data.loc[has_doi & looks_like_doi].copy()

print(f"📋 Papers with valid DOIs: {len(papers_with_doi)}")

# Tracking columns, added in this order with their initial values
tracking_defaults = {
    'is_open_access': False,
    'oa_pdf_url': "",
    'downloaded': False,
    'download_filename': "",
    'download_status': "Not checked",
    'oa_host_type': "",
}
for column, default in tracking_defaults.items():
    papers_with_doi[column] = default

print(f"🎯 Ready to check {len(papers_with_doi)} papers for open access availability!")

📊 Loading cleaned dataset...
✅ Dataset loaded successfully!
📚 Total papers: 357
🔍 Papers with DOI: 251
📋 Papers with valid DOIs: 250
🎯 Ready to check 250 papers for open access availability!


In [13]:
# Unpaywall API functions

def clean_filename(title, max_length=80):
    """Turn a paper title into a name that is safe to use as a filename.

    Missing titles (None/NaN), and titles that are empty once cleaned, become
    "Unknown_Title". Otherwise only word characters, hyphens and dots survive,
    whitespace runs become single underscores, and the result is cut to
    `max_length` characters.
    """
    if pd.isna(title):
        return "Unknown_Title"

    name = str(title)
    # First drop characters that are illegal in Windows filenames, then
    # anything that is not a word character, whitespace, "-" or ".".
    for pattern in (r'[<>:"/\\|?*]', r'[^\w\s\-.]'):
        name = re.sub(pattern, '', name)

    # Trim the ends and join the remaining words with underscores.
    name = re.sub(r'\s+', '_', name.strip())

    return name[:max_length] or "Unknown_Title"

def check_unpaywall(doi, email="researcher@university.edu"):
    """
    Check if paper is open access using Unpaywall API

    Parameters:
    - doi: DOI of the paper. It is stripped of surrounding whitespace and
      percent-encoded before being placed in the request path.
    - email: Contact email, sent as the `email` query parameter.

    Returns:
    - dict (never None, and no exception escapes this function).
      On HTTP 200 the dict has the keys 'is_oa', 'oa_date', 'journal_is_oa',
      'title', 'journal', 'year', 'pdf_url', 'host_type' and 'license'; the
      last three stay None unless the record is open access and has a
      'best_oa_location'.
      In every other case the dict is {'is_oa': False, 'error': <message>}.
    """
    # Function-scope import: the notebook's import cell only brings in urlparse.
    from urllib.parse import quote

    # A blank or NaN DOI cannot be looked up; answer without a network call.
    if pd.isna(doi) or not str(doi).strip():
        return {'is_oa': False, 'error': 'Missing DOI'}

    try:
        # Keep "/" (it separates DOI prefix and suffix) but encode characters
        # such as "#", "?" or "%" that would otherwise be read as URL syntax.
        encoded_doi = quote(str(doi).strip(), safe='/')
        url = f"https://api.unpaywall.org/v2/{encoded_doi}"

        headers = {
            'User-Agent': 'Python script for academic research'
        }

        # Passing the email through `params` lets requests encode it properly.
        response = requests.get(
            url, params={'email': email}, headers=headers, timeout=15
        )

        if response.status_code == 200:
            record = response.json()

            result = {
                'is_oa': record.get('is_oa', False),
                'oa_date': record.get('oa_date'),
                'journal_is_oa': record.get('journal_is_oa', False),
                'title': record.get('title', ''),
                'journal': record.get('journal_name', ''),
                'year': record.get('year'),
                'pdf_url': None,
                'host_type': None,
                'license': None
            }

            # Get best open access location
            oa_location = record.get('best_oa_location')
            if record.get('is_oa') and oa_location:
                result['pdf_url'] = oa_location.get('url_for_pdf')
                result['host_type'] = oa_location.get('host_type', '')
                result['license'] = oa_location.get('license', '')

            return result

        elif response.status_code == 404:
            # DOI not found in Unpaywall
            return {'is_oa': False, 'error': 'DOI not found'}
        else:
            return {'is_oa': False, 'error': f'API error: {response.status_code}'}

    except requests.exceptions.Timeout:
        return {'is_oa': False, 'error': 'Timeout'}
    except Exception as e:
        # Deliberate catch-all: callers rely on always getting a dict back.
        return {'is_oa': False, 'error': f'Error: {str(e)[:50]}'}

def download_oa_paper(pdf_url, title, year, doi, download_folder="unpaywall_papers"):
    """Download open access paper from URL

    Parameters:
    - pdf_url: URL to fetch; a blank/None/NaN value is rejected up front
    - title: paper title, cleaned with clean_filename() for the file name
    - year: publication year; anything that cannot be read as a number
      (None, NaN, "Unknown", ...) yields the prefix "unknown_year"
    - doi: accepted for signature compatibility; not used by this function
    - download_folder: target directory, created if missing

    Returns:
    - (success, filename, status): `filename` is the saved file's base name
      ("" on failure) and `status` is a short human-readable message.
      Exceptions are caught and reported through `status`.
    """
    # Nothing to fetch: report it without touching the disk or the network.
    if pd.isna(pdf_url) or not str(pdf_url).strip():
        return False, "", "No PDF URL provided"

    os.makedirs(download_folder, exist_ok=True)

    try:
        print(f"      📥 Downloading from: {urlparse(pdf_url).netloc}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(pdf_url, headers=headers, timeout=30)

        if response.status_code != 200:
            return False, "", f"HTTP {response.status_code}"

        content = response.content

        # Verify it's actually a PDF
        content_type = response.headers.get('content-type', '').lower()
        if not (content.startswith(b'%PDF') or 'pdf' in content_type):
            return False, "", "URL did not return a valid PDF"

        # Create filename. A year that is not numeric must not discard an
        # otherwise successful download, so fall back to "unknown_year".
        clean_title = clean_filename(title, max_length=60)
        try:
            year_str = f"{int(float(year))}" if pd.notna(year) else "unknown_year"
        except (TypeError, ValueError):
            year_str = "unknown_year"

        # Handle duplicates by appending _1, _2, ... before the extension
        base_path = os.path.join(download_folder, f"{year_str}_{clean_title}")
        filepath = f"{base_path}.pdf"
        counter = 1
        while os.path.exists(filepath):
            filepath = f"{base_path}_{counter}.pdf"
            counter += 1

        # Save file
        with open(filepath, 'wb') as f:
            f.write(content)

        file_size = len(content) / 1024  # KB
        print(f"      ✅ Downloaded: {os.path.basename(filepath)} ({file_size:.0f} KB)")
        return True, os.path.basename(filepath), f"Downloaded from {urlparse(pdf_url).netloc}"

    except Exception as e:
        # Deliberate catch-all: the bulk loop expects a status tuple, not a raise.
        return False, "", f"Download error: {str(e)[:50]}"

print("🔧 Unpaywall functions ready!")
print("💡 Tip: Unpaywall respects publisher copyrights and only provides legal open access papers")

🔧 Unpaywall functions ready!
💡 Tip: Unpaywall respects publisher copyrights and only provides legal open access papers


In [14]:
# Test Unpaywall with a few papers
print("🧪 Testing Unpaywall API with sample papers...")
print("="*80)

sample_papers = papers_with_doi.head(5)
# Use the real sample size so the counter stays right for small datasets
sample_count = len(sample_papers)

for i, (_, row) in enumerate(sample_papers.iterrows(), 1):
    doi = row['doi']
    # Titles can be NaN in the CSV; slicing a float would raise
    raw_title = row.get('title')
    title = 'Unknown Title' if pd.isna(raw_title) else str(raw_title)

    print(f"\n[{i}/{sample_count}] Testing: {title[:50]}...")
    print(f"DOI: {doi}")

    # Check Unpaywall
    oa_info = check_unpaywall(doi)

    if oa_info.get('is_oa'):
        # 'pdf_url' and 'host_type' exist but may be None for OA records
        # without a direct PDF, so a .get() default would not kick in.
        host_type = oa_info.get('host_type') or 'Unknown'
        pdf_url = oa_info.get('pdf_url') or 'No URL'

        print(f"   ✅ Open Access Available!")
        print(f"   📍 Host: {host_type}")
        print(f"   📄 PDF URL: {pdf_url[:50]}...")
        if oa_info.get('license'):
            print(f"   📜 License: {oa_info['license']}")
    else:
        error_msg = oa_info.get('error', 'Not open access')
        print(f"   ❌ Not available: {error_msg}")

    time.sleep(1)  # Be respectful to API

print(f"\n🎯 Test completed! Ready for bulk processing.")

🧪 Testing Unpaywall API with sample papers...

[1/5] Testing: 1G laboratory-scale shaking table tests on reducti...
DOI: 10.1007/978-981-15-2184-3_82
   ❌ Not available: Not open access
   ❌ Not available: Not open access

[2/5] Testing: A Case Study on Buckling Stability of Piles in Liq...
DOI: 10.1007/978-3-030-34252-4_8

[2/5] Testing: A Case Study on Buckling Stability of Piles in Liq...
DOI: 10.1007/978-3-030-34252-4_8
   ❌ Not available: Not open access
   ❌ Not available: Not open access

[3/5] Testing: A case study on seismic response analysis of groun...
DOI: 10.6310/jog.201812_13(4).5

[3/5] Testing: A case study on seismic response analysis of groun...
DOI: 10.6310/jog.201812_13(4).5
   ❌ Not available: DOI not found
   ❌ Not available: DOI not found

[4/5] Testing: A catastrophic flowslide that overrides a liquefie...
DOI: 10.1002/esp.5144

[4/5] Testing: A catastrophic flowslide that overrides a liquefie...
DOI: 10.1002/esp.5144
   ❌ Not available: Not open access
   ❌ Not

In [None]:
# 🔍 DEBUG: Investigate specific DOIs that should be open access
print("🔍 DEBUGGING: Let's investigate specific DOIs...")
print("="*80)

def debug_unpaywall(doi, email="blank"):
    """
    Query Unpaywall for a single DOI and print the key fields of the reply.

    Prints the request URL, the HTTP status, the record's basic metadata,
    every entry of 'oa_locations' and the 'best_oa_location'.

    Returns the decoded JSON record on HTTP 200. On any other status, or if
    an exception occurs, the problem is printed and None is returned.
    """
    try:
        url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
        print(f"🌐 API URL: {url}")

        response = requests.get(
            url,
            headers={'User-Agent': 'Python script for academic research'},
            timeout=15,
        )
        print(f"📊 Status Code: {response.status_code}")

        # Anything but 200 is reported with the start of the body and ends here
        if response.status_code != 200:
            print(f"❌ API Error: {response.status_code}")
            print(f"📄 Response: {response.text[:200]}...")
            return None

        record = response.json()

        print(f"✅ API Response received!")
        print(f"📋 Title: {record.get('title', 'No title')}")
        print(f"📅 Year: {record.get('year', 'Unknown')}")
        print(f"📚 Journal: {record.get('journal_name', 'Unknown')}")
        print(f"🔓 Is Open Access: {record.get('is_oa', False)}")
        print(f"📰 Journal is OA: {record.get('journal_is_oa', False)}")
        print(f"📅 OA Date: {record.get('oa_date', 'Not specified')}")

        # Every OA location the record lists, numbered from 1
        locations = record.get('oa_locations', [])
        print(f"📍 Total OA Locations Found: {len(locations)}")

        for number, entry in enumerate(locations, 1):
            print(f"   Location {number}:")
            print(f"      🌐 Host: {entry.get('host_type', 'Unknown')}")
            print(f"      🔗 Landing Page: {entry.get('url', 'No URL')}")
            print(f"      📄 PDF URL: {entry.get('url_for_pdf', 'No PDF URL')}")
            print(f"      📜 License: {entry.get('license', 'No license')}")
            print(f"      ✅ Is Best: {entry.get('is_best', False)}")
            print()

        # The location Unpaywall itself ranks as best
        best = record.get('best_oa_location')
        if not best:
            print(f"❌ No best OA location found")
        else:
            print(f"🎯 BEST OA LOCATION:")
            print(f"   🌐 Host: {best.get('host_type', 'Unknown')}")
            print(f"   🔗 Landing Page: {best.get('url', 'No URL')}")
            print(f"   📄 PDF URL: {best.get('url_for_pdf', 'No PDF URL')}")
            print(f"   📜 License: {best.get('license', 'No license')}")

        return record

    except Exception as e:
        print(f"❌ Exception: {str(e)}")
        return None

# Test with a specific DOI (you can change this)
for intro_line in (
    "🧪 Let's test with a specific DOI that you know should be open access",
    "💡 Replace the DOI below with one you found on the Unpaywall website:",
    "",
):
    print(intro_line)

# Example DOI - replace this with one you found on Unpaywall website
test_doi = "10.1007/978-981-15-2184-3_82"  # Replace with your DOI
print(f"🔍 Testing DOI: {test_doi}")
print("-" * 60)

# Keep the raw API record around for further inspection in later cells
debug_result = debug_unpaywall(test_doi)

# Blank separator line, then the troubleshooting checklist
for tip_line in (
    "",
    "💡 TROUBLESHOOTING TIPS:",
    "1. Copy a DOI from a paper you found on unpaywall.org",
    "2. Replace the 'test_doi' variable above with that DOI",
    "3. Run this cell again to see the full API response",
    "4. Check if the API shows 'is_oa': True but no PDF URL",
    "5. Some papers may be OA but PDF not directly accessible via API",
):
    print(tip_line)

🔍 DEBUGGING: Let's investigate specific DOIs...
🧪 Let's test with a specific DOI that you know should be open access
💡 Replace the DOI below with one you found on the Unpaywall website:

🔍 Testing DOI: 10.1007/978-981-15-2184-3_82
------------------------------------------------------------
🌐 API URL: https://api.unpaywall.org/v2/10.1007/978-981-15-2184-3_82?email=2102091@che.buet.ac.bd
📊 Status Code: 200
✅ API Response received!
📋 Title: 1G laboratory-scale shaking table tests on reduction of liquefaction damage in sand using short gravel compaction piles
📅 Year: 2019
📚 Journal: Lecture notes in civil engineering
🔓 Is Open Access: False
📰 Journal is OA: False
📅 OA Date: Not specified
📍 Total OA Locations Found: 0
❌ No best OA location found

💡 TROUBLESHOOTING TIPS:
1. Copy a DOI from a paper you found on unpaywall.org
2. Replace the 'test_doi' variable above with that DOI
3. Run this cell again to see the full API response
4. Check if the API shows 'is_oa': True but no PDF URL
5. Some

In [16]:
# 🔧 IMPROVED UNPAYWALL FUNCTIONS - Better detection and handling

def check_unpaywall_improved(doi, email="researcher@university.edu"):
    """
    Improved version that handles more edge cases and provides better debugging

    Parameters:
    - doi: DOI of the paper. It is stripped of surrounding whitespace and
      percent-encoded before being placed in the request path.
    - email: Contact email, sent as the `email` query parameter.

    Returns:
    - dict (never None, and no exception escapes this function).
      On HTTP 200 the dict has the keys 'is_oa', 'oa_date', 'journal_is_oa',
      'title', 'journal', 'year', 'pdf_url', 'host_type', 'license',
      'all_urls' and 'debug_info'.
      * 'all_urls' lists every PDF / landing-page URL found in 'oa_locations'
        as {'type': 'pdf'|'landing', 'url', 'host', 'license'} dicts.
      * 'pdf_url' comes from 'best_oa_location' (its PDF URL, else its landing
        page); without a best location it falls back to the first direct PDF
        in 'all_urls', else the first entry. It may therefore be a landing
        page rather than a PDF.
      * 'is_oa' is True when the API says so or when any URL was collected.
      In every other case the dict is {'is_oa': False, 'error': <message>}.
    """
    # Function-scope import: the notebook's import cell only brings in urlparse.
    from urllib.parse import quote

    # A blank or NaN DOI cannot be looked up; answer without a network call.
    if pd.isna(doi) or not str(doi).strip():
        return {'is_oa': False, 'error': 'Missing DOI'}

    try:
        # Keep "/" (it separates DOI prefix and suffix) but encode characters
        # such as "#", "?" or "%" that would otherwise be read as URL syntax.
        encoded_doi = quote(str(doi).strip(), safe='/')
        url = f"https://api.unpaywall.org/v2/{encoded_doi}"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        # Passing the email through `params` lets requests encode it properly.
        response = requests.get(
            url, params={'email': email}, headers=headers, timeout=15
        )

        if response.status_code == 200:
            record = response.json()

            # Tolerate a missing or null 'oa_locations' field
            oa_locations = record.get('oa_locations') or []
            best_location = record.get('best_oa_location')

            result = {
                'is_oa': record.get('is_oa', False),
                'oa_date': record.get('oa_date'),
                'journal_is_oa': record.get('journal_is_oa', False),
                'title': record.get('title', ''),
                'journal': record.get('journal_name', ''),
                'year': record.get('year'),
                'pdf_url': None,
                'host_type': None,
                'license': None,
                'all_urls': [],  # Store all available URLs
                'debug_info': {
                    'total_oa_locations': len(oa_locations),
                    'has_best_location': bool(best_location),
                    'is_oa_flag': record.get('is_oa', False)
                }
            }

            # Check all OA locations, not just the best one
            for location in oa_locations:
                url_for_pdf = location.get('url_for_pdf')
                url_landing = location.get('url')
                host = location.get('host_type', '')
                license_name = location.get('license', '')

                # Collect all potential URLs
                if url_for_pdf:
                    result['all_urls'].append({
                        'type': 'pdf',
                        'url': url_for_pdf,
                        'host': host,
                        'license': license_name
                    })

                # Skip the landing page when it is the same as the PDF URL
                if url_landing and url_landing != url_for_pdf:
                    result['all_urls'].append({
                        'type': 'landing',
                        'url': url_landing,
                        'host': host,
                        'license': license_name
                    })

            # Use best location if available
            if best_location:
                result['pdf_url'] = best_location.get('url_for_pdf') or best_location.get('url')
                result['host_type'] = best_location.get('host_type', '')
                result['license'] = best_location.get('license', '')

            # Fallback: prefer the first direct PDF, else the first URL of any kind
            elif result['all_urls']:
                best_url = next(
                    (u for u in result['all_urls'] if u['type'] == 'pdf'),
                    result['all_urls'][0],
                )
                result['pdf_url'] = best_url['url']
                result['host_type'] = best_url['host']
                result['license'] = best_url['license']

            # Mark as OA if we found any URLs, even if API says not OA
            if result['all_urls'] or record.get('is_oa'):
                result['is_oa'] = True

            return result

        elif response.status_code == 404:
            return {'is_oa': False, 'error': 'DOI not found in Unpaywall database'}
        else:
            return {'is_oa': False, 'error': f'API error: {response.status_code}'}

    except requests.exceptions.Timeout:
        return {'is_oa': False, 'error': 'Request timeout'}
    except Exception as e:
        # Deliberate catch-all: callers rely on always getting a dict back.
        return {'is_oa': False, 'error': f'Error: {str(e)[:50]}'}

# Browser-like headers sent with every smart download request.
_SMART_DOWNLOAD_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/pdf,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Regexes that pull candidate PDF links out of a landing page, in priority order.
_SMART_DOWNLOAD_LINK_PATTERNS = [
    r'href="([^"]*\.pdf[^"]*)"',
    r'href="([^"]*download[^"]*)"',
    r'<meta[^>]*content="([^"]*\.pdf[^"]*)"',
]

# How many landing-page hops may be followed, and how many links per page are tried.
_SMART_DOWNLOAD_MAX_DEPTH = 2
_SMART_DOWNLOAD_MAX_LINKS = 3


def _smart_download_attempt(session, url, title, year, download_folder, depth, visited):
    """Fetch `url` once: save it if it is a PDF, else follow PDF-looking links in its HTML."""
    from html import unescape
    from urllib.parse import urljoin

    visited.add(url)
    print(f"      📥 Attempting download from: {urlparse(url).netloc}")

    # Follow redirects and handle different response types
    response = session.get(url, headers=_SMART_DOWNLOAD_HEADERS, timeout=30, allow_redirects=True)
    if response.status_code != 200:
        return False, "", f"HTTP {response.status_code}"

    content = response.content
    content_type = response.headers.get('content-type', '').lower()

    print(f"      📋 Content-Type: {content_type}")
    print(f"      📊 Content size: {len(content)} bytes")

    # Trust the PDF magic bytes, or a PDF content type that is not also HTML.
    # A URL that merely ends in ".pdf" proves nothing: it can be a login page.
    is_pdf = content.startswith(b'%PDF') or ('pdf' in content_type and 'html' not in content_type)

    if is_pdf and len(content) > 1000:  # Reasonable PDF size
        clean_title = clean_filename(title, max_length=60)
        # A non-numeric year must not throw away a successful download
        try:
            year_str = f"{int(float(year))}" if pd.notna(year) else "unknown_year"
        except (TypeError, ValueError):
            year_str = "unknown_year"

        # Handle duplicates by appending _1, _2, ... before the extension
        base_path = os.path.join(download_folder, f"{year_str}_{clean_title}")
        filepath = f"{base_path}.pdf"
        counter = 1
        while os.path.exists(filepath):
            filepath = f"{base_path}_{counter}.pdf"
            counter += 1

        with open(filepath, 'wb') as f:
            f.write(content)

        file_size = len(content) / 1024  # KB
        print(f"      ✅ Downloaded: {os.path.basename(filepath)} ({file_size:.0f} KB)")
        return True, os.path.basename(filepath), f"Downloaded from {urlparse(url).netloc}"

    if 'html' not in content_type:
        return False, "", f"Content is not a valid PDF (type: {content_type})"

    # Landing page: look for links that may lead to the actual PDF.
    no_link_result = (False, "", "HTML page found but no direct PDF link")
    if depth >= _SMART_DOWNLOAD_MAX_DEPTH:
        return no_link_result

    page = content.decode('utf-8', errors='ignore')
    candidates = []
    for pattern in _SMART_DOWNLOAD_LINK_PATTERNS:
        for match in re.findall(pattern, page, re.IGNORECASE):
            # hrefs are HTML-escaped (&amp;) and may be relative to the final,
            # post-redirect URL of the page.
            link = urljoin(response.url, unescape(match))
            if urlparse(link).scheme not in ('http', 'https'):
                continue
            if link not in visited and link not in candidates:
                candidates.append(link)

    last_failure = no_link_result
    for link in candidates[:_SMART_DOWNLOAD_MAX_LINKS]:
        if link in visited:  # may have been fetched by a deeper attempt meanwhile
            continue
        print(f"      🔗 Found PDF link in HTML: {link}")
        try:
            outcome = _smart_download_attempt(
                session, link, title, year, download_folder, depth + 1, visited
            )
        except requests.exceptions.RequestException as e:
            # One dead link should not stop the remaining candidates
            outcome = (False, "", f"Download error: {str(e)[:50]}")
        if outcome[0]:
            return outcome
        last_failure = outcome

    return last_failure


def smart_download_paper(pdf_url, title, year, doi, download_folder="unpaywall_papers"):
    """
    Improved download function that handles redirects and different content types

    If the URL returns a PDF it is saved as '<year>_<clean title>.pdf' (with a
    numeric suffix when that name is taken). If it returns an HTML landing
    page, PDF-looking links are extracted and tried in turn. Link following
    is bounded in depth and per-page fan-out and never revisits a URL, so a
    page that links to itself cannot cause runaway recursion.

    Parameters:
    - pdf_url: URL to fetch; a blank/None/NaN value is rejected up front
    - title: paper title, cleaned with clean_filename() for the file name
    - year: publication year; non-numeric or missing values give "unknown_year"
    - doi: accepted for signature compatibility; not used by this function
    - download_folder: target directory, created if missing

    Returns:
    - (success, filename, status): `filename` is the saved file's base name
      ("" on failure) and `status` is a short human-readable message.
      Exceptions are caught and reported through `status`.
    """
    # Nothing to fetch: report it without touching the disk or the network.
    if pd.isna(pdf_url) or not str(pdf_url).strip():
        return False, "", "No PDF URL provided"

    os.makedirs(download_folder, exist_ok=True)

    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            return _smart_download_attempt(
                session, str(pdf_url).strip(), title, year, download_folder,
                depth=0, visited=set(),
            )
    except Exception as e:
        # Deliberate catch-all: callers expect a status tuple, not a raise.
        return False, "", f"Download error: {str(e)[:50]}"

print("🚀 Improved Unpaywall functions loaded!")
print("💡 These functions handle more edge cases and provide better debugging info")

🚀 Improved Unpaywall functions loaded!
💡 These functions handle more edge cases and provide better debugging info


In [17]:
# 🧪 TEST IMPROVED FUNCTIONS vs ORIGINAL

print("🧪 TESTING: Improved vs Original Unpaywall Functions")
print("="*80)

# Test with the same sample papers
print("📋 Testing with sample papers from your dataset...")

sample_papers = papers_with_doi.head(3)  # Test with first 3 papers
# Use the real sample size so the counter stays right for small datasets
sample_count = len(sample_papers)

for i, (_, row) in enumerate(sample_papers.iterrows(), 1):
    doi = row['doi']
    # Titles can be NaN in the CSV; slicing a float would raise
    raw_title = row.get('title')
    title = 'Unknown Title' if pd.isna(raw_title) else str(raw_title)

    print(f"\n[{i}/{sample_count}] Testing: {title[:50]}...")
    print(f"DOI: {doi}")
    print("-" * 40)

    # Test original function
    print("🔹 ORIGINAL FUNCTION:")
    original_result = check_unpaywall(doi)
    print(f"   Is OA: {original_result.get('is_oa', False)}")
    print(f"   PDF URL: {original_result.get('pdf_url', 'None')}")
    print(f"   Error: {original_result.get('error', 'None')}")

    # Test improved function
    print("🔸 IMPROVED FUNCTION:")
    improved_result = check_unpaywall_improved(doi)
    print(f"   Is OA: {improved_result.get('is_oa', False)}")
    print(f"   PDF URL: {improved_result.get('pdf_url', 'None')}")
    print(f"   Total URLs found: {len(improved_result.get('all_urls', []))}")
    print(f"   Debug info: {improved_result.get('debug_info', {})}")

    # Show all URLs if found
    all_urls = improved_result.get('all_urls', [])
    if all_urls:
        print("   📎 All available URLs:")
        for j, url_info in enumerate(all_urls, 1):
            print(f"      {j}. {url_info['type'].upper()}: {url_info['url'][:60]}...")
            print(f"         Host: {url_info['host']}")

    time.sleep(1)  # Be respectful to API

print(f"\n💡 MANUAL TEST INSTRUCTIONS:")
print("1. Go to https://unpaywall.org/")
print("2. Search for a DOI from your dataset")
print("3. If you find an open access version, copy that DOI")
print("4. Replace the test_doi in the debug cell above")
print("5. Run the debug cell to see the full API response")
print()
print("🎯 COMMON ISSUES AND SOLUTIONS:")
print("• API shows 'is_oa': false but Unpaywall website shows OA")
print("  → The improved function tries to find alternative URLs")
print("• PDF URL leads to a landing page instead of direct PDF")
print("  → The smart download function attempts to extract the real PDF link")
print("• Some papers are behind publisher login walls")
print("  → These appear OA but require institutional access")

🧪 TESTING: Improved vs Original Unpaywall Functions
📋 Testing with sample papers from your dataset...

[1/3] Testing: 1G laboratory-scale shaking table tests on reducti...
DOI: 10.1007/978-981-15-2184-3_82
----------------------------------------
🔹 ORIGINAL FUNCTION:
   Is OA: False
   PDF URL: None
   Error: None
🔸 IMPROVED FUNCTION:
   Is OA: False
   PDF URL: None
   Error: None
🔸 IMPROVED FUNCTION:
   Is OA: False
   PDF URL: None
   Total URLs found: 0
   Debug info: {'total_oa_locations': 0, 'has_best_location': False, 'is_oa_flag': False}
   Is OA: False
   PDF URL: None
   Total URLs found: 0
   Debug info: {'total_oa_locations': 0, 'has_best_location': False, 'is_oa_flag': False}

[2/3] Testing: A Case Study on Buckling Stability of Piles in Liq...
DOI: 10.1007/978-3-030-34252-4_8
----------------------------------------
🔹 ORIGINAL FUNCTION:

[2/3] Testing: A Case Study on Buckling Stability of Piles in Liq...
DOI: 10.1007/978-3-030-34252-4_8
----------------------------------

In [8]:
# 🎯 QUICK TEST with Known Open Access Papers

print("🎯 TESTING with known Open Access DOIs...")
print("="*80)

# Positive controls: DOIs from fully open access venues
known_oa_dois = [
    "10.1371/journal.pone.0200837",  # PLOS ONE paper (definitely open access)
    "10.3390/su12208691",           # MDPI Sustainability (open access)
    "10.1038/s41598-020-65994-5"    # Nature Scientific Reports (open access)
]
total_known = len(known_oa_dois)

for position, test_doi in enumerate(known_oa_dois, start=1):
    print(f"\n[{position}/{total_known}] Testing known OA DOI: {test_doi}")
    print("-" * 50)

    # Look the DOI up with the improved checker
    result = check_unpaywall_improved(test_doi)
    found_urls = result.get('all_urls', [])

    print(f"✅ Is Open Access: {result.get('is_oa', False)}")
    print(f"📄 PDF URL: {result.get('pdf_url', 'Not found')}")
    print(f"🌐 Host Type: {result.get('host_type', 'Unknown')}")
    print(f"📊 Total URLs found: {len(found_urls)}")

    # List each collected URL, truncated to 70 characters
    if found_urls:
        print("📎 Available URLs:")
        for entry in found_urls:
            print(f"   • {entry['type'].upper()}: {entry['host']} - {entry['url'][:70]}...")

    # Error results carry an 'error' message instead of URLs
    if result.get('error'):
        print(f"❌ Error: {result['error']}")

    time.sleep(1)

print(f"\n💡 EXPLANATION OF COMMON ISSUES:")
print("="*80)

# Static explanatory text; an empty string prints a blank line
for explanation_line in (
    "🔍 Why papers might show on Unpaywall website but not via API:",
    "",
    "1. 📅 TIMING DIFFERENCES:",
    "   • Website data might be newer than API data",
    "   • Database updates can have delays",
    "",
    "2. 🔍 SEARCH vs DIRECT ACCESS:",
    "   • Website search uses fuzzy matching",
    "   • API requires exact DOI match",
    "",
    "3. 🚪 INSTITUTIONAL ACCESS:",
    "   • Some papers appear 'open' but need university login",
    "   • Publisher paywall might allow limited access",
    "",
    "4. 🔗 LINK TYPES:",
    "   • Some URLs go to landing pages, not direct PDFs",
    "   • Repository links might need additional navigation",
    "",
    "📋 NEXT STEPS:",
    "1. Copy a specific DOI from Unpaywall website",
    "2. Test it with the debug function above",
    "3. Check if it's truly open access or requires institutional access",
    "4. Use the improved functions for better detection",
):
    print(explanation_line)

🎯 TESTING with known Open Access DOIs...

[1/3] Testing known OA DOI: 10.1371/journal.pone.0200837
--------------------------------------------------
✅ Is Open Access: True
📄 PDF URL: https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0200837&type=printable
🌐 Host Type: publisher
📊 Total URLs found: 3
📎 Available URLs:
   • PDF: publisher - https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone...
   • LANDING: repository - https://figshare.com/articles/dataset/Case_for_omitting_tied_observati...
   • LANDING: repository - https://www.ncbi.nlm.nih.gov/pmc/articles/6057651...

[2/3] Testing known OA DOI: 10.3390/su12208691
--------------------------------------------------
✅ Is Open Access: True
📄 PDF URL: https://www.mdpi.com/2071-1050/12/20/8691/pdf?version=1603185813
🌐 Host Type: publisher
📊 Total URLs found: 2
📎 Available URLs:
   • PDF: publisher - https://www.mdpi.com/2071-1050/12/20/8691/pdf?version=1603185813...
   • LANDING: repository - http

In [18]:
# 🚀 BULK PROCESSING: Check all papers and download open access ones
print("🚀 Starting bulk Unpaywall processing...")
print("="*80)

# Configuration
DOWNLOAD_FOLDER = "unpaywall_papers"
EMAIL = "researcher@university.edu"  # Change this to your email
BATCH_SIZE = 50  # Process papers in batches
DELAY_BETWEEN_REQUESTS = 1  # Seconds between API calls (be respectful)

# Create download folder
if not os.path.exists(DOWNLOAD_FOLDER):
    os.makedirs(DOWNLOAD_FOLDER)
    print(f"📁 Created download folder: {DOWNLOAD_FOLDER}")

# Statistics tracking
total_papers = len(papers_with_doi)
processed = 0
open_access_found = 0
successfully_downloaded = 0
errors = 0

print(f"📊 Processing {total_papers} papers...")
print(f"⏱️  Estimated time: {(total_papers * DELAY_BETWEEN_REQUESTS / 60):.1f} minutes")
print()

# Process papers in batches
for batch_start in range(0, total_papers, BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, total_papers)
    batch_papers = papers_with_doi.iloc[batch_start:batch_end]
    
    print(f"📦 Processing batch {batch_start//BATCH_SIZE + 1}/{(total_papers-1)//BATCH_SIZE + 1} (papers {batch_start+1}-{batch_end})")
    
    # Process every paper in the current batch: query Unpaywall, download the
    # PDF when an open-access copy with a PDF URL is reported, and record the
    # outcome in papers_with_doi.
    for _, row in batch_papers.iterrows():
        processed += 1
        doi = row['doi']
        title = row.get('title', 'Unknown Title')
        year = row.get('year', 'Unknown')

        # A missing title is returned by pandas as NaN (a float). Slicing it
        # for the log line below would raise TypeError, which the broad
        # handler would record as a processing error and the download would
        # be skipped. Fall back to the placeholder already used when the
        # column itself is absent.
        if not isinstance(title, str):
            title = 'Unknown Title'

        # Every status update for this paper targets the same rows, so build
        # the DOI mask once instead of rescanning the column per assignment.
        # The 'doi' column is never modified inside this loop.
        doi_rows = papers_with_doi['doi'] == doi

        # Progress indicator
        if processed % 25 == 0 or processed == total_papers:
            print(f"   📈 Progress: {processed}/{total_papers} ({processed/total_papers*100:.1f}%)")

        try:
            # Check Unpaywall
            oa_info = check_unpaywall(doi, EMAIL)

            if oa_info and oa_info.get('is_oa') and oa_info.get('pdf_url'):
                open_access_found += 1
                papers_with_doi.loc[doi_rows, 'is_open_access'] = True
                papers_with_doi.loc[doi_rows, 'oa_pdf_url'] = oa_info['pdf_url']
                papers_with_doi.loc[doi_rows, 'oa_host_type'] = oa_info.get('host_type', '')
                papers_with_doi.loc[doi_rows, 'download_status'] = "Open access found"

                print(f"   ✅ Found OA paper: {title[:40]}...")
                print(f"      🌐 Host: {oa_info.get('host_type', 'Unknown')}")

                # Try to download
                success, filename, status = download_oa_paper(
                    oa_info['pdf_url'], title, year, doi, DOWNLOAD_FOLDER
                )

                if success:
                    successfully_downloaded += 1
                    papers_with_doi.loc[doi_rows, 'downloaded'] = True
                    papers_with_doi.loc[doi_rows, 'download_filename'] = filename
                    papers_with_doi.loc[doi_rows, 'download_status'] = f"Downloaded successfully - {status}"
                else:
                    papers_with_doi.loc[doi_rows, 'download_status'] = f"Download failed - {status}"

            else:
                # Not open access or error: keep the reason reported by the
                # lookup when there is one; an empty/None result is logged as
                # an API error.
                error_msg = oa_info.get('error', 'Not open access') if oa_info else 'API error'
                papers_with_doi.loc[doi_rows, 'download_status'] = error_msg

        except Exception as e:
            # Best-effort bulk run: one failing paper must not abort the
            # batch, so record a truncated message and continue.
            errors += 1
            papers_with_doi.loc[doi_rows, 'download_status'] = f"Error: {str(e)[:50]}"
            print(f"   ❌ Error processing {doi}: {str(e)[:50]}")

        # Be respectful to the API
        time.sleep(DELAY_BETWEEN_REQUESTS)
    
    # Save progress after each batch
    papers_with_doi.to_csv("unpaywall_download_status.csv", index=False)
    print(f"   💾 Progress saved to unpaywall_download_status.csv")

# Final report for the bulk run: the overall counters first, then one entry
# per successfully downloaded paper.
print()
print("🎉 BULK PROCESSING COMPLETED!")
print("="*80)
print("📊 FINAL STATISTICS:")
print(f"   📚 Total papers processed: {processed}")
print(f"   🔓 Open access papers found: {open_access_found}")
print(f"   📥 Successfully downloaded: {successfully_downloaded}")
print(f"   ❌ Errors encountered: {errors}")
if open_access_found > 0:
    # Share of open-access papers whose PDF was actually retrieved.
    print(f"   📈 Success rate: {(successfully_downloaded/open_access_found*100):.1f}%")
else:
    # No open-access papers at all: the ratio is undefined.
    print("   📈 Success rate: N/A")
print()
print(f"📁 Downloaded papers are in: {DOWNLOAD_FOLDER}/")
print("📊 Full results saved to: unpaywall_download_status.csv")

# List every successfully downloaded paper
if successfully_downloaded > 0:
    # The explicit comparison keeps the filter safe even if the column holds
    # missing values.
    successful_papers = papers_with_doi[papers_with_doi['downloaded'] == True]
    print("\n🎯 SUCCESSFULLY DOWNLOADED PAPERS:")
    for _, paper in successful_papers.iterrows():
        # A missing title is NaN (a float) and cannot be sliced; fall back to
        # the same placeholder used when the column is absent.
        paper_title = paper.get('title', 'Unknown')
        if not isinstance(paper_title, str):
            paper_title = 'Unknown'
        print(f"   📄 {paper['download_filename']}")
        print(f"      📚 Title: {paper_title[:60]}...")
        print(f"      🌐 Host: {paper['oa_host_type']}")
        print()

🚀 Starting bulk Unpaywall processing...
📁 Created download folder: unpaywall_papers
📊 Processing 250 papers...
⏱️  Estimated time: 4.2 minutes

📦 Processing batch 1/5 (papers 1-50)
   📈 Progress: 25/250 (10.0%)
   📈 Progress: 25/250 (10.0%)
   ✅ Found OA paper: Behavior of group piles under combined l...
      🌐 Host: publisher
      📥 Downloading from: www.degruyter.com
   ✅ Found OA paper: Behavior of group piles under combined l...
      🌐 Host: publisher
      📥 Downloading from: www.degruyter.com
      ✅ Downloaded: 2022_Behavior_of_group_piles_under_combined_loadings_after_improv.pdf (5187 KB)
      ✅ Downloaded: 2022_Behavior_of_group_piles_under_combined_loadings_after_improv.pdf (5187 KB)
   ✅ Found OA paper: Behavior of pile foundations in laterall...
      🌐 Host: repository
      📥 Downloading from: escholarship.org
   ✅ Found OA paper: Behavior of pile foundations in laterall...
      🌐 Host: repository
      📥 Downloading from: escholarship.org
      ✅ Downloaded: 2005_Be

In [19]:
# 📊 RESULTS ANALYSIS AND SUMMARY
#
# Summarises the Unpaywall run from the papers_with_doi tracking table:
# overall counts, status breakdown, host types, year distribution, success
# metrics, and a short sample of the downloaded papers.

# Load results if needed: reuse the in-memory table when this session already
# has it, otherwise reload the saved status CSV.
if 'papers_with_doi' not in locals():
    papers_with_doi = pd.read_csv("unpaywall_download_status.csv")

print("📊 UNPAYWALL DOWNLOAD RESULTS ANALYSIS")
print("="*80)

# Basic statistics
total_papers = len(papers_with_doi)
# Rows are initialised with the status "Not checked", so a non-null status
# alone does not mean the paper was looked up; exclude the placeholder.
status_column = papers_with_doi['download_status']
papers_checked = (status_column.notna() & (status_column != "Not checked")).sum()
oa_papers = papers_with_doi['is_open_access'].sum()
downloaded_papers = papers_with_doi['downloaded'].sum()

print("📚 Dataset Overview:")
print(f"   • Total papers with DOI: {total_papers}")
print(f"   • Papers checked via Unpaywall: {papers_checked}")
print(f"   • Open access papers found: {oa_papers}")
print(f"   • Successfully downloaded: {downloaded_papers}")
print()

# Download status breakdown (ten most frequent statuses)
print("📈 Download Status Breakdown:")
status_counts = papers_with_doi['download_status'].value_counts()
for status, count in status_counts.head(10).items():
    percentage = (count / total_papers) * 100
    print(f"   • {status}: {count} ({percentage:.1f}%)")
print()

# Open access host types
if oa_papers > 0:
    print("🌐 Open Access Host Types:")
    oa_papers_subset = papers_with_doi[papers_with_doi['is_open_access'] == True]
    host_counts = oa_papers_subset['oa_host_type'].value_counts()
    for host, count in host_counts.items():
        if host:  # Only show non-empty hosts
            print(f"   • {host}: {count} papers")
    print()

# Year distribution of downloaded papers (first ten years in ascending order)
if downloaded_papers > 0:
    print("📅 Year Distribution of Downloaded Papers:")
    downloaded_subset = papers_with_doi[papers_with_doi['downloaded'] == True]
    if 'year' in downloaded_subset.columns:
        year_counts = downloaded_subset['year'].value_counts().sort_index()
        for year, count in year_counts.head(10).items():
            if pd.notna(year):
                print(f"   • {int(year)}: {count} papers")
    print()

# Success rate analysis
if oa_papers > 0:
    success_rate = (downloaded_papers / oa_papers) * 100
    print("🎯 Success Metrics:")
    print(f"   • Open access availability: {(oa_papers/total_papers)*100:.1f}%")
    print(f"   • Download success rate: {success_rate:.1f}%")
    print(f"   • Overall download rate: {(downloaded_papers/total_papers)*100:.1f}%")
else:
    print("🎯 Success Metrics:")
    print("   • Open access availability: 0.0%")
    print("   • No open access papers found for download")

print()
print("📁 Files Generated:")
print("   • unpaywall_download_status.csv - Complete results with download status")
print("   • unpaywall_papers/ - Folder containing downloaded PDFs")

print()
print("💡 Next Steps:")
print("   1. Check the 'unpaywall_papers' folder for your downloaded PDFs")
print("   2. Review 'unpaywall_download_status.csv' for detailed results")
print("   3. For papers not available via Unpaywall, consider:")
print("      - Checking your institutional access")
print("      - Contacting authors for preprints")
print("      - Using other legal databases (PubMed Central, arXiv, etc.)")

print()
print("✅ Unpaywall processing complete! All downloads are 100% legal and ethical.")

# Display sample of downloaded papers
if downloaded_papers > 0:
    print("\n📄 Sample of Downloaded Papers:")
    sample_papers = papers_with_doi[papers_with_doi['downloaded'] == True].head(5)
    for i, (_, paper) in enumerate(sample_papers.iterrows(), 1):
        title = paper.get('title', 'Unknown Title')
        filename = paper.get('download_filename', 'Unknown')
        host = paper.get('oa_host_type', 'Unknown')
        # Empty cells come back from read_csv as NaN (a float): slicing a NaN
        # title would raise TypeError, so use the declared fallbacks instead.
        if not isinstance(title, str):
            title = 'Unknown Title'
        if pd.isna(filename):
            filename = 'Unknown'
        if pd.isna(host):
            host = 'Unknown'
        print(f"   {i}. {filename}")
        print(f"      Title: {title[:70]}...")
        print(f"      Source: {host}")
        print()

📊 UNPAYWALL DOWNLOAD RESULTS ANALYSIS
📚 Dataset Overview:
   • Total papers with DOI: 250
   • Papers checked via Unpaywall: 250
   • Open access papers found: 49
   • Successfully downloaded: 37

📈 Download Status Breakdown:
   • Not open access: 192 (76.8%)
   • Downloaded successfully - Downloaded from www.jstage.jst.go.jp: 13 (5.2%)
   • DOI not found: 9 (3.6%)
   • Download failed - URL did not return a valid PDF: 7 (2.8%)
   • Downloaded successfully - Downloaded from www.mdpi.com: 5 (2.0%)
   • Download failed - HTTP 403: 4 (1.6%)
   • Downloaded successfully - Downloaded from www.e3s-conferences.org: 2 (0.8%)
   • Downloaded successfully - Downloaded from www.matec-conferences.org: 2 (0.8%)
   • Downloaded successfully - Downloaded from www.degruyter.com: 2 (0.8%)
   • Downloaded successfully - Downloaded from scientiairanica.sharif.edu: 2 (0.8%)

🌐 Open Access Host Types:
   • publisher: 46 papers
   • repository: 3 papers

📅 Year Distribution of Downloaded Papers:
   • 2000: 