# 📚 Download Papers Published Before 2022

This notebook:
1. Loads and cleans the dataset
2. Filters papers published before 2022
3. Uses Sci-Hub method to download all filtered papers
4. Tracks download status and saves results

In [2]:
# Import required libraries
import pandas as pd
import requests
import time
import os
import re
from pathlib import Path

print("📦 Libraries imported successfully!")

📦 Libraries imported successfully!


In [3]:
# Load the cleaned dataset
print("📊 Loading dataset...")
data = pd.read_csv("All details_cleaned.csv")

print(f"Dataset loaded successfully!")
print(f"Total papers: {len(data)}")
print(f"Columns: {list(data.columns)}")

# Show basic info
print("\n📈 Basic dataset info:")
print(f"Papers with DOI: {data['doi'].notna().sum()}")
print(f"Papers with year info: {data['year'].notna().sum()}")
print(f"Year range: {data['year'].min()} - {data['year'].max()}")

📊 Loading dataset...
Dataset loaded successfully!
Total papers: 357
Columns: ['key', 'title', 'year', 'month', 'day', 'journal', 'issn', 'volume', 'issue', 'pages', 'authors', 'url', 'language', 'publisher', 'location', 'abstract', 'notes', 'doi', 'keywords', 'pubmed_id', 'pmc_id', 'PDF files']

📈 Basic dataset info:
Papers with DOI: 251
Papers with year info: 356
Year range: 2000.0 - 2025.0


In [4]:
# Step 1: Filter papers published before 2022
print("🔍 Filtering papers published before 2022...")

# Convert year to numeric and handle any non-numeric values
data['year'] = pd.to_numeric(data['year'], errors='coerce')

# Filter papers before 2022
papers_before_2022 = data[data['year'] < 2022].copy()
print(f"Papers published before 2022: {len(papers_before_2022)}")

# Further filter for papers with valid DOIs
papers_with_doi = papers_before_2022[
    papers_before_2022['doi'].notna() & 
    papers_before_2022['doi'].str.startswith('10.', na=False)
].copy()

print(f"Papers before 2022 with valid DOIs: {len(papers_with_doi)}")

# Show year distribution
year_counts = papers_with_doi['year'].value_counts().sort_index()
print("\n📅 Papers by year:")
for year, count in year_counts.head(10).items():
    print(f"  {int(year)}: {count} papers")
if len(year_counts) > 10:
    print(f"  ... and {len(year_counts) - 10} more years")

# Add download tracking columns
papers_with_doi['downloaded'] = False
papers_with_doi['download_filename'] = ""
papers_with_doi['download_status'] = "Not attempted"

print(f"\n✅ Dataset ready for download: {len(papers_with_doi)} papers")

🔍 Filtering papers published before 2022...
Papers published before 2022: 286
Papers before 2022 with valid DOIs: 182

📅 Papers by year:
  2000: 4 papers
  2003: 2 papers
  2004: 3 papers
  2005: 5 papers
  2006: 6 papers
  2007: 3 papers
  2008: 9 papers
  2009: 9 papers
  2010: 6 papers
  2011: 9 papers
  ... and 10 more years

✅ Dataset ready for download: 182 papers


In [5]:
# Helper functions for downloading

def clean_filename(title, max_length=80):
    """Clean paper title to create a valid filename"""
    if pd.isna(title):
        return "Unknown_Title"
    
    # Remove invalid characters for Windows filenames
    title = str(title)
    title = re.sub(r'[<>:"/\\|?*]', '', title)
    title = re.sub(r'[^\w\s\-.]', '', title)
    title = re.sub(r'\s+', '_', title.strip())
    
    # Limit length
    if len(title) > max_length:
        title = title[:max_length]
    
    return title if title else "Unknown_Title"

def download_from_scihub(doi, max_retries=2):
    """Download paper from Sci-Hub with advanced browser check handling"""
    scihub_urls = [
        "https://sci-hub.se/",
        "https://sci-hub.st/", 
        "https://sci-hub.ru/",
        "https://sci-hub.wf/",
        "https://sci-hub.ren/"
    ]
    
    # Enhanced headers to mimic a real browser more closely
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    }
    
    for base_url in scihub_urls:
        session = requests.Session()
        session.headers.update(headers)
        
        for attempt in range(max_retries):
            try:
                url = f"{base_url}{doi}"
                print(f"      Trying: {base_url} (attempt {attempt + 1})")
                
                # First request with longer timeout
                response = session.get(url, timeout=45, allow_redirects=True)
                content_type = response.headers.get('content-type', '').lower()
                content_length = len(response.content)
                
                print(f"      📊 Response: {response.status_code}, Content-Type: {content_type}, Size: {content_length}")
                
                # Check if we got a PDF directly
                is_pdf = (
                    'pdf' in content_type or 
                    response.content.startswith(b'%PDF') or
                    (content_length > 100000 and 'pdf' in response.url.lower())
                )
                
                if is_pdf:
                    print(f"      ✅ Found PDF directly ({content_length} bytes)")
                    return response.content
                
                # If we got HTML, analyze it more carefully
                elif 'text/html' in content_type and response.status_code == 200:
                    html_content = response.text.lower()
                    
                    # Check for browser check indicators
                    browser_check_indicators = [
                        'checking your browser',
                        'please wait',
                        'cloudflare',
                        'ddos protection',
                        'security check',
                        'loading...'
                    ]
                    
                    is_browser_check = any(indicator in html_content for indicator in browser_check_indicators)
                    
                    if is_browser_check and content_length < 5000:
                        print(f"      🔄 Detected browser check page, waiting longer...")
                        time.sleep(10)  # Wait longer for browser check
                        
                        # Try the same URL again
                        response2 = session.get(url, timeout=45, allow_redirects=True)
                        content_type2 = response2.headers.get('content-type', '').lower()
                        content_length2 = len(response2.content)
                        
                        print(f"      🔄 After wait: {response2.status_code}, Content-Type: {content_type2}, Size: {content_length2}")
                        
                        # Check if second attempt got PDF
                        is_pdf2 = (
                            'pdf' in content_type2 or 
                            response2.content.startswith(b'%PDF') or
                            (content_length2 > 100000)
                        )
                        
                        if is_pdf2:
                            print(f"      ✅ Found PDF after wait ({content_length2} bytes)")
                            return response2.content
                        
                        html_content = response2.text
                    
                    # Look for PDF download links with more comprehensive patterns
                    print(f"      🔍 Searching for PDF links in HTML...")
                    pdf_patterns = [
                        r'href=["\']([^"\']*\.pdf[^"\']*)["\']',
                        r'src=["\']([^"\']*\.pdf[^"\']*)["\']',
                        r'window\.location\s*=\s*["\']([^"\']*\.pdf[^"\']*)["\']',
                        r'location\.href\s*=\s*["\']([^"\']*\.pdf[^"\']*)["\']',
                        r'download["\']?\s*:\s*["\']([^"\']*\.pdf[^"\']*)["\']',
                        r'url\s*:\s*["\']([^"\']*\.pdf[^"\']*)["\']',
                        r'(https?://[^\\s<>"\']*\.pdf[^\\s<>"\']*)'
                    ]
                    
                    pdf_urls = set()
                    for pattern in pdf_patterns:
                        matches = re.findall(pattern, html_content, re.IGNORECASE | re.MULTILINE)
                        for match in matches:
                            if isinstance(match, tuple):
                                match = match[0]
                            if '.pdf' in match.lower():
                                if match.startswith('http'):
                                    pdf_urls.add(match)
                                elif match.startswith('/'):
                                    pdf_urls.add(f"{base_url.rstrip('/')}{match}")
                                else:
                                    pdf_urls.add(f"{base_url.rstrip('/')}/{match}")
                    
                    print(f"      🔗 Found {len(pdf_urls)} potential PDF URLs")
                    
                    # Try each PDF URL
                    for pdf_url in list(pdf_urls)[:3]:  # Limit to first 3 URLs
                        try:
                            print(f"      📥 Trying PDF URL: {pdf_url[:60]}...")
                            pdf_response = session.get(pdf_url, timeout=30)
                            
                            if pdf_response.status_code == 200:
                                pdf_content_type = pdf_response.headers.get('content-type', '').lower()
                                pdf_size = len(pdf_response.content)
                                
                                if ('pdf' in pdf_content_type or 
                                    pdf_response.content.startswith(b'%PDF') or
                                    pdf_size > 50000):
                                    print(f"      ✅ Downloaded PDF from URL ({pdf_size} bytes)")
                                    return pdf_response.content
                                else:
                                    print(f"      ❌ Not a PDF: {pdf_content_type}, {pdf_size} bytes")
                        except Exception as e:
                            print(f"      ❌ PDF URL failed: {str(e)[:50]}")
                            continue
                    
                    print(f"      ❌ No valid PDF found in HTML")
                else:
                    print(f"      ❌ Unexpected response: {response.status_code}")
                    
            except Exception as e:
                print(f"      ❌ Error: {str(e)[:50]}")
                if attempt < max_retries - 1:
                    print(f"      ⏱️  Waiting before retry...")
                    time.sleep(8)
                continue
        
        # Add delay between different Sci-Hub URLs
        time.sleep(3)
    
    return None

def download_paper_scihub(doi, title, year, download_folder="papers_before_2022"):
    """Download a single paper using Sci-Hub only"""
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    
    # Check if paper already exists in folder
    clean_title = clean_filename(title, max_length=60)
    year_str = f"{int(year)}" if pd.notna(year) else "unknown_year"
    base_filename = f"{year_str}_{clean_title}.pdf"
    
    # Check for existing files (including duplicates with _1, _2, etc.)
    existing_files = []
    if os.path.exists(download_folder):
        for file in os.listdir(download_folder):
            if file.startswith(f"{year_str}_{clean_title}") and file.endswith('.pdf'):
                existing_files.append(file)
    
    if existing_files:
        # Check if any existing file is valid (size > 1KB)
        for existing_file in existing_files:
            filepath = os.path.join(download_folder, existing_file)
            if os.path.getsize(filepath) > 1000:  # More than 1KB
                print(f"   ⏭️  Already exists: {existing_file} ({os.path.getsize(filepath)} bytes)")
                return True, existing_file, "Already downloaded (skipped)"
    
    print(f"   📥 Downloading: {doi}")
    
    # Strategy 1: Try Selenium (best for browser checks)
    if SELENIUM_AVAILABLE:
        print(f"   🤖 Strategy 1: Using Selenium for browser check handling...")
        content = download_with_selenium_auto(doi)
        if content and len(content) > 1000:
            print(f"   ✅ Selenium succeeded!")
        else:
            content = None
    else:
        content = None
    
    # Strategy 2: Try direct PDF URLs (fast fallback)
    if not content:
        session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/pdf,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        session.headers.update(headers)
        
        print(f"   🎯 Strategy 2: Trying direct PDF URLs...")
        content = try_direct_pdf_urls(doi, session)
    
    # Strategy 3: Use the main Sci-Hub download function (last resort)
    if not content:
        print(f"   🌐 Strategy 3: Using requests-based method...")
        content = download_from_scihub(doi)
    
    # Save the file if we got content
    if content and len(content) > 1000:  # Minimum size check
        # Create filename from title and year
        clean_title = clean_filename(title, max_length=60)
        year_str = f"{int(year)}" if pd.notna(year) else "unknown_year"
        filename = f"{year_str}_{clean_title}.pdf"
        filepath = os.path.join(download_folder, filename)
        
        # Handle duplicate filenames
        counter = 1
        base_filepath = filepath
        while os.path.exists(filepath):
            name, ext = os.path.splitext(base_filepath)
            filepath = f"{name}_{counter}{ext}"
            counter += 1
        
        try:
            with open(filepath, 'wb') as f:
                f.write(content)
            
            print(f"   ✅ Downloaded: {os.path.basename(filepath)} ({len(content)} bytes)")
            return True, os.path.basename(filepath), "Downloaded from Sci-Hub"
        except Exception as e:
            print(f"   ❌ Error saving: {str(e)}")
            return False, "", f"Error saving: {str(e)}"
    
    print(f"   ❌ Download failed - all strategies exhausted")
    return False, "", "Download failed - Sci-Hub unavailable"

print("🔧 Helper functions defined successfully!")

🔧 Helper functions defined successfully!


In [6]:
def try_direct_pdf_urls(doi, session):
    """Try to construct direct PDF URLs based on common Sci-Hub patterns"""
    
    # Common direct PDF URL patterns for Sci-Hub
    direct_patterns = [
        f"https://twin.sci-hub.se/{doi}",
        #f"https://sci-hub.se/downloads/2022-01-01/{doi.replace('/', '_')}.pdf",
        f"https://sci-hub.st/downloads/{doi}",
        #f"https://sci-hub.ru/downloads/{doi}.pdf",
    ]
    
    # Also try with some common PDF hosting patterns
    encoded_doi = doi.replace('/', '%2F').replace(':', '%3A')
    additional_patterns = [
        f"https://sci-hub.ru/pdf/{encoded_doi}",
        f"https://sci-hub.st/pdf/{encoded_doi}.pdf",
        #f"https://twin.sci-hub.se/pdf/{doi}.pdf",
    ]
    
    all_patterns = direct_patterns + additional_patterns
    
    for pdf_url in all_patterns:
        try:
            print(f"      🎯 Trying direct URL: {pdf_url}")
            response = session.get(pdf_url, timeout=20)
            
            if response.status_code == 200:
                content_type = response.headers.get('content-type', '').lower()
                content_size = len(response.content)
                
                if ('pdf' in content_type or 
                    response.content.startswith(b'%PDF') or
                    content_size > 50000):
                    print(f"      ✅ Direct URL success! ({content_size} bytes)")
                    return response.content
                else:
                    print(f"      ❌ Not PDF: {content_type}, {content_size} bytes")
            else:
                print(f"      ❌ Status {response.status_code}")
                
        except Exception as e:
            print(f"      ❌ Direct URL failed: {str(e)[:30]}")
            continue
    
    return None

print("🔧 Additional helper function added!")

🔧 Additional helper function added!


In [7]:
# Install and import Selenium for browser automation
try:
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from selenium.common.exceptions import TimeoutException, WebDriverException
    SELENIUM_AVAILABLE = True
    print("✅ Selenium is available")
except ImportError:
    SELENIUM_AVAILABLE = False
    print("❌ Selenium not installed. Run: pip install selenium")
    print("   Also download ChromeDriver from: https://chromedriver.chromium.org/")

def setup_selenium_driver():
    """Setup Chrome driver with appropriate options"""
    if not SELENIUM_AVAILABLE:
        return None
    
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in background
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        
        # Try to use ChromeDriver (you may need to specify path)
        # chrome_options.add_argument("--chromedriver-path=/path/to/chromedriver")  # Uncomment and set path if needed
        
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(60)
        return driver
    except Exception as e:
        print(f"❌ Failed to setup Chrome driver: {str(e)}")
        print("💡 Make sure ChromeDriver is installed and in PATH")
        return None

def download_with_selenium(doi, max_wait=30):
    """Download paper using Selenium to handle browser checks"""
    if not SELENIUM_AVAILABLE:
        return None
    
    driver = setup_selenium_driver()
    if not driver:
        return None
    
    scihub_urls = [
        "https://sci-hub.se/",
        "https://sci-hub.st/", 
        "https://sci-hub.ru/",
        "https://sci-hub.wf/"
    ]
    
    for base_url in scihub_urls:
        try:
            url = f"{base_url}{doi}"
            print(f"      🌐 Selenium trying: {base_url}")
            
            driver.get(url)
            
            # Wait for page to load and check for PDF
            wait = WebDriverWait(driver, max_wait)
            
            # Strategy 1: Look for direct PDF in current page
            try:
                # Check if current page is PDF
                if "application/pdf" in driver.execute_script("return document.contentType") or driver.current_url.endswith('.pdf'):
                    print(f"      ✅ Found direct PDF page")
                    # Get PDF content
                    pdf_content = requests.get(driver.current_url).content
                    driver.quit()
                    return pdf_content
            except:
                pass
            
            # Strategy 2: Wait for and click download links
            pdf_link_selectors = [
                "a[href*='.pdf']",
                "a[onclick*='pdf']", 
                "#pdf-link",
                ".download-link",
                "a[href*='download']"
            ]
            
            for selector in pdf_link_selectors:
                try:
                    # Wait for element to be clickable
                    element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                    pdf_url = element.get_attribute('href')
                    
                    if pdf_url and '.pdf' in pdf_url.lower():
                        print(f"      📥 Found PDF link: {pdf_url[:50]}...")
                        
                        # Download the PDF
                        pdf_response = requests.get(pdf_url, timeout=30)
                        if pdf_response.status_code == 200 and len(pdf_response.content) > 1000:
                            print(f"      ✅ Selenium download success! ({len(pdf_response.content)} bytes)")
                            driver.quit()
                            return pdf_response.content
                            
                except TimeoutException:
                    continue
                except Exception as e:
                    print(f"      ❌ Selector {selector} failed: {str(e)[:30]}")
                    continue
            
            # Strategy 3: Wait for page to change/redirect after browser check
            time.sleep(10)  # Wait for potential redirects
            
            # Check if URL changed (redirect after browser check)
            if driver.current_url != url:
                print(f"      🔄 Page redirected to: {driver.current_url[:50]}...")
                if driver.current_url.endswith('.pdf'):
                    pdf_content = requests.get(driver.current_url).content
                    driver.quit()
                    return pdf_content
            
            print(f"      ❌ No PDF found with Selenium")
            
        except Exception as e:
            print(f"      ❌ Selenium error: {str(e)[:50]}")
            continue
    
    driver.quit()
    return None

print("🤖 Selenium helper functions added!")

✅ Selenium is available
🤖 Selenium helper functions added!


## 🤖 Selenium Setup (Recommended for Best Results)

**Why Selenium?** It handles browser checks perfectly by actually running JavaScript and waiting for page loads.

### Installation Steps:
1. **Install Selenium**: `pip install selenium`
2. **Download ChromeDriver**: 
   - Go to: https://chromedriver.chromium.org/
   - Download version matching your Chrome browser
   - Add to PATH or specify path in code
3. **Alternative**: Use `webdriver-manager` for automatic setup: `pip install webdriver-manager`

### If Selenium fails:
- System will automatically fall back to direct URL attempts
- Then to requests-based method as final fallback
- No manual intervention needed!

In [8]:
# Alternative: Auto-install ChromeDriver using webdriver-manager
try:
    from webdriver_manager.chrome import ChromeDriverManager
    WEBDRIVER_MANAGER_AVAILABLE = True
    print("✅ webdriver-manager is available (easier setup)")
except ImportError:
    WEBDRIVER_MANAGER_AVAILABLE = False
    print("💡 Optional: Install webdriver-manager for automatic ChromeDriver setup")
    print("   Command: pip install webdriver-manager")

def setup_selenium_driver_auto():
    """Setup Chrome driver with automatic driver management"""
    if not SELENIUM_AVAILABLE:
        return None
    
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        
        if WEBDRIVER_MANAGER_AVAILABLE:
            # Automatically download and setup ChromeDriver
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            print("🤖 Using automatic ChromeDriver setup")
        else:
            # Try system ChromeDriver
            driver = webdriver.Chrome(options=chrome_options)
            print("🤖 Using system ChromeDriver")
        
        driver.set_page_load_timeout(60)
        return driver
        
    except Exception as e:
        print(f"❌ Auto-setup failed: {str(e)}")
        print("💡 Try manual ChromeDriver installation")
        return None

# Update the download function to use auto-setup
def download_with_selenium_auto(doi, max_wait=30):
    """Download using Selenium with automatic driver setup"""
    if not SELENIUM_AVAILABLE:
        return None
    
    driver = setup_selenium_driver_auto()
    if not driver:
        # Fallback to manual setup
        driver = setup_selenium_driver()
        if not driver:
            return None
    
    scihub_urls = [
        "https://sci-hub.se/",
        "https://sci-hub.st/", 
        "https://sci-hub.ru/" ,
        
    ]
    
    for base_url in scihub_urls:
        try:
            url = f"{base_url}{doi}"
            print(f"      🌐 Selenium: {base_url}")
            
            driver.get(url)
            time.sleep(5)  # Wait for initial load
            
            # Check if we're redirected to PDF
            current_url = driver.current_url
            if current_url.endswith('.pdf') or 'pdf' in current_url.lower():
                print(f"      ✅ Direct PDF redirect")
                pdf_content = requests.get(current_url).content
                driver.quit()
                return pdf_content
            
            # Look for PDF download links
            pdf_selectors = [
                "a[href*='.pdf']",
                "a[href*='download']",
                ".pdf-download",
                "#pdf",
                "embed[src*='.pdf']"
            ]
            
            for selector in pdf_selectors:
                try:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                    for element in elements:
                        href = element.get_attribute('href') or element.get_attribute('src')
                        if href and '.pdf' in href:
                            print(f"      📥 Found PDF: {href[:40]}...")
                            pdf_response = requests.get(href, timeout=30)
                            if pdf_response.status_code == 200 and len(pdf_response.content) > 1000:
                                driver.quit()
                                return pdf_response.content
                except:
                    continue
            
            # Wait longer for potential redirects
            time.sleep(10)
            if driver.current_url != url and driver.current_url.endswith('.pdf'):
                pdf_content = requests.get(driver.current_url).content
                driver.quit()
                return pdf_content
                
        except Exception as e:
            print(f"      ❌ Selenium error: {str(e)[:40]}")
            continue
    
    driver.quit()
    return None

print("🚀 Enhanced Selenium setup ready!")

✅ webdriver-manager is available (easier setup)
🚀 Enhanced Selenium setup ready!


In [9]:
# Main bulk download function
def bulk_download_scihub(papers_df, download_folder="papers_before_2022", delay=6, max_papers=None, resume=True):
    """
    Bulk download papers using Sci-Hub only
    
    Parameters:
    - papers_df: DataFrame with papers to download
    - download_folder: Folder to save papers
    - delay: Delay between downloads (seconds)
    - max_papers: Maximum papers to download (None for all)
    - resume: If True, skip papers that already exist in the folder
    """
    
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    
    # Limit papers if specified
    if max_papers:
        papers_to_download = papers_df.head(max_papers).copy()
        print(f"⚠️  Limited to first {max_papers} papers for testing")
    else:
        papers_to_download = papers_df.copy()
    
    # Check for existing downloads if resume mode
    if resume:
        existing_count = 0
        existing_files = os.listdir(download_folder) if os.path.exists(download_folder) else []
        existing_valid_files = [f for f in existing_files if f.endswith('.pdf') and 
                               os.path.getsize(os.path.join(download_folder, f)) > 1000]
        existing_count = len(existing_valid_files)
        
        if existing_count > 0:
            print(f"📁 Found {existing_count} existing valid PDF files in {download_folder}")
            print(f"🔄 Resume mode: Will skip papers that already exist")
    
    print(f"🚀 Starting Sci-Hub bulk download...")
    print(f"📁 Download folder: {download_folder}")
    print(f"📚 Papers to download: {len(papers_to_download)}")
    print(f"⏱️  Delay between downloads: {delay} seconds")
    print("="*80)
    
    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0
    
    for index, (idx, row) in enumerate(papers_to_download.iterrows(), 1):
        doi = row['doi']
        title = row.get('title', f"Paper_{index}")
        year = row.get('year', 'Unknown')
        
        print(f"\n[{index}/{len(papers_to_download)}] {title[:50]}...")
        print(f"   DOI: {doi} | Year: {year}")
        
        # Attempt download
        success, filename, status = download_paper_scihub(doi, title, year, download_folder)
        
        # Update the dataset
        papers_df.loc[idx, 'downloaded'] = success
        papers_df.loc[idx, 'download_filename'] = filename
        papers_df.loc[idx, 'download_status'] = status
        
        if success:
            if "skipped" in status.lower() or "already" in status.lower():
                skipped_downloads += 1
            else:
                successful_downloads += 1
        else:
            failed_downloads += 1
        
        # Progress update
        total_processed = successful_downloads + failed_downloads + skipped_downloads
        print(f"   📊 Progress: {successful_downloads} ✅ | {skipped_downloads} ⏭️ | {failed_downloads} ❌")
        print(f"   📈 Overall: {total_processed}/{len(papers_to_download)} ({total_processed/len(papers_to_download)*100:.1f}%)")
        
        # Save progress every 10 papers
        if index % 10 == 0:
            temp_filename = f"{download_folder}_progress.csv"
            papers_df.to_csv(temp_filename, index=False)
            print(f"   💾 Progress saved to {temp_filename}")
        
        # Add delay between downloads (be respectful to servers)
        if index < len(papers_to_download):
            if not ("skipped" in status.lower() or "already" in status.lower()):
                print(f"   ⏱️  Waiting {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"   ⏱️  Skipped - no delay needed")
    
    print("\n" + "="*80)
    print(f"🎉 Bulk download completed!")
    print(f"✅ Successful downloads: {successful_downloads}")
    print(f"⏭️  Skipped (already exist): {skipped_downloads}")
    print(f"❌ Failed downloads: {failed_downloads}")
    total_attempted = successful_downloads + failed_downloads
    if total_attempted > 0:
        print(f"📊 Success rate (new downloads): {(successful_downloads/total_attempted*100):.1f}%")
    print(f"📊 Total completion rate: {((successful_downloads + skipped_downloads)/(successful_downloads + failed_downloads + skipped_downloads)*100):.1f}%")
    
    return papers_df

print("🔧 Bulk download function ready!")

🔧 Bulk download function ready!


In [10]:
# Test with a small batch first (now with resume capability)
print("🧪 Testing with first 182 papers...")
print("⚠️  Change max_papers to None to download all papers")
print("✅ Resume mode enabled - will skip already downloaded papers")

# Check existing downloads first
download_folder = "papers_before_2022"
if os.path.exists(download_folder):
    existing_files = [f for f in os.listdir(download_folder) if f.endswith('.pdf')]
    valid_files = []
    for f in existing_files:
        filepath = os.path.join(download_folder, f)
        if os.path.getsize(filepath) > 1000:  # Valid size
            valid_files.append(f)
    
    print(f"📁 Found {len(valid_files)} existing valid PDF files")
    if len(valid_files) > 0:
        print("   Sample existing files:")
        for f in valid_files[:3]:
            size_kb = os.path.getsize(os.path.join(download_folder, f)) / 1024
            print(f"     📄 {f} ({size_kb:.0f} KB)")
        if len(valid_files) > 3:
            print(f"     ... and {len(valid_files) - 3} more files")
else:
    print("📁 No existing download folder found")

print("\n" + "="*50)

# Run the download
papers_with_downloads = bulk_download_scihub(
    papers_with_doi,
    download_folder="papers_before_2022",
    delay=2,  # Increased delay for browser checks
    max_papers=182,  # Test with 182 papers first
    resume=True  # Enable resume mode
)

🧪 Testing with first 182 papers...
⚠️  Change max_papers to None to download all papers
✅ Resume mode enabled - will skip already downloaded papers
📁 Found 8 existing valid PDF files
   Sample existing files:
     📄 2008_A_numerical_study_into_lateral_cyclic_nonlinear_soil-pile_re.pdf (1715 KB)
     📄 2009_A_comparison_study_of_engineering_approaches_for_seismic_eva.pdf (759 KB)
     📄 2019_Applicability_of_the_generalized_scaling_law_to_a_pile-incli.pdf (8946 KB)
     ... and 5 more files

⚠️  Limited to first 182 papers for testing
📁 Found 8 existing valid PDF files in papers_before_2022
🔄 Resume mode: Will skip papers that already exist
🚀 Starting Sci-Hub bulk download...
📁 Download folder: papers_before_2022
📚 Papers to download: 182
⏱️  Delay between downloads: 2 seconds

[1/182] 1G laboratory-scale shaking table tests on reducti...
   DOI: 10.1007/978-981-15-2184-3_82 | Year: 2020.0
   ⏭️  Already exists: 2020_1G_laboratory-scale_shaking_table_tests_on_reduction_of_liqu.pdf (1967

In [11]:
# Save results and show summary
print("💾 Saving results...")

# Save the updated dataset
papers_with_downloads.to_csv("papers_before_2022_download_status.csv", index=False)
print("✅ Dataset saved as 'papers_before_2022_download_status.csv'")

# Show download statistics
downloaded_count = papers_with_downloads['downloaded'].sum()
total_papers = len(papers_with_downloads)

print(f"\n📊 Final Statistics:")
print(f"   📚 Total papers before 2022: {total_papers}")
print(f"   ✅ Successfully downloaded: {downloaded_count}")
print(f"   ❌ Failed downloads: {total_papers - downloaded_count}")
print(f"   📈 Success rate: {(downloaded_count/total_papers*100):.1f}%")

# Show downloaded papers
downloaded_papers = papers_with_downloads[papers_with_downloads['downloaded'] == True]
if len(downloaded_papers) > 0:
    print(f"\n✅ Successfully Downloaded Papers:")
    print("-" * 80)
    for i, (_, row) in enumerate(downloaded_papers.head(10).iterrows(), 1):
        title = row.get('title', 'Unknown Title')
        year = row.get('year', 'Unknown')
        filename = row['download_filename']
        print(f"{i}. [{int(year)}] {title[:50]}...")
        print(f"   File: {filename}")
        print()

# Show folder contents
download_folder = "papers_before_2022"
if os.path.exists(download_folder):
    files = os.listdir(download_folder)
    print(f"\n📁 Files in '{download_folder}' folder: {len(files)}")
    for file in files[:5]:
        file_size = os.path.getsize(os.path.join(download_folder, file)) / 1024  # KB
        print(f"   📄 {file} ({file_size:.0f} KB)")
    if len(files) > 5:
        print(f"   ... and {len(files) - 5} more files")

💾 Saving results...
✅ Dataset saved as 'papers_before_2022_download_status.csv'

📊 Final Statistics:
   📚 Total papers before 2022: 182
   ✅ Successfully downloaded: 125
   ❌ Failed downloads: 57
   📈 Success rate: 68.7%

✅ Successfully Downloaded Papers:
--------------------------------------------------------------------------------
1. [2020] 1G laboratory-scale shaking table tests on reducti...
   File: 2020_1G_laboratory-scale_shaking_table_tests_on_reduction_of_liqu.pdf

2. [2020] A Case Study on Buckling Stability of Piles in Liq...
   File: 2020_A_Case_Study_on_Buckling_Stability_of_Piles_in_Liquefiable_G.pdf

3. [2021] A catastrophic flowslide that overrides a liquefie...
   File: 2021_A_catastrophic_flowslide_that_overrides_a_liquefied_substrat.pdf

4. [2009] A comparison study of engineering approaches for s...
   File: 2009_A_comparison_study_of_engineering_approaches_for_seismic_eva.pdf

5. [2020] A large-scale field test on sand compaction piles ...
   File: 2020_A_large-s

In [12]:
# Configuration for downloading ALL papers
print("🔧 Configuration for Full Download:")
print("="*80)
print("To download ALL papers before 2022, uncomment and run the following:")
print()
print("# WARNING: This will take several hours!")
print("# papers_final = bulk_download_scihub(")
print("#     papers_with_doi,")
print("#     download_folder='papers_before_2022',")
print("#     delay=5,  # 5 seconds delay to be respectful")
print("#     max_papers=None,  # Download ALL papers")
print("#     resume=True  # Skip already downloaded papers")
print("# )")
print()
print(f"📊 Ready to download: {len(papers_with_doi)} papers before 2022")
print(f"⏱️  Estimated time (5s delay): {(len(papers_with_doi) * 5) / 60:.0f} minutes")
print()
print("✅ NEW FEATURES:")
print("   • Resume capability: Skips already downloaded papers")
print("   • Progress tracking: Saves progress every 10 papers")
print("   • File validation: Checks file size before skipping")
print("   • Better statistics: Shows downloaded/skipped/failed counts")
print()
print("⚠️  Important Notes:")
print("   1. Check legal implications of Sci-Hub in your jurisdiction")
print("   2. Downloads may fail due to server issues")
print("   3. Be respectful with delays to avoid overwhelming servers")
print("   4. Monitor progress and stop if needed")
print("   5. You can restart anytime - already downloaded papers will be skipped")

# Uncomment the following lines to download ALL papers:
"""
papers_final = bulk_download_scihub(
    papers_with_doi,
    download_folder="papers_before_2022",
    delay=5,  # 5 seconds delay
    max_papers=None,  # Download ALL papers
    resume=True  # Skip already downloaded papers
)
"""

🔧 Configuration for Full Download:
To download ALL papers before 2022, uncomment and run the following:

# papers_final = bulk_download_scihub(
#     papers_with_doi,
#     download_folder='papers_before_2022',
#     delay=5,  # 5 seconds delay to be respectful
#     max_papers=None,  # Download ALL papers
#     resume=True  # Skip already downloaded papers
# )

📊 Ready to download: 182 papers before 2022
⏱️  Estimated time (5s delay): 15 minutes

✅ NEW FEATURES:
   • Resume capability: Skips already downloaded papers
   • Progress tracking: Saves progress every 10 papers
   • File validation: Checks file size before skipping
   • Better statistics: Shows downloaded/skipped/failed counts

⚠️  Important Notes:
   1. Check legal implications of Sci-Hub in your jurisdiction
   2. Downloads may fail due to server issues
   3. Be respectful with delays to avoid overwhelming servers
   4. Monitor progress and stop if needed
   5. You can restart anytime - already downloaded papers will be sk

'\npapers_final = bulk_download_scihub(\n    papers_with_doi,\n    download_folder="papers_before_2022",\n    delay=5,  # 5 seconds delay\n    max_papers=None,  # Download ALL papers\n    resume=True  # Skip already downloaded papers\n)\n'

In [13]:
# 📁 DOWNLOAD MANAGEMENT & RESUME UTILITIES

def check_existing_downloads(download_folder="papers_before_2022"):
    """Check and report on existing downloads"""
    if not os.path.exists(download_folder):
        print(f"📁 Folder {download_folder} does not exist")
        return
    
    all_files = os.listdir(download_folder)
    pdf_files = [f for f in all_files if f.endswith('.pdf')]
    
    if not pdf_files:
        print(f"📁 No PDF files found in {download_folder}")
        return
    
    print(f"📊 EXISTING DOWNLOADS REPORT")
    print("="*60)
    
    valid_files = []
    invalid_files = []
    total_size = 0
    
    for file in pdf_files:
        filepath = os.path.join(download_folder, file)
        file_size = os.path.getsize(filepath)
        total_size += file_size
        
        if file_size > 1000:  # Consider valid if > 1KB
            valid_files.append((file, file_size))
        else:
            invalid_files.append((file, file_size))
    
    print(f"📄 Total PDF files: {len(pdf_files)}")
    print(f"✅ Valid files (>1KB): {len(valid_files)}")
    print(f"❌ Invalid files (≤1KB): {len(invalid_files)}")
    print(f"💾 Total size: {total_size / (1024*1024):.1f} MB")
    
    if valid_files:
        print(f"\n✅ Valid Downloads (showing first 10):")
        for i, (file, size) in enumerate(valid_files[:10], 1):
            print(f"   {i:2d}. {file} ({size/1024:.0f} KB)")
        if len(valid_files) > 10:
            print(f"       ... and {len(valid_files) - 10} more valid files")
    
    if invalid_files:
        print(f"\n❌ Invalid Downloads (may need to be re-downloaded):")
        for file, size in invalid_files:
            print(f"   • {file} ({size} bytes)")
    
    return {
        'total': len(pdf_files),
        'valid': len(valid_files),
        'invalid': len(invalid_files),
        'total_size_mb': total_size / (1024*1024)
    }

def clean_invalid_downloads(download_folder="papers_before_2022", confirm=True):
    """Remove invalid/corrupted download files"""
    if not os.path.exists(download_folder):
        print(f"📁 Folder {download_folder} does not exist")
        return
    
    all_files = os.listdir(download_folder)
    pdf_files = [f for f in all_files if f.endswith('.pdf')]
    
    invalid_files = []
    for file in pdf_files:
        filepath = os.path.join(download_folder, file)
        file_size = os.path.getsize(filepath)
        if file_size <= 1000:  # Consider invalid if ≤1KB
            invalid_files.append(file)
    
    if not invalid_files:
        print("✅ No invalid files found!")
        return
    
    print(f"❌ Found {len(invalid_files)} invalid files:")
    for file in invalid_files:
        filepath = os.path.join(download_folder, file)
        size = os.path.getsize(filepath)
        print(f"   • {file} ({size} bytes)")
    
    if confirm:
        print(f"\n⚠️  To delete these files, call:")
        print(f"clean_invalid_downloads('{download_folder}', confirm=False)")
        return
    
    # Delete invalid files
    deleted = 0
    for file in invalid_files:
        try:
            filepath = os.path.join(download_folder, file)
            os.remove(filepath)
            deleted += 1
            print(f"   🗑️  Deleted: {file}")
        except Exception as e:
            print(f"   ❌ Failed to delete {file}: {str(e)}")
    
    print(f"\n✅ Deleted {deleted} invalid files")

# Check current downloads
print("🔧 Download management utilities loaded!")
print("💡 Usage:")
print("   check_existing_downloads() - See what you have downloaded")
print("   clean_invalid_downloads() - Remove corrupted files")
print()

# Automatically check existing downloads
check_existing_downloads()

🔧 Download management utilities loaded!
💡 Usage:
   check_existing_downloads() - See what you have downloaded
   clean_invalid_downloads() - Remove corrupted files

📊 EXISTING DOWNLOADS REPORT
📄 Total PDF files: 126
✅ Valid files (>1KB): 126
❌ Invalid files (≤1KB): 0
💾 Total size: 1078.1 MB

✅ Valid Downloads (showing first 10):
    1. 2000_Improvement_of_soft_ground_using_solidified_coal_ash_and_its.pdf (1125 KB)
    2. 2000_Soil_densification_due_to_static_sand_pile_installation_for_.pdf (1493 KB)
    3. 2000_Use_of_embedded_walls_for_mitigation_of_liquefaction-induced.pdf (2600 KB)
    4. 2003_Liquefaction_resistance_of_sand_deposit_improved_with_sand_c.pdf (1473 KB)
    5. 2003_Science_and_empiricism_in_pile_foundation_design.pdf (808 KB)
    6. 2004_Behavior_of_model_rafts_resting_on_pile-reinforced_sand.pdf (528 KB)
    7. 2004_Complementary_design_methodology_for_driven_piles_in_sand.pdf (500 KB)
    8. 2004_Stress_and_pore_pressure_changes_due_to_sand_compaction_pile.pdf (571 K

{'total': 126, 'valid': 126, 'invalid': 0, 'total_size_mb': 1078.1354160308838}

In [18]:
# 📋 CREATE "NOT DOWNLOADED" CSV - Papers from All Details that aren't in papers_before_2022 folder

def create_not_downloaded_csv(
    original_csv="All details_cleaned.csv",
    download_folder="papers_before_2022", 
    output_csv="not_downloaded_papers.csv"
):
    """
    Create a CSV with papers from original dataset that haven't been downloaded
    
    Parameters:
    - original_csv: Path to the original cleaned dataset
    - download_folder: Folder containing downloaded papers
    - output_csv: Output CSV file name for papers not downloaded
    """
    
    print(f"📊 Creating 'Not Downloaded' CSV...")
    print("="*60)
    
    # Load original dataset
    print(f"📁 Loading original dataset: {original_csv}")
    try:
        original_data = pd.read_csv(original_csv)
        print(f"✅ Loaded {len(original_data)} papers from original dataset")
    except Exception as e:
        print(f"❌ Error loading {original_csv}: {str(e)}")
        return
    
    # Get list of downloaded papers from folder
    downloaded_papers_info = set()
    
    if os.path.exists(download_folder):
        downloaded_files = [f for f in os.listdir(download_folder) if f.endswith('.pdf')]
        valid_downloads = []
        
        for file in downloaded_files:
            filepath = os.path.join(download_folder, file)
            file_size = os.path.getsize(filepath)
            
            if file_size > 1000:  # Valid download
                valid_downloads.append(file)
                
                # Extract year and title info from filename
                # Format: YEAR_Title.pdf or YEAR_Title_1.pdf
                try:
                    # Remove .pdf extension
                    base_name = file.replace('.pdf', '')
                    
                    # Remove trailing numbers (like _1, _2, etc.)
                    import re
                    base_name = re.sub(r'_\d+$', '', base_name)
                    
                    # Split by first underscore to get year and title
                    if '_' in base_name:
                        year_part = base_name.split('_')[0]
                        title_part = '_'.join(base_name.split('_')[1:])
                        
                        # Convert underscores back to spaces for title matching
                        title_normalized = title_part.replace('_', ' ').lower().strip()
                        
                        downloaded_papers_info.add((year_part, title_normalized))
                        
                except Exception as e:
                    print(f"⚠️  Could not parse filename: {file}")
                    continue
        
        print(f"📁 Found {len(valid_downloads)} valid downloaded files in {download_folder}")
        print(f"🔍 Extracted info from {len(downloaded_papers_info)} files for matching")
        
    else:
        print(f"📁 Download folder '{download_folder}' does not exist")
        valid_downloads = []
    
    # Mark papers as downloaded if they match files in folder
    original_data['is_downloaded'] = False
    original_data['download_match_method'] = ""
    
    matched_count = 0
    
    for idx, row in original_data.iterrows():
        year = row.get('year')
        title = row.get('title', '')
        doi = row.get('doi', '')
        
        # Normalize title for comparison
        if pd.notna(title):
            title_normalized = str(title).lower().strip()
            title_normalized = re.sub(r'[^\w\s]', '', title_normalized)  # Remove special chars
            title_normalized = re.sub(r'\s+', ' ', title_normalized)     # Normalize spaces
            
            # Create year string
            year_str = f"{int(year)}" if pd.notna(year) else "unknown_year"
            
            # Check if this paper matches any downloaded file
            for downloaded_year, downloaded_title in downloaded_papers_info:
                if downloaded_year == year_str:
                    # Compare titles (allow partial matches for long titles)
                    if len(title_normalized) > 0 and len(downloaded_title) > 0:
                        # Check if titles have significant overlap
                        title_words = set(title_normalized.split())
                        downloaded_words = set(downloaded_title.split())
                        
                        if len(title_words) > 0:
                            overlap = len(title_words & downloaded_words) / len(title_words)
                            
                            if overlap > 0.6:  # 60% word overlap
                                original_data.loc[idx, 'is_downloaded'] = True
                                original_data.loc[idx, 'download_match_method'] = f"Title match ({overlap:.1%})"
                                matched_count += 1
                                break
    
    print(f"🔗 Matched {matched_count} papers with downloaded files")
    
    # Filter for papers NOT downloaded AND have valid DOIs
    not_downloaded = original_data[
        (original_data['is_downloaded'] == False) & 
        (original_data['doi'].notna()) & 
        (original_data['doi'].str.startswith('10.', na=False))
    ].copy()
    
    # Remove the temporary matching columns from output
    if 'is_downloaded' in not_downloaded.columns:
        not_downloaded = not_downloaded.drop(['is_downloaded', 'download_match_method'], axis=1)
    
    # Save the not downloaded papers
    try:
        not_downloaded.to_csv(output_csv, index=False)
        print(f"✅ Saved {len(not_downloaded)} not downloaded papers to: {output_csv}")
    except Exception as e:
        print(f"❌ Error saving {output_csv}: {str(e)}")
        return
    
    # Statistics
    total_original = len(original_data)
    papers_with_valid_doi = original_data[
        (original_data['doi'].notna()) & 
        (original_data['doi'].str.startswith('10.', na=False))
    ].shape[0]
    total_downloaded = matched_count
    total_not_downloaded = len(not_downloaded)
    
    print(f"\n📊 SUMMARY:")
    print(f"   📚 Total papers in original dataset: {total_original}")
    print(f"   🔗 Papers with valid DOI: {papers_with_valid_doi}")
    print(f"   ✅ Papers identified as downloaded: {total_downloaded}")
    print(f"   ❌ Papers NOT downloaded (with valid DOI): {total_not_downloaded}")
    print(f"   📈 Download coverage (of papers with DOI): {(total_downloaded/papers_with_valid_doi*100):.1f}%")
    print(f"   📋 Remaining papers to download: {total_not_downloaded}")
    
    # Show sample of not downloaded papers
    if len(not_downloaded) > 0:
        print(f"\n📋 Sample of NOT downloaded papers (all have valid DOIs):")
        for i, (_, row) in enumerate(not_downloaded.head(5).iterrows(), 1):
            title = row.get('title', 'Unknown Title')
            year = row.get('year', 'Unknown')
            doi = row.get('doi', 'No DOI')
            print(f"   {i}. [{year}] {title[:60]}...")
            print(f"      DOI: {doi}")
    else:
        print(f"\n🎉 All papers with valid DOIs have been downloaded!")
    
    return not_downloaded

# Run the function
print("🚀 Creating NOT DOWNLOADED papers CSV...")
not_downloaded_papers = create_not_downloaded_csv(
    original_csv="All details_cleaned.csv",
    download_folder="papers_before_2022",
    output_csv="not_downloaded_papers2.csv"
)

🚀 Creating NOT DOWNLOADED papers CSV...
📊 Creating 'Not Downloaded' CSV...
📁 Loading original dataset: All details_cleaned.csv
✅ Loaded 357 papers from original dataset
📁 Found 152 valid downloaded files in papers_before_2022
🔍 Extracted info from 151 files for matching
🔗 Matched 81 papers with downloaded files
✅ Saved 169 not downloaded papers to: not_downloaded_papers2.csv

📊 SUMMARY:
   📚 Total papers in original dataset: 357
   🔗 Papers with valid DOI: 250
   ✅ Papers identified as downloaded: 81
   ❌ Papers NOT downloaded (with valid DOI): 169
   📈 Download coverage (of papers with DOI): 32.4%
   📋 Remaining papers to download: 169

📋 Sample of NOT downloaded papers (all have valid DOIs):
   1. [2020.0] 1G laboratory-scale shaking table tests on reduction of liqu...
      DOI: 10.1007/978-981-15-2184-3_82
   2. [2018.0] A case study on seismic response analysis of ground improved...
      DOI: 10.6310/jog.201812_13(4).5
   3. [2021.0] A catastrophic flowslide that overrides a liqu

In [17]:
# 🔍 VERIFY "NOT DOWNLOADED" CSV FILE

# Check if the file was created successfully
import os

csv_file = "not_downloaded_papers.csv"

if os.path.exists(csv_file):
    print(f"✅ File created successfully: {csv_file}")
    
    # Load and show basic info
    try:
        not_downloaded_df = pd.read_csv(csv_file)
        file_size = os.path.getsize(csv_file) / 1024  # KB
        
        print(f"📊 File Statistics:")
        print(f"   📄 File size: {file_size:.1f} KB")
        print(f"   📚 Number of papers: {len(not_downloaded_df)}")
        print(f"   📋 Columns: {list(not_downloaded_df.columns)}")
        
        # Show papers with DOI for potential future downloads
        if 'doi' in not_downloaded_df.columns:
            papers_with_doi_count = not_downloaded_df['doi'].notna().sum()
            valid_doi_count = not_downloaded_df[
                not_downloaded_df['doi'].notna() & 
                not_downloaded_df['doi'].str.startswith('10.', na=False)
            ].shape[0]
            
            print(f"   🔗 Papers with DOI: {papers_with_doi_count}")
            print(f"   ✅ Papers with valid DOI: {valid_doi_count}")
        
        # Show year distribution
        if 'year' in not_downloaded_df.columns:
            year_counts = not_downloaded_df['year'].value_counts().sort_index()
            print(f"\n📅 Year distribution of NOT downloaded papers:")
            for year, count in year_counts.head(10).items():
                if pd.notna(year):
                    print(f"   {int(year)}: {count} papers")
            if len(year_counts) > 10:
                print(f"   ... and {len(year_counts) - 10} more years")
        
        # Show sample entries
        print(f"\n📋 Sample entries (first 3):")
        for i, (_, row) in enumerate(not_downloaded_df.head(3).iterrows(), 1):
            title = row.get('title', 'Unknown Title')
            year = row.get('year', 'Unknown')
            doi = row.get('doi', 'No DOI')
            print(f"   {i}. [{year}] {title[:50]}...")
            print(f"      DOI: {doi}")
            print()
            
    except Exception as e:
        print(f"❌ Error reading the CSV file: {str(e)}")
        
else:
    print(f"❌ File not found: {csv_file}")

print("💡 Next Steps:")
print("   1. Use 'not_downloaded_papers.csv' for future download attempts")
print("   2. Filter by year, journal, or other criteria as needed")
print("   3. Try different download methods (Unpaywall, institutional access, etc.)")
print("   4. Consider contacting authors for papers not available online")

✅ File created successfully: not_downloaded_papers.csv
📊 File Statistics:
   📄 File size: 369.3 KB
   📚 Number of papers: 169
   📋 Columns: ['key', 'title', 'year', 'month', 'day', 'journal', 'issn', 'volume', 'issue', 'pages', 'authors', 'url', 'language', 'publisher', 'location', 'abstract', 'notes', 'doi', 'keywords', 'pubmed_id', 'pmc_id', 'PDF files']
   🔗 Papers with DOI: 169
   ✅ Papers with valid DOI: 169

📅 Year distribution of NOT downloaded papers:
   2000: 2 papers
   2005: 2 papers
   2006: 5 papers
   2007: 1 papers
   2008: 5 papers
   2009: 4 papers
   2010: 4 papers
   2011: 3 papers
   2012: 4 papers
   2013: 7 papers
   ... and 12 more years

📋 Sample entries (first 3):
   1. [2020.0] 1G laboratory-scale shaking table tests on reducti...
      DOI: 10.1007/978-981-15-2184-3_82

   2. [2018.0] A case study on seismic response analysis of groun...
      DOI: 10.6310/jog.201812_13(4).5

   3. [2021.0] A catastrophic flowslide that overrides a liquefie...
      DOI: 10.1

In [19]:
# 📋 CREATE "DOWNLOADED PAPERS" CSV - Papers in papers_before_2022 folder with metadata

def create_downloaded_papers_csv(
    original_csv="All details_cleaned.csv",
    download_folder="papers_before_2022", 
    output_csv="downloaded_papers.csv"
):
    """
    Create a CSV with downloaded papers and their metadata from original dataset
    
    Parameters:
    - original_csv: Path to the original cleaned dataset
    - download_folder: Folder containing downloaded papers
    - output_csv: Output CSV file name for downloaded papers
    """
    
    print(f"📊 Creating 'Downloaded Papers' CSV...")
    print("="*60)
    
    # Load original dataset
    print(f"📁 Loading original dataset: {original_csv}")
    try:
        original_data = pd.read_csv(original_csv)
        print(f"✅ Loaded {len(original_data)} papers from original dataset")
    except Exception as e:
        print(f"❌ Error loading {original_csv}: {str(e)}")
        return
    
    # Get list of downloaded papers from folder
    downloaded_papers_list = []
    
    if not os.path.exists(download_folder):
        print(f"❌ Download folder '{download_folder}' does not exist")
        return
    
    downloaded_files = [f for f in os.listdir(download_folder) if f.endswith('.pdf')]
    valid_downloads = []
    
    print(f"📁 Checking {len(downloaded_files)} PDF files in {download_folder}")
    
    for file in downloaded_files:
        filepath = os.path.join(download_folder, file)
        file_size = os.path.getsize(filepath)
        
        if file_size > 1000:  # Valid download (>1KB)
            valid_downloads.append(file)
            
            # Extract basic info from filename
            file_info = {
                'filename': file,
                'file_size_kb': round(file_size / 1024, 1),
                'file_path': filepath
            }
            
            # Try to extract year and title from filename
            # Format: YEAR_Title.pdf or YEAR_Title_1.pdf
            try:
                base_name = file.replace('.pdf', '')
                
                # Remove trailing numbers (like _1, _2, etc.)
                import re
                base_name = re.sub(r'_\d+$', '', base_name)
                
                # Split by first underscore to get year and title
                if '_' in base_name:
                    year_part = base_name.split('_')[0]
                    title_part = '_'.join(base_name.split('_')[1:])
                    
                    # Convert underscores back to spaces for title
                    title_from_filename = title_part.replace('_', ' ')
                    
                    file_info['year_from_filename'] = year_part
                    file_info['title_from_filename'] = title_from_filename
                else:
                    file_info['year_from_filename'] = 'unknown'
                    file_info['title_from_filename'] = base_name
                    
            except Exception as e:
                file_info['year_from_filename'] = 'parse_error'
                file_info['title_from_filename'] = 'parse_error'
            
            downloaded_papers_list.append(file_info)
    
    print(f"✅ Found {len(valid_downloads)} valid downloaded files")
    
    if not downloaded_papers_list:
        print("❌ No valid downloaded papers found")
        return
    
    # Match downloaded papers with original dataset
    matched_papers = []
    unmatched_files = []
    
    print(f"🔍 Matching downloaded files with original dataset...")
    
    for file_info in downloaded_papers_list:
        year_from_file = file_info['year_from_filename']
        title_from_file = file_info['title_from_filename'].lower().strip()
        
        # Normalize title for comparison
        title_normalized = re.sub(r'[^\w\s]', '', title_from_file)
        title_normalized = re.sub(r'\s+', ' ', title_normalized)
        title_words = set(title_normalized.split())
        
        best_match = None
        best_score = 0
        
        # Try to find matching paper in original dataset
        for idx, row in original_data.iterrows():
            original_year = row.get('year')
            original_title = row.get('title', '')
            
            # Check year match first
            year_str = f"{int(original_year)}" if pd.notna(original_year) else "unknown"
            
            if year_from_file == year_str or year_from_file == 'unknown':
                # Compare titles
                if pd.notna(original_title) and len(str(original_title)) > 0:
                    orig_title_normalized = str(original_title).lower().strip()
                    orig_title_normalized = re.sub(r'[^\w\s]', '', orig_title_normalized)
                    orig_title_normalized = re.sub(r'\s+', ' ', orig_title_normalized)
                    orig_words = set(orig_title_normalized.split())
                    
                    if len(title_words) > 0 and len(orig_words) > 0:
                        # Calculate similarity score
                        overlap = len(title_words & orig_words)
                        union = len(title_words | orig_words)
                        score = overlap / union if union > 0 else 0
                        
                        # Also check if most words from filename are in original title
                        coverage = overlap / len(title_words) if len(title_words) > 0 else 0
                        
                        # Use the better score
                        final_score = max(score, coverage)
                        
                        if final_score > best_score and final_score > 0.4:  # 40% similarity threshold
                            best_match = row.copy()
                            best_score = final_score
        
        if best_match is not None:
            # Combine original metadata with file info
            combined_info = best_match.copy()
            combined_info['downloaded_filename'] = file_info['filename']
            combined_info['file_size_kb'] = file_info['file_size_kb']
            combined_info['file_path'] = file_info['file_path']
            combined_info['match_score'] = round(best_score, 3)
            combined_info['title_from_filename'] = file_info['title_from_filename']
            combined_info['year_from_filename'] = file_info['year_from_filename']
            
            matched_papers.append(combined_info)
        else:
            unmatched_files.append(file_info)
    
    print(f"✅ Matched {len(matched_papers)} files with original dataset")
    print(f"⚠️  {len(unmatched_files)} files could not be matched")
    
    # Create DataFrame for matched papers
    if matched_papers:
        downloaded_df = pd.DataFrame(matched_papers)
        
        # Reorder columns to put file info first
        file_columns = ['downloaded_filename', 'file_size_kb', 'match_score']
        other_columns = [col for col in downloaded_df.columns if col not in file_columns]
        downloaded_df = downloaded_df[file_columns + other_columns]
        
        # Save to CSV
        try:
            downloaded_df.to_csv(output_csv, index=False)
            print(f"✅ Saved {len(downloaded_df)} downloaded papers to: {output_csv}")
        except Exception as e:
            print(f"❌ Error saving {output_csv}: {str(e)}")
            return
        
        # Show statistics
        total_size_mb = downloaded_df['file_size_kb'].sum() / 1024
        avg_size_kb = downloaded_df['file_size_kb'].mean()
        
        print(f"\n📊 DOWNLOADED PAPERS STATISTICS:")
        print(f"   📄 Total downloaded papers: {len(downloaded_df)}")
        print(f"   💾 Total size: {total_size_mb:.1f} MB")
        print(f"   📊 Average file size: {avg_size_kb:.1f} KB")
        print(f"   🔗 Papers with DOI: {downloaded_df['doi'].notna().sum()}")
        
        # Year distribution
        if 'year' in downloaded_df.columns:
            year_counts = downloaded_df['year'].value_counts().sort_index()
            print(f"\n📅 Downloaded papers by year:")
            for year, count in year_counts.head(10).items():
                if pd.notna(year):
                    print(f"   {int(year)}: {count} papers")
            if len(year_counts) > 10:
                print(f"   ... and {len(year_counts) - 10} more years")
        
        # Show sample entries
        print(f"\n📋 Sample downloaded papers (first 3):")
        for i, (_, row) in enumerate(downloaded_df.head(3).iterrows(), 1):
            title = row.get('title', 'Unknown Title')
            year = row.get('year', 'Unknown')
            filename = row.get('downloaded_filename', 'Unknown file')
            size_kb = row.get('file_size_kb', 0)
            match_score = row.get('match_score', 0)
            
            print(f"   {i}. [{year}] {title[:50]}...")
            print(f"      File: {filename} ({size_kb} KB)")
            print(f"      Match confidence: {match_score:.1%}")
            print()
        
        # Show unmatched files if any
        if unmatched_files:
            print(f"\n⚠️  UNMATCHED FILES ({len(unmatched_files)}):")
            for file_info in unmatched_files[:5]:
                print(f"   • {file_info['filename']} ({file_info['file_size_kb']} KB)")
                print(f"     Extracted: [{file_info['year_from_filename']}] {file_info['title_from_filename'][:40]}...")
            if len(unmatched_files) > 5:
                print(f"   ... and {len(unmatched_files) - 5} more unmatched files")
        
        return downloaded_df
    
    else:
        print("❌ No papers could be matched with the original dataset")
        return None

# Run the function
print("🚀 Creating DOWNLOADED papers CSV...")
downloaded_papers = create_downloaded_papers_csv(
    original_csv="All details_cleaned.csv",
    download_folder="papers_before_2022",
    output_csv="downloaded_papers.csv"
)

🚀 Creating DOWNLOADED papers CSV...
📊 Creating 'Downloaded Papers' CSV...
📁 Loading original dataset: All details_cleaned.csv
✅ Loaded 357 papers from original dataset
📁 Checking 152 PDF files in papers_before_2022
✅ Found 152 valid downloaded files
🔍 Matching downloaded files with original dataset...
✅ Matched 152 files with original dataset
⚠️  0 files could not be matched
✅ Saved 152 downloaded papers to: downloaded_papers.csv

📊 DOWNLOADED PAPERS STATISTICS:
   📄 Total downloaded papers: 152
   💾 Total size: 1148.2 MB
   📊 Average file size: 7735.2 KB
   🔗 Papers with DOI: 152

📅 Downloaded papers by year:
   2000: 4 papers
   2003: 2 papers
   2004: 3 papers
   2005: 4 papers
   2006: 4 papers
   2007: 3 papers
   2008: 8 papers
   2009: 5 papers
   2010: 3 papers
   2011: 8 papers
   ... and 14 more years

📋 Sample downloaded papers (first 3):
   1. [2000.0] Experimental study on deformation of soft clay imp...
      File: 2000_Experimental_study_on_deformation_of_soft_clay_impro

In [20]:
# 🔍 VERIFY "DOWNLOADED PAPERS" CSV

def verify_downloaded_papers_csv(csv_file="downloaded_papers.csv"):
    """Verify and analyze the downloaded papers CSV"""
    
    print(f"🔍 VERIFYING: {csv_file}")
    print("="*60)
    
    try:
        df = pd.read_csv(csv_file)
        print(f"✅ Successfully loaded CSV with {len(df)} entries")
        
        # Column information
        print(f"\n📊 COLUMN INFO:")
        print(f"   Total columns: {len(df.columns)}")
        print(f"   Main columns: {list(df.columns[:8])}")
        if len(df.columns) > 8:
            print(f"   + {len(df.columns) - 8} more columns...")
        
        # Data quality checks
        print(f"\n✅ DATA QUALITY:")
        print(f"   📄 Entries with filename: {df['downloaded_filename'].notna().sum()}")
        print(f"   📊 Entries with file size: {df['file_size_kb'].notna().sum()}")
        print(f"   🔗 Entries with DOI: {df['doi'].notna().sum()}")
        print(f"   📝 Entries with title: {df['title'].notna().sum()}")
        print(f"   📅 Entries with year: {df['year'].notna().sum()}")
        print(f"   🎯 Average match score: {df['match_score'].mean():.1%}")
        
        # File size statistics
        total_size_mb = df['file_size_kb'].sum() / 1024
        print(f"\n💾 FILE SIZE STATS:")
        print(f"   Total size: {total_size_mb:.1f} MB")
        print(f"   Average: {df['file_size_kb'].mean():.1f} KB")
        print(f"   Median: {df['file_size_kb'].median():.1f} KB")
        print(f"   Min: {df['file_size_kb'].min():.1f} KB")
        print(f"   Max: {df['file_size_kb'].max():.1f} KB")
        
        # Year distribution
        if 'year' in df.columns:
            year_counts = df['year'].value_counts().sort_index()
            print(f"\n📅 TOP YEARS BY COUNT:")
            for year, count in year_counts.head(5).items():
                if pd.notna(year):
                    print(f"   {int(year)}: {count} papers")
        
        # Match quality
        if 'match_score' in df.columns:
            high_confidence = df[df['match_score'] >= 0.8]
            medium_confidence = df[(df['match_score'] >= 0.5) & (df['match_score'] < 0.8)]
            low_confidence = df[df['match_score'] < 0.5]
            
            print(f"\n🎯 MATCH CONFIDENCE:")
            print(f"   High (≥80%): {len(high_confidence)} papers ({len(high_confidence)/len(df)*100:.1f}%)")
            print(f"   Medium (50-79%): {len(medium_confidence)} papers ({len(medium_confidence)/len(df)*100:.1f}%)")
            print(f"   Low (<50%): {len(low_confidence)} papers ({len(low_confidence)/len(df)*100:.1f}%)")
        
        # Sample entries
        print(f"\n📋 SAMPLE ENTRIES (first 3):")
        for i, (_, row) in enumerate(df.head(3).iterrows(), 1):
            title = row.get('title', 'No title')[:40]
            year = row.get('year', 'Unknown')
            filename = row.get('downloaded_filename', 'No filename')
            size_kb = row.get('file_size_kb', 0)
            doi = row.get('doi', 'No DOI')
            match_score = row.get('match_score', 0)
            
            print(f"   {i}. [{year}] {title}...")
            print(f"      File: {filename}")
            print(f"      Size: {size_kb:.1f} KB | Match: {match_score:.1%}")
            print(f"      DOI: {doi}")
            print()
        
        return df
        
    except FileNotFoundError:
        print(f"❌ File not found: {csv_file}")
        return None
    except Exception as e:
        print(f"❌ Error reading CSV: {str(e)}")
        return None

# Verify the downloaded papers CSV
downloaded_verification = verify_downloaded_papers_csv("downloaded_papers.csv")

🔍 VERIFYING: downloaded_papers.csv
✅ Successfully loaded CSV with 152 entries

📊 COLUMN INFO:
   Total columns: 28
   Main columns: ['downloaded_filename', 'file_size_kb', 'match_score', 'key', 'title', 'year', 'month', 'day']
   + 20 more columns...

✅ DATA QUALITY:
   📄 Entries with filename: 152
   📊 Entries with file size: 152
   🔗 Entries with DOI: 152
   📝 Entries with title: 152
   📅 Entries with year: 152
   🎯 Average match score: 92.0%

💾 FILE SIZE STATS:
   Total size: 1148.2 MB
   Average: 7735.2 KB
   Median: 1756.2 KB
   Min: 267.2 KB
   Max: 192110.3 KB

📅 TOP YEARS BY COUNT:
   2000: 4 papers
   2003: 2 papers
   2004: 3 papers
   2005: 4 papers
   2006: 4 papers

🎯 MATCH CONFIDENCE:
   High (≥80%): 152 papers (100.0%)
   Medium (50-79%): 0 papers (0.0%)
   Low (<50%): 0 papers (0.0%)

📋 SAMPLE ENTRIES (first 3):
   1. [2000.0] Experimental study on deformation of sof...
      File: 2000_Experimental_study_on_deformation_of_soft_clay_improved_by_l.pdf
      Size: 2411.6 

In [None]:
# 📋 CREATE "DOWNLOADED PAPERS" CSV - Papers in papers_before_2022 folder with metadata

def create_downloaded_papers_csv(
    original_csv="All details_cleaned.csv",
    download_folder="papers_before_2022", 
    output_csv="downloaded_papers.csv"
):
    """
    Create a CSV with papers from original dataset that have been downloaded
    
    Parameters:
    - original_csv: Path to the original cleaned dataset
    - download_folder: Folder containing downloaded papers
    - output_csv: Output CSV file name for downloaded papers
    """
    
    print(f"📊 Creating 'Downloaded Papers' CSV...")
    print("="*60)
    
    # Load original dataset
    print(f"📁 Loading original dataset: {original_csv}")
    try:
        original_data = pd.read_csv(original_csv)
        print(f"✅ Loaded {len(original_data)} papers from original dataset")
    except Exception as e:
        print(f"❌ Error loading {original_csv}: {str(e)}")
        return
    
    # Get list of downloaded papers from folder
    downloaded_papers_info = []
    
    if os.path.exists(download_folder):
        downloaded_files = [f for f in os.listdir(download_folder) if f.endswith('.pdf')]
        valid_downloads = []
        
        for file in downloaded_files:
            filepath = os.path.join(download_folder, file)
            file_size = os.path.getsize(filepath)
            
            if file_size > 1000:  # Valid download
                valid_downloads.append(file)
                
                # Extract year and title info from filename
                # Format: YEAR_Title.pdf or YEAR_Title_1.pdf
                try:
                    # Remove .pdf extension
                    base_name = file.replace('.pdf', '')
                    
                    # Remove trailing numbers (like _1, _2, etc.)
                    import re
                    base_name = re.sub(r'_\d+$', '', base_name)
                    
                    # Split by first underscore to get year and title
                    if '_' in base_name:
                        year_part = base_name.split('_')[0]
                        title_part = '_'.join(base_name.split('_')[1:])
                        
                        # Convert underscores back to spaces for title matching
                        title_normalized = title_part.replace('_', ' ').lower().strip()
                        
                        downloaded_papers_info.append({
                            'filename': file,
                            'year_from_file': year_part,
                            'title_from_file': title_normalized,
                            'file_size_kb': file_size / 1024
                        })
                        
                except Exception as e:
                    print(f"⚠️  Could not parse filename: {file}")
                    downloaded_papers_info.append({
                        'filename': file,
                        'year_from_file': 'unknown',
                        'title_from_file': 'unknown',
                        'file_size_kb': file_size / 1024
                    })
                    continue
        
        print(f"📁 Found {len(valid_downloads)} valid downloaded files in {download_folder}")
        print(f"🔍 Extracted info from {len(downloaded_papers_info)} files for matching")
        
    else:
        print(f"📁 Download folder '{download_folder}' does not exist")
        return
    
    # Match downloaded files with original dataset
    downloaded_papers = []
    matched_count = 0
    unmatched_files = []
    
    for file_info in downloaded_papers_info:
        filename = file_info['filename']
        year_from_file = file_info['year_from_file']
        title_from_file = file_info['title_from_file']
        file_size_kb = file_info['file_size_kb']
        
        # Try to find matching paper in original dataset
        best_match = None
        best_score = 0
        
        for idx, row in original_data.iterrows():
            year = row.get('year')
            title = row.get('title', '')
            
            # Check year match first
            year_str = f"{int(year)}" if pd.notna(year) else "unknown_year"
            
            if year_from_file == year_str or year_from_file == "unknown":
                # Normalize title for comparison
                if pd.notna(title) and title_from_file != 'unknown':
                    title_normalized = str(title).lower().strip()
                    title_normalized = re.sub(r'[^\w\s]', '', title_normalized)  # Remove special chars
                    title_normalized = re.sub(r'\s+', ' ', title_normalized)     # Normalize spaces
                    
                    # Compare titles (calculate word overlap)
                    if len(title_normalized) > 0 and len(title_from_file) > 0:
                        title_words = set(title_normalized.split())
                        file_words = set(title_from_file.split())
                        
                        if len(title_words) > 0:
                            overlap = len(title_words & file_words) / len(title_words)
                            
                            if overlap > best_score and overlap > 0.5:  # At least 50% overlap
                                best_score = overlap
                                best_match = idx
        
        if best_match is not None:
            # Found a match - add to downloaded papers list
            row = original_data.iloc[best_match].copy()
            
            # Add download info
            row['downloaded_filename'] = filename
            row['file_size_kb'] = round(file_size_kb, 2)
            row['title_match_score'] = round(best_score, 3)
            row['download_status'] = 'Successfully Downloaded'
            
            downloaded_papers.append(row)
            matched_count += 1
            
        else:
            # No match found
            unmatched_files.append({
                'filename': filename,
                'year_from_file': year_from_file,
                'title_from_file': title_from_file,
                'file_size_kb': round(file_size_kb, 2)
            })
    
    print(f"🔗 Matched {matched_count} downloaded files with original dataset")
    print(f"❓ Unmatched files: {len(unmatched_files)}")
    
    # Create DataFrame from matched papers
    if downloaded_papers:
        downloaded_df = pd.DataFrame(downloaded_papers)
        
        # Sort by year and title
        downloaded_df = downloaded_df.sort_values(['year', 'title'], ascending=[True, True])
        
        # Save to CSV
        try:
            downloaded_df.to_csv(output_csv, index=False)
            print(f"✅ Saved {len(downloaded_df)} downloaded papers to: {output_csv}")
        except Exception as e:
            print(f"❌ Error saving {output_csv}: {str(e)}")
            return
    else:
        print(f"❌ No matches found between downloaded files and original dataset")
        return
    
    # Statistics
    total_downloaded_files = len(downloaded_papers_info)
    total_matched = len(downloaded_papers)
    total_unmatched = len(unmatched_files)
    total_size_mb = sum([info['file_size_kb'] for info in downloaded_papers_info]) / 1024
    
    print(f"\n📊 SUMMARY:")
    print(f"   📁 Total valid files in download folder: {total_downloaded_files}")
    print(f"   ✅ Files matched with original dataset: {total_matched}")
    print(f"   ❓ Files not matched: {total_unmatched}")
    print(f"   💾 Total download size: {total_size_mb:.1f} MB")
    print(f"   📈 Match rate: {(total_matched/total_downloaded_files*100):.1f}%")
    
    # Show sample of downloaded papers
    if len(downloaded_papers) > 0:
        print(f"\n📋 Sample of downloaded papers with metadata:")
        for i, row in enumerate(downloaded_df.head(5).iterrows(), 1):
            _, paper = row
            title = paper.get('title', 'Unknown Title')
            year = paper.get('year', 'Unknown')
            doi = paper.get('doi', 'No DOI')
            filename = paper.get('downloaded_filename', 'Unknown')
            size_kb = paper.get('file_size_kb', 0)
            match_score = paper.get('title_match_score', 0)
            
            print(f"   {i}. [{year}] {title[:50]}...")
            print(f"      DOI: {doi}")
            print(f"      File: {filename} ({size_kb:.0f} KB)")
            print(f"      Match Score: {match_score:.1%}")
            print()
    
    # Show unmatched files if any
    if unmatched_files:
        print(f"\n❓ Unmatched files (not found in original dataset):")
        for i, file_info in enumerate(unmatched_files[:5], 1):
            print(f"   {i}. {file_info['filename']} ({file_info['file_size_kb']:.0f} KB)")
            print(f"      Year: {file_info['year_from_file']}, Title: {file_info['title_from_file'][:40]}...")
        if len(unmatched_files) > 5:
            print(f"      ... and {len(unmatched_files) - 5} more unmatched files")
    
    print(f"\n💡 Use Cases for downloaded_papers.csv:")
    print(f"   📊 Analyze what types of papers you've successfully downloaded")
    print(f"   📅 Track download progress by year, journal, or topic")
    print(f"   🔍 Find papers you have locally when needed")
    print(f"   📈 Generate reports on your downloaded paper collection")
    
    return downloaded_df

# Run the function
print("🚀 Creating DOWNLOADED papers CSV...")
downloaded_papers_df = create_downloaded_papers_csv(
    original_csv="All details_cleaned.csv",
    download_folder="papers_before_2022",
    output_csv="downloaded_papers.csv"
)