In [None]:
import pandas as pd
# Load the bibliography export into a DataFrame; the path is relative to the
# notebook's working directory.
data=pd.read_csv("All details.csv")
# Bare last expression so the notebook displays the available column names.
data.columns

Index(['key', 'title', 'year', 'month', 'day', 'journal', 'issn', 'volume',
       'issue', 'pages', 'authors', 'url', 'language', 'publisher', 'location',
       'abstract', 'notes', 'doi', 'keywords', 'pubmed_id', 'pmc_id',
       'PDF files'],
      dtype='object')

In [3]:
# Peek at the raw DOI column and count how many rows actually carry a value
doi_column = data['doi']
row_total = len(data)
present_total = doi_column.notna().sum()
missing_total = doi_column.isna().sum()

print("Sample DOI entries:")
print(doi_column.head(10))
print("\nDOI column info:")
print(f"Total entries: {row_total}")
print(f"Non-null DOI entries: {present_total}")
print(f"Null DOI entries: {missing_total}")

Sample DOI entries:
0    10.1007/978-981-15-2184-3_82     WE  - Confere...
1                                                  NaN
2    10.1007/978-3-030-34252-4_8     WE  - Conferen...
3                           10.6310/jog.201812_13(4).5
4                                                  NaN
5                                     10.1002/esp.5144
6                                 10.1061/41050(357)97
7                            10.1504/IJCAT.2020.107429
8    10.12989/gae.2016.10.3.315     WE  - Science C...
9                       10.1016/j.oceaneng.2023.115331
Name: doi, dtype: object

DOI column info:
Total entries: 357
Non-null DOI entries: 251
Null DOI entries: 106


In [6]:
# Clean the DOI column by extracting only the DOI part (before first space)
def clean_doi(doi_text):
    """Extract the bare DOI from a cell that may carry trailing export metadata.

    The first whitespace-delimited token is kept when it starts with "10.".
    A leading "doi:" label or doi.org resolver URL is stripped first, so
    "https://doi.org/10.1/x" and "DOI: 10.1/x" both yield "10.1/x".

    Parameters:
    - doi_text: raw cell value (string, NaN/None, or anything str() accepts)

    Returns:
    - the cleaned DOI string, or the original value unchanged when it is
      missing, empty, or does not look like a DOI
    """
    if pd.isna(doi_text):
        return doi_text  # Keep NaN values as they are

    text = str(doi_text).strip()

    # Drop a resolver URL or "doi:" label (case-insensitive) before tokenising
    lowered = text.lower()
    for prefix in ("https://doi.org/", "http://doi.org/",
                   "https://dx.doi.org/", "http://dx.doi.org/", "doi:"):
        if lowered.startswith(prefix):
            text = text[len(prefix):].lstrip()
            break

    # Empty or whitespace-only cells produce no tokens; indexing [0] on them
    # would raise IndexError, so check before taking the first token.
    tokens = text.split()
    if tokens and tokens[0].startswith('10.'):
        return tokens[0]

    return doi_text  # Return original if it doesn't look like a proper DOI

# Build the cleaned DOI column alongside the raw one so the two can be compared
data['doi_cleaned'] = data['doi'].apply(clean_doi)

# Side-by-side preview of the first 15 rows, raw versus cleaned
comparison_df = (
    data[['doi', 'doi_cleaned']]
    .head(15)
    .rename(columns={'doi': 'Original DOI', 'doi_cleaned': 'Cleaned DOI'})
)
print("Comparison of original vs cleaned DOIs:")
print("=" * 80)
print(comparison_df.to_string(index=False))

# Headline counts for the cleaning step
raw_with_spaces = data['doi'].str.contains(' ', na=False).sum()
cleaned_present = data['doi_cleaned'].notna().sum()
cleaned_valid = data['doi_cleaned'].str.startswith('10.', na=False).sum()

print(f"\nSummary:")
print(f"Original DOI entries with spaces: {raw_with_spaces}")
print(f"Cleaned DOI entries: {cleaned_present}")
print(f"DOIs that start with '10.': {cleaned_valid}")

Comparison of original vs cleaned DOIs:
                                                                                   Original DOI                    Cleaned DOI
10.1007/978-981-15-2184-3_82     WE  - Conference Proceedings Citation Index - Science (CPCI-S)   10.1007/978-981-15-2184-3_82
                                                                                            NaN                            NaN
 10.1007/978-3-030-34252-4_8     WE  - Conference Proceedings Citation Index - Science (CPCI-S)    10.1007/978-3-030-34252-4_8
                                                                     10.6310/jog.201812_13(4).5     10.6310/jog.201812_13(4).5
                                                                                            NaN                            NaN
                                                                               10.1002/esp.5144               10.1002/esp.5144
                                                                       

In [7]:
# Show specific examples of the cleaning, then commit the cleaned DOIs and save.
#
# The cleaned values are recomputed here with clean_doi (re-applying it to an
# already cleaned value returns the same value) instead of being read from the
# temporary 'doi_cleaned' column. That column is dropped at the end of this
# cell, so reading it made a second run of the cell fail with a KeyError.
doi_original = data['doi']
doi_cleaned = doi_original.apply(clean_doi)

print("Examples of DOI cleaning:")
print("="*80)

# Find entries that had extra text (spaces in original)
has_extra_text = doi_original.str.contains(' ', na=False)
print(f"Found {has_extra_text.sum()} entries with extra text after DOI")
print("\nBefore and after examples:")

example_pairs = zip(doi_original[has_extra_text].head(5), doi_cleaned[has_extra_text].head(5))
for i, (before, after) in enumerate(example_pairs, 1):
    print(f"{i}. Original: {before}")
    print(f"   Cleaned:  {after}")
    print()

# Replace the original doi column with the cleaned version and drop the
# temporary column if it is still present (errors='ignore' covers re-runs)
data = data.assign(doi=doi_cleaned).drop(columns='doi_cleaned', errors='ignore')

print("✅ DOI column has been cleaned successfully!")
print(f"📊 Statistics:")
print(f"   - Total entries: {len(data)}")
print(f"   - Entries with DOI: {data['doi'].notna().sum()}")
print(f"   - Valid DOIs (starting with '10.'): {data['doi'].str.startswith('10.', na=False).sum()}")

# Save the cleaned data
data.to_csv("All details_cleaned.csv", index=False)
print(f"\n💾 Cleaned data saved as 'All details_cleaned.csv'")

Examples of DOI cleaning:
Found 124 entries with extra text after DOI

Before and after examples:
1. Original: 10.1007/978-981-15-2184-3_82     WE  - Conference Proceedings Citation Index - Science (CPCI-S)
   Cleaned:  10.1007/978-981-15-2184-3_82

2. Original: 10.1007/978-3-030-34252-4_8     WE  - Conference Proceedings Citation Index - Science (CPCI-S)
   Cleaned:  10.1007/978-3-030-34252-4_8

3. Original: 10.12989/gae.2016.10.3.315     WE  - Science Citation Index Expanded (SCI-EXPANDED)
   Cleaned:  10.12989/gae.2016.10.3.315

4. Original: 10.1139/T08-050     WE  - Science Citation Index Expanded (SCI-EXPANDED)
   Cleaned:  10.1139/T08-050

5. Original: 10.1520/GTJ20170217     WE  - Science Citation Index Expanded (SCI-EXPANDED)
   Cleaned:  10.1520/GTJ20170217

✅ DOI column has been cleaned successfully!
📊 Statistics:
   - Total entries: 357
   - Entries with DOI: 251
   - Valid DOIs (starting with '10.'): 250

💾 Cleaned data saved as 'All details_cleaned.csv'


In [8]:
data["doi"].head(10)  

0      10.1007/978-981-15-2184-3_82
1                               NaN
2       10.1007/978-3-030-34252-4_8
3        10.6310/jog.201812_13(4).5
4                               NaN
5                  10.1002/esp.5144
6              10.1061/41050(357)97
7         10.1504/IJCAT.2020.107429
8        10.12989/gae.2016.10.3.315
9    10.1016/j.oceaneng.2023.115331
Name: doi, dtype: object

# 📚 Bulk Paper Download System

This section implements a comprehensive system to:
1. Filter papers that have valid DOIs
2. Attempt to download papers from multiple sources
3. Rename files using paper titles from the dataset
4. Track download status in the dataset
5. Save everything in an organized folder structure

In [10]:
# Import required libraries for downloading
import requests
import time
import os
import re
from urllib.parse import quote
import pandas as pd
from pathlib import Path

# Helper function to clean filename
def clean_filename(title, max_length=100):
    """Turn a paper title into a string that is safe to use as a filename.

    Characters that Windows forbids in filenames are removed, then anything
    that is not a word character, whitespace, hyphen or dot. Runs of
    whitespace become single underscores and the result is cut to
    `max_length` characters. Missing titles, and titles that end up empty,
    yield "Unknown_Title".
    """
    fallback = "Unknown_Title"
    if pd.isna(title):
        return fallback

    text = str(title)
    # First pass: Windows-reserved characters; second pass: everything else
    # outside word characters, whitespace, '-' and '.'
    for pattern in (r'[<>:"/\\|?*]', r'[^\w\s\-.]'):
        text = re.sub(pattern, '', text)

    # Collapse whitespace to underscores, then enforce the length limit
    text = re.sub(r'\s+', '_', text.strip())[:max_length]

    return text or fallback

# Exercise clean_filename on a few representative titles
sample_titles = [
    "A study of heat transfer: effects on chemical processes",
    "Optimization of reactor design (Part 1)",
    "Mass transfer in porous media - A review",
    None,
    "Very long title that needs to be truncated because it exceeds the maximum allowed length for filenames"
]

print("Testing filename cleaning:")
divider = "-" * 50
for raw_title, safe_name in zip(sample_titles, map(clean_filename, sample_titles)):
    print(f"Original: {raw_title}")
    print(f"Cleaned:  {safe_name}")
    print(divider)

Testing filename cleaning:
Original: A study of heat transfer: effects on chemical processes
Cleaned:  A_study_of_heat_transfer_effects_on_chemical_processes
--------------------------------------------------
Original: Optimization of reactor design (Part 1)
Cleaned:  Optimization_of_reactor_design_Part_1
--------------------------------------------------
Original: Mass transfer in porous media - A review
Cleaned:  Mass_transfer_in_porous_media_-_A_review
--------------------------------------------------
Original: None
Cleaned:  Unknown_Title
--------------------------------------------------
Original: Very long title that needs to be truncated because it exceeds the maximum allowed length for filenames
Cleaned:  Very_long_title_that_needs_to_be_truncated_because_it_exceeds_the_maximum_allowed_length_for_filenam
--------------------------------------------------


In [11]:
# Step 1: Work on a copy of the dataset and keep only rows with a usable DOI
data_copy = data.copy()

print("Filtering dataset for valid DOIs...")
print(f"Original dataset size: {len(data_copy)}")

# First pass: drop rows whose DOI is missing
has_doi = data_copy['doi'].notna()
with_doi = data_copy.loc[has_doi]
print(f"After removing NaN DOIs: {len(with_doi)}")

# Second pass: keep only DOIs that start with "10.", and attach the
# download-tracking columns with their initial values
looks_valid = with_doi['doi'].str.startswith('10.', na=False)
data_filtered = with_doi.loc[looks_valid].assign(
    downloaded=False,
    download_filename="",
    download_status="Not attempted",
)
print(f"After removing invalid DOI formats: {len(data_filtered)}")

print(f"\n✅ Filtered dataset ready with {len(data_filtered)} papers to download")
print(f"Sample DOIs to download:")
for position, doi in enumerate(data_filtered['doi'].head(5), 1):
    print(f"  {position}. {doi}")

Filtering dataset for valid DOIs...
Original dataset size: 357
After removing NaN DOIs: 251
After removing invalid DOI formats: 250

✅ Filtered dataset ready with 250 papers to download
Sample DOIs to download:
  1. 10.1007/978-981-15-2184-3_82
  2. 10.1007/978-3-030-34252-4_8
  3. 10.6310/jog.201812_13(4).5
  4. 10.1002/esp.5144
  5. 10.1061/41050(357)97


In [12]:
# Step 2: Download functions with multiple sources

def check_open_access(doi):
    """Check if paper is open access using Unpaywall API

    Parameters:
    - doi: DOI to look up

    Returns:
    - (is_oa, url): is_oa is True when the Unpaywall record is flagged open
      access. url is the best location's 'url_for_pdf', falling back to its
      'url', or None when the record lists no location. Per the Unpaywall data
      format, 'url' may point at a landing page rather than a PDF, so callers
      should validate what they download.
    - (False, None) on a non-200 response, a record that is not open access,
      or any error (the error message is printed).
    """
    try:
        # Using a generic email - replace with your actual email for better results
        # Percent-encode the DOI: characters such as '<', '#' or '?' would
        # otherwise change how the URL is parsed.
        url = f"https://api.unpaywall.org/v2/{quote(str(doi), safe='/')}?email=student@university.edu"
        response = requests.get(url, timeout=10)

        if response.status_code == 200:
            record = response.json()
            if record.get('is_oa', False):
                # 'best_oa_location' can be present but null, so `or {}` is
                # needed; a .get() default only covers a missing key.
                oa_location = record.get('best_oa_location') or {}
                pdf_url = oa_location.get('url_for_pdf') or oa_location.get('url')
                return True, pdf_url

        return False, None
    except Exception as e:
        print(f"   Unpaywall API error: {str(e)}")
        return False, None

def download_from_scihub(doi):
    """Attempt to download from Sci-Hub (check legal implications in your jurisdiction)

    Each mirror is tried in order. A response is accepted only when it has
    status 200 and its body carries a PDF header; the previous
    "larger than 10 kB" shortcut also accepted ordinary HTML pages.

    Parameters:
    - doi: DOI appended to each mirror's base URL

    Returns:
    - the PDF bytes from the first mirror that serves one, or None when no
      mirror does (request errors are skipped so the next mirror is tried)
    """
    scihub_urls = [
        "https://sci-hub.se/",
        "https://sci-hub.st/", 
        "https://sci-hub.ru/",
        "https://sci-hub.wf/"
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    for base_url in scihub_urls:
        try:
            response = requests.get(f"{base_url}{doi}", headers=headers, timeout=15, allow_redirects=True)
        except Exception:
            continue  # Try next URL

        if response.status_code != 200:
            continue

        payload = response.content
        # Accept only bytes with a "%PDF-" marker near the start (searched in
        # the first 1024 bytes to tolerate leading junk before the header).
        if payload and b'%PDF-' in payload[:1024]:
            return payload

    return None

def _is_usable_pdf(content, min_size=1000):
    """Return True when content is larger than min_size bytes and carries a PDF header."""
    return bool(content) and len(content) > min_size and b'%PDF-' in content[:1024]


def download_paper(doi, title, download_folder="downloaded_papers"):
    """Download a single paper using multiple methods

    The Open Access location reported by check_open_access is tried first,
    then download_from_scihub. A payload is used only if it is a PDF of more
    than 1000 bytes, so an HTML landing page is never written out as a .pdf
    and an unusable Open Access response no longer blocks the fallback.

    Parameters:
    - doi: DOI of the paper
    - title: paper title, turned into the filename via clean_filename
    - download_folder: target folder (created if missing)

    Returns:
    - (success, filename, status): filename is the saved file's base name,
      or "" on failure; status is a human-readable message
    """
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(download_folder, exist_ok=True)

    print(f"   Attempting to download: {doi}")

    content = None
    source = None

    # Method 1: Check Open Access first
    print("   Checking Open Access...")
    is_oa, oa_url = check_open_access(doi)

    if is_oa and oa_url:
        try:
            response = requests.get(oa_url, timeout=20)
            if response.status_code == 200 and _is_usable_pdf(response.content):
                content = response.content
                source = "Open Access"
        except Exception as e:
            # Best effort: report the problem and fall through to the next
            # source (a bare `except:` would also swallow KeyboardInterrupt)
            print(f"   Open Access download error: {str(e)}")

    # Method 2: Try Sci-Hub if Open Access failed
    if content is None:
        print("   Trying Sci-Hub...")
        candidate = download_from_scihub(doi)
        if _is_usable_pdf(candidate):
            content = candidate
            source = "Sci-Hub"

    # Save the file if we got content
    if content is not None:
        # Create filename from title
        clean_title = clean_filename(title, max_length=80)
        filepath = os.path.join(download_folder, f"{clean_title}.pdf")

        # Handle duplicate filenames by appending _1, _2, ... before the extension
        name, ext = os.path.splitext(filepath)
        counter = 1
        while os.path.exists(filepath):
            filepath = f"{name}_{counter}{ext}"
            counter += 1

        try:
            with open(filepath, 'wb') as f:
                f.write(content)

            print(f"   ✅ Downloaded successfully from {source}")
            return True, os.path.basename(filepath), f"Downloaded from {source}"
        except Exception as e:
            print(f"   ❌ Error saving file: {str(e)}")
            return False, "", f"Error saving: {str(e)}"

    print(f"   ❌ Download failed")
    return False, "", "Download failed - no sources available"

# Smoke-test the downloader on the first paper of the filtered dataset
print("Testing download function with first DOI...")
test_doi = data_filtered['doi'].iat[0]
if 'title' in data_filtered.columns:
    test_title = data_filtered['title'].iat[0]
else:
    test_title = "Test Paper"

success, filename, status = download_paper(test_doi, test_title)
print(f"Test result: Success={success}, Filename={filename}, Status={status}")

Testing download function with first DOI...
   Attempting to download: 10.1007/978-981-15-2184-3_82
   Checking Open Access...
   Trying Sci-Hub...
   ❌ Download failed
Test result: Success=False, Filename=, Status=Download failed - no sources available


In [13]:
# Step 3: Main bulk download function
def bulk_download_papers(data_filtered, download_folder="downloaded_papers", max_papers=None, delay=3):
    """
    Bulk download papers from filtered dataset

    Parameters:
    - data_filtered: DataFrame with valid DOIs
    - download_folder: Folder to save downloaded papers
    - max_papers: Maximum number of papers to download (None for all, 0 for none)
    - delay: Delay between downloads in seconds

    Returns:
    - data_filtered itself, updated in place: the 'downloaded',
      'download_filename' and 'download_status' values are set for every row
      that was processed; rows beyond max_papers are left untouched
    """

    os.makedirs(download_folder, exist_ok=True)

    # Limit papers if specified. Compare against None explicitly: a plain
    # truthiness test treated max_papers=0 as "download everything".
    # The copy is a snapshot to iterate over while data_filtered is updated.
    if max_papers is not None:
        papers_to_download = data_filtered.head(max_papers).copy()
    else:
        papers_to_download = data_filtered.copy()

    total = len(papers_to_download)
    print(f"🚀 Starting bulk download of {total} papers...")
    print(f"📁 Download folder: {download_folder}")
    print(f"⏱️  Delay between downloads: {delay} seconds")
    print("="*80)

    successful_downloads = 0
    failed_downloads = 0

    for index, (idx, row) in enumerate(papers_to_download.iterrows(), 1):
        doi = row['doi']
        title = row.get('title', f"Paper_{index}")
        # A missing title arrives as NaN (a float), which cannot be sliced
        # for the progress line, so use the positional placeholder instead.
        if pd.isna(title):
            title = f"Paper_{index}"

        print(f"\n[{index}/{total}] Processing: {str(title)[:60]}...")
        print(f"DOI: {doi}")

        # Attempt download
        success, filename, status = download_paper(doi, title, download_folder)

        # Update the filtered dataset
        data_filtered.loc[idx, 'downloaded'] = success
        data_filtered.loc[idx, 'download_filename'] = filename
        data_filtered.loc[idx, 'download_status'] = status

        if success:
            successful_downloads += 1
        else:
            failed_downloads += 1

        # Progress update
        print(f"   Status: {status}")
        print(f"   Progress: {successful_downloads} successful, {failed_downloads} failed")

        # Add delay between downloads (not after the last one)
        if index < total:
            print(f"   Waiting {delay} seconds before next download...")
            time.sleep(delay)

    # Guard the percentage: with nothing processed the division would be 0/0
    attempted = successful_downloads + failed_downloads
    success_rate = successful_downloads / attempted * 100 if attempted else 0.0

    print("\n" + "="*80)
    print(f"🎉 Bulk download completed!")
    print(f"✅ Successful downloads: {successful_downloads}")
    print(f"❌ Failed downloads: {failed_downloads}")
    print(f"📊 Success rate: {success_rate:.1f}%")

    return data_filtered

# Summarise the available download options (informational only; nothing is
# read from the user here)
print("📋 Download Configuration:")
print(f"Total papers available for download: {len(data_filtered)}")
# "\n" rather than "\\n": the doubled backslash printed a literal \n
print("\nOptions:")
print("1. Download all papers (could take a long time)")
print("2. Download first 10 papers (for testing)")
print("3. Download first 50 papers")
print("4. Custom number")

# For demo purposes, let's start with a small number
test_download_count = 5
print(f"\n🔧 For demonstration, downloading first {test_download_count} papers...")

📋 Download Configuration:
Total papers available for download: 250
\nOptions:
1. Download all papers (could take a long time)
2. Download first 10 papers (for testing)
3. Download first 50 papers
4. Custom number
\n🔧 For demonstration, downloading first 5 papers...


In [14]:
# Step 4: Kick off the bulk download on a small trial batch
for banner_line in (
    "🚀 Starting bulk download process...",
    "⚠️  Starting with a small test batch. Modify 'max_papers' parameter to download more.",
):
    print(banner_line)

# Trial-run settings: first 10 papers, 3 seconds between downloads
batch_options = dict(
    download_folder="downloaded_papers",
    max_papers=10,  # Change this number or set to None for all papers
    delay=3,
)
data_with_downloads = bulk_download_papers(data_filtered, **batch_options)

🚀 Starting bulk download process...
⚠️  Starting with a small test batch. Modify 'max_papers' parameter to download more.
🚀 Starting bulk download of 10 papers...
📁 Download folder: downloaded_papers
⏱️  Delay between downloads: 3 seconds

[1/10] Processing: 1G laboratory-scale shaking table tests on reduction of liqu...
DOI: 10.1007/978-981-15-2184-3_82
   Attempting to download: 10.1007/978-981-15-2184-3_82
   Checking Open Access...
   Trying Sci-Hub...
   ❌ Download failed
   Status: Download failed - no sources available
   Progress: 0 successful, 1 failed
   Waiting 3 seconds before next download...

[2/10] Processing: A Case Study on Buckling Stability of Piles in Liquefiable G...
DOI: 10.1007/978-3-030-34252-4_8
   Attempting to download: 10.1007/978-3-030-34252-4_8
   Checking Open Access...
   Trying Sci-Hub...
   ❌ Download failed
   Status: Download failed - no sources available
   Progress: 0 successful, 2 failed
   Waiting 3 seconds before next download...

[3/10] Process

In [None]:
# Step 5: Save the updated dataset with download status
print("\n📊 Download Results Summary:")
print("="*80)

# Rows still marked "Not attempted" were never processed, so they are kept
# out of the failure count and the success rate instead of being reported
# as failed downloads.
attempted_mask = data_with_downloads['download_status'] != "Not attempted"
downloaded_mask = data_with_downloads['downloaded'].astype(bool)

total_with_doi = len(data_with_downloads)
total_attempted = int(attempted_mask.sum())
downloaded_count = int(downloaded_mask.sum())
failed_count = int((attempted_mask & ~downloaded_mask).sum())
# Guard against dividing by zero when nothing has been attempted yet
success_rate = downloaded_count / total_attempted * 100 if total_attempted else 0.0

print(f"📈 Download Statistics:")
print(f"   - Total papers with valid DOIs: {total_with_doi}")
print(f"   - Attempted downloads: {total_attempted}")
print(f"   - Successfully downloaded: {downloaded_count}")
print(f"   - Failed downloads: {failed_count}")
print(f"   - Success rate: {success_rate:.1f}%")

# Show download status breakdown
status_counts = data_with_downloads['download_status'].value_counts()
print(f"\n📋 Download Status Breakdown:")
for status, count in status_counts.items():
    print(f"   - {status}: {count}")

# Save the dataset with download information
data_with_downloads.to_csv("papers_with_download_status.csv", index=False)
print(f"\n💾 Updated dataset saved as 'papers_with_download_status.csv'")

# Show sample of downloaded papers
downloaded_papers = data_with_downloads[downloaded_mask]
if len(downloaded_papers) > 0:
    print(f"\n✅ Successfully Downloaded Papers:")
    print("-" * 80)
    for i, (_, row) in enumerate(downloaded_papers.head(5).iterrows(), 1):
        title = row.get('title', 'Unknown Title')
        # A missing title is NaN (a float) and cannot be sliced
        if pd.isna(title):
            title = 'Unknown Title'
        filename = row['download_filename']
        print(f"{i}. {str(title)[:60]}...")
        print(f"   File: {filename}")
        print(f"   DOI: {row['doi']}")
        print()

# Show folder structure
print(f"\n📁 Downloaded papers are saved in the 'downloaded_papers' folder")
if os.path.exists("downloaded_papers"):
    files_in_folder = os.listdir("downloaded_papers")
    print(f"   Total files in folder: {len(files_in_folder)}")
    for file in files_in_folder[:5]:  # Show first 5 files
        print(f"   - {file}")
    if len(files_in_folder) > 5:
        print(f"   ... and {len(files_in_folder) - 5} more files")
else:
    print("   Folder not created yet (no successful downloads)")

In [None]:
# Step 6: Configuration for downloading more papers
print("🔧 Configuration for Bulk Download:")
print("="*80)
print("To download more papers, modify and run the following cell:")
print()
print("# Uncomment and modify the following lines to download more papers:")
print("# For all papers:")
print("# data_final = bulk_download_papers(data_filtered, max_papers=None, delay=5)")
print()
print("# For specific number of papers:")
print("# data_final = bulk_download_papers(data_filtered, max_papers=50, delay=3)")
print()
print("# For faster downloads (less delay, but be respectful to servers):")
print("# data_final = bulk_download_papers(data_filtered, max_papers=20, delay=2)")

# Count only rows whose status has moved on from "Not attempted"; the frame
# itself holds every paper with a valid DOI, processed or not.
processed_count = int((data_with_downloads['download_status'] != "Not attempted").sum())

print(f"\n📊 Current Status:")
print(f"   - Papers available for download: {len(data_filtered)}")
print(f"   - Papers already processed: {processed_count}")
print(f"   - Successfully downloaded: {data_with_downloads['downloaded'].sum()}")

print(f"\n⚠️  Important Notes:")
print("   1. Downloading all papers may take several hours")
print("   2. Some downloads may fail due to paywalls or server issues")
print("   3. Be respectful with download delays to avoid overwhelming servers")
print("   4. Check legal implications of using Sci-Hub in your jurisdiction")
print("   5. Consider using your institutional access for better results")

# Example configuration for downloading all papers. It is kept as real
# comments: a triple-quoted string as the last statement of a cell is an
# expression, so Jupyter would echo it as the cell's output.
#
# data_final = bulk_download_papers(
#     data_filtered,
#     download_folder="downloaded_papers",
#     max_papers=None,  # Set to None for all papers
#     delay=5  # Increase delay for more respectful downloading
# )