# EDA and undertanding

In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.4


In [2]:
import pandas as pd
import numpy as np

# Load training labels
train_labels = pd.read_csv("/kaggle/input/make-data-count-finding-data-references/train_labels.csv")

print("=== TRAINING DATA OVERVIEW ===")
print(f"Shape: {train_labels.shape}")
print(f"Rows: {len(train_labels)}, Columns: {len(train_labels.columns)}")
print()

print("=== COLUMN NAMES ===")
print(train_labels.columns.tolist())
print()

print("=== FIRST 5 ROWS ===")
print(train_labels.head())
print()

print("=== DATA TYPES ===")
print(train_labels.dtypes)
print()

print("=== MISSING VALUES ===")
print(train_labels.isnull().sum())


=== TRAINING DATA OVERVIEW ===
Shape: (1066, 3)
Rows: 1066, Columns: 3

=== COLUMN NAMES ===
['article_id', 'dataset_id', 'type']

=== FIRST 5 ROWS ===
               article_id                                 dataset_id     type
0    10.1002_2017jc013030             https://doi.org/10.17882/49388  Primary
1  10.1002_anie.201916483  https://doi.org/10.5517/ccdc.csd.cc1npvt0  Missing
2  10.1002_anie.202005531  https://doi.org/10.5517/ccdc.csd.cc24wxqp  Missing
3  10.1002_anie.202007717  https://doi.org/10.5517/ccdc.csd.cc24rrb0  Missing
4  10.1002_chem.201902131  https://doi.org/10.5517/ccdc.csd.cc221dk3  Missing

=== DATA TYPES ===
article_id    object
dataset_id    object
type          object
dtype: object

=== MISSING VALUES ===
article_id    0
dataset_id    0
type          0
dtype: int64


In [3]:
# Let's examine the target variables
print("=== CITATION TYPES ===")
print(train_labels['type'].value_counts())
print()
print("Type distribution:")
print(train_labels['type'].value_counts(normalize=True))
print()

print("=== DATASET ID PATTERNS ===")
print(f"Total unique datasets: {train_labels['dataset_id'].nunique()}")
print("Most common datasets:")
print(train_labels['dataset_id'].value_counts().head(10))
print()

print("=== ARTICLE ID PATTERNS ===")
print(f"Total unique articles: {train_labels['article_id'].nunique()}")
print("Articles with most citations:")
print(train_labels['article_id'].value_counts().head(10))


=== CITATION TYPES ===
type
Secondary    449
Missing      347
Primary      270
Name: count, dtype: int64

Type distribution:
type
Secondary    0.421201
Missing      0.325516
Primary      0.253283
Name: proportion, dtype: float64

=== DATASET ID PATTERNS ===
Total unique datasets: 1066
Most common datasets:
dataset_id
https://doi.org/10.17882/49388           1
PRJNA167259                              1
https://doi.org/10.5061/dryad.m4r46      1
https://doi.org/10.5061/dryad.27m63      1
https://doi.org/10.5061/dryad.27m63.1    1
https://doi.org/10.5061/dryad.27m63.2    1
https://doi.org/10.5061/dryad.kh186      1
CP013147                                 1
PRJNA10687                               1
PRJNA16146                               1
Name: count, dtype: int64

=== ARTICLE ID PATTERNS ===
Total unique articles: 523
Articles with most citations:
article_id
10.3390_v11060565               32
10.1038_s41396-020-00885-8      31
10.1128_spectrum.00422-24       29
10.1371_journal.pone.01

In [4]:
# Let's categorize the types of dataset IDs we see
def categorize_dataset_id(dataset_id):
    """Categorize dataset IDs by their pattern"""
    if pd.isna(dataset_id):
        return "Missing"
    
    dataset_id = str(dataset_id)
    
    if dataset_id.startswith('http'):
        return "Full URL"
    elif dataset_id.startswith('doi:') or 'doi.org' in dataset_id or dataset_id.startswith('10.'):
        return "DOI"
    elif dataset_id.startswith('GSE'):
        return "GEO (Gene Expression Omnibus)"
    elif dataset_id.startswith('E-'):
        return "ArrayExpress"
    elif dataset_id.startswith('PRJ'):
        return "ENA (European Nucleotide Archive)"
    elif dataset_id.startswith('PDB'):
        return "PDB (Protein Data Bank)"
    elif dataset_id.startswith('CHEMBL'):
        return "ChEMBL"
    elif 'zenodo' in dataset_id.lower():
        return "Zenodo"
    elif 'figshare' in dataset_id.lower():
        return "Figshare"
    elif 'dryad' in dataset_id.lower():
        return "Dryad"
    else:
        return "Other"

# Apply categorization
train_labels['dataset_category'] = train_labels['dataset_id'].apply(categorize_dataset_id)

print("=== DATASET ID CATEGORIES ===")
category_counts = train_labels['dataset_category'].value_counts()
for category, count in category_counts.items():
    print(f"{category:30s}: {count:4d} ({count/len(train_labels)*100:.1f}%)")


=== DATASET ID CATEGORIES ===
Full URL                      :  668 (62.7%)
Other                         :  303 (28.4%)
ArrayExpress                  :   37 (3.5%)
ChEMBL                        :   29 (2.7%)
ENA (European Nucleotide Archive):   26 (2.4%)
GEO (Gene Expression Omnibus) :    3 (0.3%)


In [5]:
# Let's also examine article IDs
print("\n=== ARTICLE ID FORMATS ===")
sample_articles = train_labels['article_id'].sample(10, random_state=42)
print("Sample article IDs:")
for i, article in enumerate(sample_articles, 1):
    print(f"{i:2d}. {article}")



=== ARTICLE ID FORMATS ===
Sample article IDs:
 1. 10.1002_mp.14424
 2. 10.1590_1809-6891v21e-43578
 3. 10.1128_spectrum.00422-24
 4. 10.7554_elife.62329
 5. 10.1590_1679-78255326
 6. 10.1371_journal.pone.0220399
 7. 10.1186_s13007-019-0403-2
 8. 10.1186_s12977-015-0204-2
 9. 10.1029_2020jf005675
10. 10.1128_spectrum.00422-24


In [6]:
import os

train_dir = "/kaggle/input/make-data-count-finding-data-references/train"
test_dir = "/kaggle/input/make-data-count-finding-data-references/test"

print("=== TRAINING DIRECTORY ===")
train_files = os.listdir(train_dir)
print(f"Total files in train directory: {len(train_files)}")
print("Sample files:")
for i, file in enumerate(train_files[:10], 1):
    print(f"{i:2d}. {file}")

print("\n=== TEST DIRECTORY ===")
test_files = os.listdir(test_dir)
print(f"Total files in test directory: {len(test_files)}")
print("Sample files:")
for i, file in enumerate(test_files[:10], 1):
    print(f"{i:2d}. {file}")

# Check file extensions
train_extensions = [os.path.splitext(f)[1] for f in train_files]
test_extensions = [os.path.splitext(f)[1] for f in test_files]

print(f"\nTrain file extensions: {set(train_extensions)}")
print(f"Test file extensions: {set(test_extensions)}")


=== TRAINING DIRECTORY ===
Total files in train directory: 2
Sample files:
 1. XML
 2. PDF

=== TEST DIRECTORY ===
Total files in test directory: 2
Sample files:
 1. XML
 2. PDF

Train file extensions: {''}
Test file extensions: {''}


In [7]:
# How many citations per article?
citations_per_article = train_labels['article_id'].value_counts()
print("=== CITATIONS PER ARTICLE ===")
print(f"Min citations per article: {citations_per_article.min()}")
print(f"Max citations per article: {citations_per_article.max()}")
print(f"Mean citations per article: {citations_per_article.mean():.2f}")
print(f"Median citations per article: {citations_per_article.median():.2f}")

print("\nArticles with most citations:")
print(citations_per_article.head(10))

print("\nDistribution of citations per article:")
print(citations_per_article.value_counts().sort_index().head(10))


=== CITATIONS PER ARTICLE ===
Min citations per article: 1
Max citations per article: 32
Mean citations per article: 2.04
Median citations per article: 1.00

Articles with most citations:
article_id
10.3390_v11060565               32
10.1038_s41396-020-00885-8      31
10.1128_spectrum.00422-24       29
10.1371_journal.pone.0159387    27
10.1371_journal.pone.0212669    25
10.7717_peerj.10452             25
10.1111_cas.12935               23
10.1128_JVI.01717-21            22
10.1371_journal.pcbi.1011828    22
10.1038_s41598-020-59839-x      20
Name: count, dtype: int64

Distribution of citations per article:
count
1     421
2      52
3      15
4       7
5       2
6       1
7       2
8       1
10      2
11      1
Name: count, dtype: int64


In [8]:
# How often is each dataset cited?
dataset_citation_counts = train_labels['dataset_id'].value_counts()
print("=== DATASET REUSE PATTERNS ===")
print(f"Datasets cited once: {sum(dataset_citation_counts == 1)}")
print(f"Datasets cited multiple times: {sum(dataset_citation_counts > 1)}")
print(f"Most cited datasets:")
print(dataset_citation_counts.head(10))

# Check if the same dataset can have different types in different papers
dataset_type_combinations = train_labels.groupby('dataset_id')['type'].nunique()
multi_type_datasets = dataset_type_combinations[dataset_type_combinations > 1]
print(f"\nDatasets that appear with different types: {len(multi_type_datasets)}")
if len(multi_type_datasets) > 0:
    print("Examples:")
    for dataset in multi_type_datasets.index[:5]:
        types = train_labels[train_labels['dataset_id'] == dataset]['type'].unique()
        print(f"  {dataset}: {list(types)}")


=== DATASET REUSE PATTERNS ===
Datasets cited once: 1066
Datasets cited multiple times: 0
Most cited datasets:
dataset_id
https://doi.org/10.17882/49388           1
PRJNA167259                              1
https://doi.org/10.5061/dryad.m4r46      1
https://doi.org/10.5061/dryad.27m63      1
https://doi.org/10.5061/dryad.27m63.1    1
https://doi.org/10.5061/dryad.27m63.2    1
https://doi.org/10.5061/dryad.kh186      1
CP013147                                 1
PRJNA10687                               1
PRJNA16146                               1
Name: count, dtype: int64

Datasets that appear with different types: 0


Let’s now examine a specific PDF to understand the content:

In [9]:
import pymupdf

def examine_pdf_content(article_id):
    """Examine the content of a PDF file"""
    pdf_path = f"/kaggle/input/make-data-count-finding-data-references/train/PDF/{article_id}.pdf"
    
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return
    
    try:
        doc = pymupdf.open(pdf_path)
        print(f"=== PDF CONTENT FOR {article_id} ===")
        print(f"Number of pages: {len(doc)}")
        
        # Get text from first few pages
        full_text = ""
        for page_num in range(min(3, len(doc))):  # First 3 pages max
            page = doc[page_num]
            full_text += page.get_text()
        
        print(f"Total text length: {len(full_text)} characters")
        print("\nFirst 1000 characters:")
        print("=" * 50)
        print(full_text[:1000])
        print("=" * 50)
        
        # Look for citations in this paper
        paper_citations = train_labels[train_labels['article_id'] == article_id]
        print(f"\nKnown citations for this paper ({len(paper_citations)}):")
        for _, row in paper_citations.iterrows():
            print(f"  - {row['dataset_id']} ({row['type']})")
        
        doc.close()
        
    except Exception as e:
        print(f"Error reading PDF {article_id}: {e}")

# Examine a sample article
if len(train_labels) > 0:
    sample_article = train_labels['article_id'].iloc[0]
    examine_pdf_content(sample_article)
    
    # Examine another article with multiple citations
    article_with_most_citations = train_labels['article_id'].value_counts().index[0]
    if article_with_most_citations != sample_article:
        print("\n" + "="*60)
        examine_pdf_content(article_with_most_citations)


=== PDF CONTENT FOR 10.1002_2017jc013030 ===
Number of pages: 22
Total text length: 14559 characters

First 1000 characters:
RESEARCH ARTICLE
10.1002/2017JC013030
Assessing the Variability in the Relationship Between the
Particulate Backscattering Coefficient and the Chlorophyll a
Concentration From a Global Biogeochemical-Argo Database
Marie Barbieux1
, Julia Uitz1, Annick Bricaud1, Emanuele Organelli1,2
, Antoine Poteau1
,
Catherine Schmechtig3
, Bernard Gentili1, Grigor Obolensky4, Edouard Leymarie1
,
Christophe Penkerc’h1, Fabrizio D’Ortenzio1
, and Herve Claustre1
1Sorbonne Universites, UPMC Univ Paris 06, CNRS, Observatoire Oceanologique de Villefranche, Laboratoire
d’Oceanographie de Villefranche, Villefranche-sur-Mer, France, 2Plymouth Marine Laboratory, Prospect Place, The Hoe,
Plymouth, United Kingdom, 3OSU Ecce Terra, UMS 3455, CNRS and Universite Pierre et Marie Curie, Paris 6, Paris, France,
4ERIC Euro-Argo, 29280 Plouzane, France
Abstract Characterizing phytoplankto

Let’s also check the DOI formats and make sure we understand the submission requirements:





In [10]:
def analyze_doi_formats():
    """Analyze DOI formats in the dataset"""
    # Filter for Full URL category since DOIs are URLs
    full_urls = train_labels[train_labels['dataset_category'] == 'Full URL']['dataset_id']
    
    # Further filter for actual DOIs within Full URLs
    dois = full_urls[full_urls.str.contains('doi.org', case=False, na=False)]
    
    print("=== DOI FORMAT ANALYSIS ===")
    print(f"Total Full URLs: {len(full_urls)}")
    print(f"Total DOIs (within Full URLs): {len(dois)}")
    
    if len(dois) == 0:
        print("No DOIs found in the dataset.")
        return
    
    # Check different DOI formats
    doi_formats = {
        'https://doi.org/': sum(dois.str.startswith('https://doi.org/', na=False)),
        'http://doi.org/': sum(dois.str.startswith('http://doi.org/', na=False)),
        'doi:': sum(dois.str.startswith('doi:', na=False)),
        'Other DOI format': len(dois) - sum(dois.str.startswith(('https://doi.org/', 'http://doi.org/', 'doi:'), na=False))
    }
    
    print("DOI format distribution:")
    for format_name, count in doi_formats.items():
        if len(dois) > 0:  # Prevent division by zero
            percentage = count/len(dois)*100
            print(f"  {format_name:20s}: {count:4d} ({percentage:.1f}%)")
        else:
            print(f"  {format_name:20s}: {count:4d} (0.0%)")
    
    print("\nSample DOIs by format:")
    for format_name in doi_formats:
        if doi_formats[format_name] > 0:
            # Get sample DOIs of this format
            if format_name == 'https://doi.org/':
                sample = dois[dois.str.startswith('https://doi.org/', na=False)].head(3)
            elif format_name == 'http://doi.org/':
                sample = dois[dois.str.startswith('http://doi.org/', na=False)].head(3)
            elif format_name == 'doi:':
                sample = dois[dois.str.startswith('doi:', na=False)].head(3)
            else:  # Other DOI format
                mask = ~dois.str.startswith(('https://doi.org/', 'http://doi.org/', 'doi:'), na=False)
                sample = dois[mask].head(3)
            
            print(f"  {format_name}:")
            for doi in sample:
                print(f"    {doi}")

analyze_doi_formats()


=== DOI FORMAT ANALYSIS ===
Total Full URLs: 668
Total DOIs (within Full URLs): 668
DOI format distribution:
  https://doi.org/    :  668 (100.0%)
  http://doi.org/     :    0 (0.0%)
  doi:                :    0 (0.0%)
  Other DOI format    :    0 (0.0%)

Sample DOIs by format:
  https://doi.org/:
    https://doi.org/10.17882/49388
    https://doi.org/10.5517/ccdc.csd.cc1npvt0
    https://doi.org/10.5517/ccdc.csd.cc24wxqp


In [11]:
def detailed_dataset_analysis():
    """Detailed analysis of dataset ID patterns"""
    print("=== DETAILED DATASET ID ANALYSIS ===")
    
    # Look at the 'Other' category more closely
    other_datasets = train_labels[train_labels['dataset_category'] == 'Other']['dataset_id']
    print(f"\nTotal 'Other' category datasets: {len(other_datasets)}")
    
    # Try to categorize the 'Other' datasets further
    epi_count = sum(other_datasets.str.startswith('EPI', na=False))
    prj_count = sum(other_datasets.str.startswith('PRJ', na=False))
    pdb_count = sum(other_datasets.str.startswith('PDB', na=False))
    chembl_count = sum(other_datasets.str.startswith('CHEMBL', na=False))
    gse_count = sum(other_datasets.str.startswith('GSE', na=False))
    cp_count = sum(other_datasets.str.startswith('CP', na=False))
    kx_count = sum(other_datasets.str.startswith('KX', na=False))
    d_count = sum(other_datasets.str.startswith('D', na=False))
    
    print("Breakdown of 'Other' category:")
    other_breakdown = {
        'EPI (GISAID)': epi_count,
        'PRJ (ENA)': prj_count,
        'PDB': pdb_count,
        'CHEMBL': chembl_count,
        'GSE (GEO)': gse_count,
        'CP (GenBank)': cp_count,
        'KX (GenBank)': kx_count,
        'D (Accession)': d_count
    }
    
    for category, count in other_breakdown.items():
        if count > 0:
            print(f"  {category:15s}: {count:3d} ({count/len(other_datasets)*100:.1f}%)")
    
    print("\nSample 'Other' datasets:")
    for i, dataset in enumerate(other_datasets.head(10), 1):
        print(f"  {i:2d}. {dataset}")

detailed_dataset_analysis()


=== DETAILED DATASET ID ANALYSIS ===

Total 'Other' category datasets: 303
Breakdown of 'Other' category:
  EPI (GISAID)   :  64 (21.1%)
  CP (GenBank)   :   3 (1.0%)
  KX (GenBank)   :   2 (0.7%)
  D (Accession)  :   1 (0.3%)

Sample 'Other' datasets:
   1. 5VA1
   2. IPR000884
   3. IPR001124
   4. IPR001577
   5. IPR002477
   6. IPR002889
   7. IPR004007
   8. IPR004302
   9. IPR004785
  10. IPR004911


In [12]:
def analyze_missing_type():
    """Analyze the 'Missing' type entries"""
    print("=== ANALYSIS OF 'MISSING' TYPE ENTRIES ===")
    missing_entries = train_labels[train_labels['type'] == 'Missing']
    
    print(f"Number of 'Missing' entries: {len(missing_entries)}")
    
    print("\nDataset IDs for 'Missing' entries:")
    for i, (_, row) in enumerate(missing_entries.head(10).iterrows(), 1):
        print(f"  {i:2d}. {row['dataset_id']}")
    
    # Check if these have a pattern
    missing_datasets = missing_entries['dataset_id']
    missing_categories = missing_datasets.apply(categorize_dataset_id)
    print("\nCategories of 'Missing' entries:")
    print(missing_categories.value_counts())

analyze_missing_type()


=== ANALYSIS OF 'MISSING' TYPE ENTRIES ===
Number of 'Missing' entries: 347

Dataset IDs for 'Missing' entries:
   1. https://doi.org/10.5517/ccdc.csd.cc1npvt0
   2. https://doi.org/10.5517/ccdc.csd.cc24wxqp
   3. https://doi.org/10.5517/ccdc.csd.cc24rrb0
   4. https://doi.org/10.5517/ccdc.csd.cc221dk3
   5. https://doi.org/10.5517/ccdc.csd.cc22c4yk
   6. https://doi.org/10.5517/ccdc.csd.cc24cjxz
   7. https://doi.org/10.5517/ccdc.csd.cc24nsk5
   8. https://doi.org/10.5517/ccdc.csd.cc24nsqb
   9. https://doi.org/10.5517/ccdc.csd.cc24d93z
  10. https://doi.org/10.5517/ccdc.csd.cc250wsw

Categories of 'Missing' entries:
dataset_id
Full URL    343
Other         4
Name: count, dtype: int64


<div style="font-family: Arial, sans-serif; background: linear-gradient(135deg, #0f5132, #145a44); color: #fff; padding: 20px; border-radius: 8px;">
  <h1 style="text-align: center; margin-top: 0;">📊  Data Summary</h1>

  <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 6px; margin-bottom: 15px;">
    <h2>📦 Dataset Overview</h2>
    <ul>
      <li><b>Shape:</b> 1066 rows × 3 columns</li>
      <li><b>Columns:</b> <code>article_id</code>, <code>dataset_id</code>, <code>type</code></li>
      <li><b>Missing Values:</b> None</li>
    </ul>
  </div>

  <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 6px; margin-bottom: 15px;">
    <h2>🧾 Citation Types</h2>
    <ul>
      <li>Secondary: <b>449</b> (42.1%)</li>
      <li>Missing: <b>347</b> (32.6%)</li>
      <li>Primary: <b>270</b> (25.3%)</li>
    </ul>
  </div>

  <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 6px; margin-bottom: 15px;">
    <h2>🔗 Dataset ID Categories</h2>
    <ul>
      <li>Full URL: <b>668</b> (62.7%) — all in <code>https://doi.org/</code> format</li>
      <li>Other: <b>303</b> (28.4%)</li>
      <li>ArrayExpress: 37 (3.5%)</li>
      <li>ChEMBL: 29 (2.7%)</li>
      <li>ENA: 26 (2.4%)</li>
      <li>GEO: 3 (0.3%)</li>
    </ul>
    <p><b>Other category breakdown:</b> EPI/GISAID 64, CP/GenBank 3, KX/GenBank 2, D/Accession 1</p>
  </div>

  <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 6px; margin-bottom: 15px;">
    <h2>📄 Article Stats</h2>
    <ul>
      <li>Total unique articles: <b>523</b></li>
      <li>Citations per article — Min: 1, Max: 32, Mean: 2.04, Median: 1</li>
      <li>Most cited: <code>10.3390_v11060565</code> (32 citations)</li>
    </ul>
  </div>

  <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 6px;">
    <h2>📂 Directory Info</h2>
    <ul>
      <li>Train dir files: 2 (XML + PDF, no extensions detected)</li>
      <li>Test dir files: 2 (XML + PDF, no extensions detected)</li>
    </ul>
  </div>
</div>
