In [1]:
import pandas as pd
import numpy as np

# Load training labels
train_labels = pd.read_csv("/kaggle/input/make-data-count-finding-data-references/train_labels.csv")

print("=== TRAINING DATA OVERVIEW ===")
print(f"Shape: {train_labels.shape}")
print(f"Rows: {len(train_labels)}, Columns: {len(train_labels.columns)}")
print()

print("=== COLUMN NAMES ===")
print(train_labels.columns.tolist())
print()

print("=== FIRST 5 ROWS ===")
print(train_labels.head())
print()

print("=== DATA TYPES ===")
print(train_labels.dtypes)
print()

print("=== MISSING VALUES ===")
print(train_labels.isnull().sum())


=== TRAINING DATA OVERVIEW ===
Shape: (1066, 3)
Rows: 1066, Columns: 3

=== COLUMN NAMES ===
['article_id', 'dataset_id', 'type']

=== FIRST 5 ROWS ===
               article_id                                 dataset_id     type
0    10.1002_2017jc013030             https://doi.org/10.17882/49388  Primary
1  10.1002_anie.201916483  https://doi.org/10.5517/ccdc.csd.cc1npvt0  Missing
2  10.1002_anie.202005531  https://doi.org/10.5517/ccdc.csd.cc24wxqp  Missing
3  10.1002_anie.202007717  https://doi.org/10.5517/ccdc.csd.cc24rrb0  Missing
4  10.1002_chem.201902131  https://doi.org/10.5517/ccdc.csd.cc221dk3  Missing

=== DATA TYPES ===
article_id    object
dataset_id    object
type          object
dtype: object

=== MISSING VALUES ===
article_id    0
dataset_id    0
type          0
dtype: int64


In [2]:
# Let's examine the target variables
print("=== CITATION TYPES ===")
print(train_labels['type'].value_counts())
print()
print("Type distribution:")
print(train_labels['type'].value_counts(normalize=True))
print()

print("=== DATASET ID PATTERNS ===")
print(f"Total unique datasets: {train_labels['dataset_id'].nunique()}")
print("Most common datasets:")
print(train_labels['dataset_id'].value_counts().head(10))
print()

print("=== ARTICLE ID PATTERNS ===")
print(f"Total unique articles: {train_labels['article_id'].nunique()}")
print("Articles with most citations:")
print(train_labels['article_id'].value_counts().head(10))


=== CITATION TYPES ===
type
Secondary    449
Missing      347
Primary      270
Name: count, dtype: int64

Type distribution:
type
Secondary    0.421201
Missing      0.325516
Primary      0.253283
Name: proportion, dtype: float64

=== DATASET ID PATTERNS ===
Total unique datasets: 1066
Most common datasets:
dataset_id
https://doi.org/10.17882/49388           1
PRJNA167259                              1
https://doi.org/10.5061/dryad.m4r46      1
https://doi.org/10.5061/dryad.27m63      1
https://doi.org/10.5061/dryad.27m63.1    1
https://doi.org/10.5061/dryad.27m63.2    1
https://doi.org/10.5061/dryad.kh186      1
CP013147                                 1
PRJNA10687                               1
PRJNA16146                               1
Name: count, dtype: int64

=== ARTICLE ID PATTERNS ===
Total unique articles: 523
Articles with most citations:
article_id
10.3390_v11060565               32
10.1038_s41396-020-00885-8      31
10.1128_spectrum.00422-24       29
10.1371_journal.pone.01

In [3]:
# Let's categorize the types of dataset IDs we see
def categorize_dataset_id(dataset_id):
    """Categorize dataset IDs by their pattern"""
    if pd.isna(dataset_id):
        return "Missing"
    
    dataset_id = str(dataset_id)
    
    if dataset_id.startswith('http'):
        return "Full URL"
    elif dataset_id.startswith('doi:') or 'doi.org' in dataset_id or dataset_id.startswith('10.'):
        return "DOI"
    elif dataset_id.startswith('GSE'):
        return "GEO (Gene Expression Omnibus)"
    elif dataset_id.startswith('E-'):
        return "ArrayExpress"
    elif dataset_id.startswith('PRJ'):
        return "ENA (European Nucleotide Archive)"
    elif dataset_id.startswith('PDB'):
        return "PDB (Protein Data Bank)"
    elif dataset_id.startswith('CHEMBL'):
        return "ChEMBL"
    elif 'zenodo' in dataset_id.lower():
        return "Zenodo"
    elif 'figshare' in dataset_id.lower():
        return "Figshare"
    elif 'dryad' in dataset_id.lower():
        return "Dryad"
    else:
        return "Other"

# Apply categorization
train_labels['dataset_category'] = train_labels['dataset_id'].apply(categorize_dataset_id)

print("=== DATASET ID CATEGORIES ===")
category_counts = train_labels['dataset_category'].value_counts()
for category, count in category_counts.items():
    print(f"{category:30s}: {count:4d} ({count/len(train_labels)*100:.1f}%)")


=== DATASET ID CATEGORIES ===
Full URL                      :  668 (62.7%)
Other                         :  303 (28.4%)
ArrayExpress                  :   37 (3.5%)
ChEMBL                        :   29 (2.7%)
ENA (European Nucleotide Archive):   26 (2.4%)
GEO (Gene Expression Omnibus) :    3 (0.3%)


In [4]:
# Let's also examine article IDs
print("\n=== ARTICLE ID FORMATS ===")
sample_articles = train_labels['article_id'].sample(10, random_state=42)
print("Sample article IDs:")
for i, article in enumerate(sample_articles, 1):
    print(f"{i:2d}. {article}")



=== ARTICLE ID FORMATS ===
Sample article IDs:
 1. 10.1002_mp.14424
 2. 10.1590_1809-6891v21e-43578
 3. 10.1128_spectrum.00422-24
 4. 10.7554_elife.62329
 5. 10.1590_1679-78255326
 6. 10.1371_journal.pone.0220399
 7. 10.1186_s13007-019-0403-2
 8. 10.1186_s12977-015-0204-2
 9. 10.1029_2020jf005675
10. 10.1128_spectrum.00422-24


In [5]:
import os

train_dir = "/kaggle/input/make-data-count-finding-data-references/train"
test_dir = "/kaggle/input/make-data-count-finding-data-references/test"

print("=== TRAINING DIRECTORY ===")
train_files = os.listdir(train_dir)
print(f"Total files in train directory: {len(train_files)}")
print("Sample files:")
for i, file in enumerate(train_files[:10], 1):
    print(f"{i:2d}. {file}")

print("\n=== TEST DIRECTORY ===")
test_files = os.listdir(test_dir)
print(f"Total files in test directory: {len(test_files)}")
print("Sample files:")
for i, file in enumerate(test_files[:10], 1):
    print(f"{i:2d}. {file}")

# Check file extensions
train_extensions = [os.path.splitext(f)[1] for f in train_files]
test_extensions = [os.path.splitext(f)[1] for f in test_files]

print(f"\nTrain file extensions: {set(train_extensions)}")
print(f"Test file extensions: {set(test_extensions)}")


=== TRAINING DIRECTORY ===
Total files in train directory: 2
Sample files:
 1. XML
 2. PDF

=== TEST DIRECTORY ===
Total files in test directory: 2
Sample files:
 1. XML
 2. PDF

Train file extensions: {''}
Test file extensions: {''}


In [6]:
# How many citations per article?
citations_per_article = train_labels['article_id'].value_counts()
print("=== CITATIONS PER ARTICLE ===")
print(f"Min citations per article: {citations_per_article.min()}")
print(f"Max citations per article: {citations_per_article.max()}")
print(f"Mean citations per article: {citations_per_article.mean():.2f}")
print(f"Median citations per article: {citations_per_article.median():.2f}")

print("\nArticles with most citations:")
print(citations_per_article.head(10))

print("\nDistribution of citations per article:")
print(citations_per_article.value_counts().sort_index().head(10))


=== CITATIONS PER ARTICLE ===
Min citations per article: 1
Max citations per article: 32
Mean citations per article: 2.04
Median citations per article: 1.00

Articles with most citations:
article_id
10.3390_v11060565               32
10.1038_s41396-020-00885-8      31
10.1128_spectrum.00422-24       29
10.1371_journal.pone.0159387    27
10.1371_journal.pone.0212669    25
10.7717_peerj.10452             25
10.1111_cas.12935               23
10.1128_JVI.01717-21            22
10.1371_journal.pcbi.1011828    22
10.1038_s41598-020-59839-x      20
Name: count, dtype: int64

Distribution of citations per article:
count
1     421
2      52
3      15
4       7
5       2
6       1
7       2
8       1
10      2
11      1
Name: count, dtype: int64


In [7]:
# How often is each dataset cited?
dataset_citation_counts = train_labels['dataset_id'].value_counts()
print("=== DATASET REUSE PATTERNS ===")
print(f"Datasets cited once: {sum(dataset_citation_counts == 1)}")
print(f"Datasets cited multiple times: {sum(dataset_citation_counts > 1)}")
print(f"Most cited datasets:")
print(dataset_citation_counts.head(10))

# Check if the same dataset can have different types in different papers
dataset_type_combinations = train_labels.groupby('dataset_id')['type'].nunique()
multi_type_datasets = dataset_type_combinations[dataset_type_combinations > 1]
print(f"\nDatasets that appear with different types: {len(multi_type_datasets)}")
if len(multi_type_datasets) > 0:
    print("Examples:")
    for dataset in multi_type_datasets.index[:5]:
        types = train_labels[train_labels['dataset_id'] == dataset]['type'].unique()
        print(f"  {dataset}: {list(types)}")


=== DATASET REUSE PATTERNS ===
Datasets cited once: 1066
Datasets cited multiple times: 0
Most cited datasets:
dataset_id
https://doi.org/10.17882/49388           1
PRJNA167259                              1
https://doi.org/10.5061/dryad.m4r46      1
https://doi.org/10.5061/dryad.27m63      1
https://doi.org/10.5061/dryad.27m63.1    1
https://doi.org/10.5061/dryad.27m63.2    1
https://doi.org/10.5061/dryad.kh186      1
CP013147                                 1
PRJNA10687                               1
PRJNA16146                               1
Name: count, dtype: int64

Datasets that appear with different types: 0


Let’s now examine a specific PDF to understand the content:

In [9]:
import pymupdf

def examine_pdf_content(article_id):
    """Examine the content of a PDF file"""
    pdf_path = f"/kaggle/input/make-data-count-finding-data-references/train/PDF/{article_id}.pdf"
    
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return
    
    try:
        doc = pymupdf.open(pdf_path)
        print(f"=== PDF CONTENT FOR {article_id} ===")
        print(f"Number of pages: {len(doc)}")
        
        # Get text from first few pages
        full_text = ""
        for page_num in range(min(3, len(doc))):  # First 3 pages max
            page = doc[page_num]
            full_text += page.get_text()
        
        print(f"Total text length: {len(full_text)} characters")
        print("\nFirst 1000 characters:")
        print("=" * 50)
        print(full_text[:1000])
        print("=" * 50)
        
        # Look for citations in this paper
        paper_citations = train_labels[train_labels['article_id'] == article_id]
        print(f"\nKnown citations for this paper ({len(paper_citations)}):")
        for _, row in paper_citations.iterrows():
            print(f"  - {row['dataset_id']} ({row['type']})")
        
        doc.close()
        
    except Exception as e:
        print(f"Error reading PDF {article_id}: {e}")

# Examine a sample article
if len(train_labels) > 0:
    sample_article = train_labels['article_id'].iloc[0]
    examine_pdf_content(sample_article)
    
    # Examine another article with multiple citations
    article_with_most_citations = train_labels['article_id'].value_counts().index[0]
    if article_with_most_citations != sample_article:
        print("\n" + "="*60)
        examine_pdf_content(article_with_most_citations)


=== PDF CONTENT FOR 10.1002_2017jc013030 ===
Number of pages: 22
Total text length: 14559 characters

First 1000 characters:
RESEARCH ARTICLE
10.1002/2017JC013030
Assessing the Variability in the Relationship Between the
Particulate Backscattering Coefficient and the Chlorophyll a
Concentration From a Global Biogeochemical-Argo Database
Marie Barbieux1
, Julia Uitz1, Annick Bricaud1, Emanuele Organelli1,2
, Antoine Poteau1
,
Catherine Schmechtig3
, Bernard Gentili1, Grigor Obolensky4, Edouard Leymarie1
,
Christophe Penkerc’h1, Fabrizio D’Ortenzio1
, and Herve Claustre1
1Sorbonne Universites, UPMC Univ Paris 06, CNRS, Observatoire Oceanologique de Villefranche, Laboratoire
d’Oceanographie de Villefranche, Villefranche-sur-Mer, France, 2Plymouth Marine Laboratory, Prospect Place, The Hoe,
Plymouth, United Kingdom, 3OSU Ecce Terra, UMS 3455, CNRS and Universite Pierre et Marie Curie, Paris 6, Paris, France,
4ERIC Euro-Argo, 29280 Plouzane, France
Abstract Characterizing phytoplankto