In [1]:
pip install pandas lxml xgboost scikit-learn pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import numpy as np
import re
from lxml import etree
import fitz  # PyMuPDF
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# === Paths ===
DATA_DIR = "/kaggle/input/make-data-count-finding-data-references"
TRAIN_LABELS = f"{DATA_DIR}/train_labels.csv"
TRAIN_DIR = f"{DATA_DIR}/train"
TEST_DIR = f"{DATA_DIR}/test"

# === Load labels ===
labels = pd.read_csv(TRAIN_LABELS)

# === Parse XML ===
def parse_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        return ' '.join(tree.xpath('//text()'))
    except:
        return ""

# === Parse PDF ===
def parse_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    except:
        return ""

# === Feature extraction ===
def extract_features(row, base_dir):
    article_id = row['article_id']
    xml_path = os.path.join(base_dir, f"{article_id}.xml")
    pdf_path = os.path.join(base_dir, f"{article_id}.pdf")
    
    xml_text = parse_xml(xml_path)
    pdf_text = parse_pdf(pdf_path)
    full_text = (xml_text or "") + "\n" + (pdf_text or "")
    
    features = {
        "has_doi": int(bool(re.search(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', full_text, re.I))),
        "has_accession": int(bool(re.search(r'\b[A-Z]{2,}_?\d{4,}\b', full_text))),
        "has_url": int(bool(re.search(r'https?://\S+', full_text))),
        "downloaded_from": int("downloaded from" in full_text.lower()),
        "numeric_count": len(re.findall(r'\d{4,}', full_text)),
        "mention_count": len(re.findall(r'dataset|data set|accession|10\.', full_text.lower()))
    }
    return pd.Series(features)

# === Apply feature extraction ===
features_df = labels.apply(lambda row: extract_features(row, TRAIN_DIR), axis=1)
data = pd.concat([labels, features_df], axis=1)
data['target'] = data['type'].map({'Primary': 0, 'Secondary': 1})
data['target'] = data['type'].map({'Primary': 0, 'Secondary': 1})
data = data.dropna(subset=['target'])
data['target'] = data['target'].astype(int)

# === Model training ===
X = data.drop(columns=['article_id', 'dataset_id', 'type', 'target'])
y = data['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred, target_names=['Primary', 'Secondary']))

# === Prepare test set ===
test_files = os.listdir(TEST_DIR)
test_articles = sorted(set([f.split('.')[0] for f in test_files]))
test_df = pd.DataFrame([(aid, '') for aid in test_articles], columns=['article_id', 'dataset_id'])

test_features = test_df.apply(lambda row: extract_features(row, TEST_DIR), axis=1)
test_X = test_features

# === Predict on test ===
test_preds = model.predict(test_X)
test_df['type'] = ['Primary' if p == 0 else 'Secondary' for p in test_preds]
test_df['row_id'] = test_df.index

# === Save submission ===
submission = test_df[['row_id', 'article_id', 'dataset_id', 'type']]
submission.to_csv("submission.csv", index=False)

              precision    recall  f1-score   support

     Primary       0.00      0.00      0.00        43
   Secondary       0.70      1.00      0.82       101

    accuracy                           0.70       144
   macro avg       0.35      0.50      0.41       144
weighted avg       0.49      0.70      0.58       144



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
