Heuristic Model for training dataset

In [1]:
import pandas as pd

# Load dataset_mentions_train.csv (the training-set mentions file).
# NOTE(review): the frame is still named test_df even though it holds the training file.
test_df = pd.read_csv("dataset_mentions_train.csv")

def normalize_doi(doi):
    """
    Return ``doi`` as a canonical DOI URL of the form ``https://doi.org/<doi>``.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace, then:

    * a leading ``http(s)://doi.org/`` or ``http(s)://dx.doi.org/`` resolver
      prefix (matched case-insensitively) is replaced by ``https://doi.org/``;
    * a bare DOI (``10.…``), optionally introduced by a ``doi:`` label, gets
      ``https://doi.org/`` prepended;
    * anything else (e.g. a non-DOI dataset_id) is returned stripped but
      otherwise unchanged.

    Only the leading prefix is rewritten, so text inside the DOI suffix is
    never altered.
    """
    doi = str(doi).strip()

    # Resolver URLs: slice off the matched prefix instead of using
    # str.replace, which would also rewrite any "http://" that happens to
    # occur later inside the identifier.
    for resolver in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    ):
        if doi[:len(resolver)].lower() == resolver:
            return "https://doi.org/" + doi[len(resolver):]

    # Bare DOI, with or without a "doi:" label in front of it.
    label = "doi:"
    bare = doi[len(label):].lstrip() if doi[:len(label)].lower() == label else doi
    if bare.startswith("10."):
        return "https://doi.org/" + bare

    return doi  # fallback for non-DOI dataset_id

def get_prefix(doi):
    """
    Return the text preceding the first "/" in ``doi``.

    For a bare DOI such as ``10.5061/dryad.abc`` this is the prefix
    (``10.5061``). Returns ``None`` when ``doi`` contains no slash.
    """
    # Guard clause: nothing to split on, so there is no prefix to report.
    if "/" not in doi:
        return None
    prefix, _remainder = doi.split("/", 1)
    return prefix

# Bring every dataset_id into the canonical DOI-URL form
test_df = test_df.assign(dataset_id=test_df["dataset_id"].map(normalize_doi))

# Derive the two prefixes that get compared: the part of article_id before
# its first underscore, and the part of the bare DOI before its first slash
test_df = test_df.assign(
    article_prefix=test_df["article_id"].map(lambda article_id: article_id.split("_")[0]),
    dataset_prefix=test_df["dataset_id"].map(
        lambda dataset_id: get_prefix(dataset_id.replace("https://doi.org/", ""))
    ),
)

# Label a mention "Primary" when both prefixes match, "Secondary" otherwise
test_df["type"] = (
    test_df["article_prefix"]
    .eq(test_df["dataset_prefix"])
    .map({True: "Primary", False: "Secondary"})
)

# Expose the positional index as an explicit row_id column
test_df = test_df.reset_index().rename(columns={"index": "row_id"})

# Keep only the submission columns and write them out
submission_df = test_df.loc[:, ["row_id", "article_id", "dataset_id", "type"]]
submission_df.to_csv("submission_train.csv", index=False)

print("✅ Submission file saved as 'submission_train.csv'")


✅ Submission file saved as 'submission_train.csv'


Heuristic Model for test dataset (Submission)

In [2]:
import pandas as pd

# Load dataset_mentions_test.csv (path is relative to the working directory)
test_df = pd.read_csv("dataset_mentions_test.csv")

def normalize_doi(doi):
    """
    Return ``doi`` as a canonical DOI URL of the form ``https://doi.org/<doi>``.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace, then:

    * a leading ``http(s)://doi.org/`` or ``http(s)://dx.doi.org/`` resolver
      prefix (matched case-insensitively) is replaced by ``https://doi.org/``;
    * a bare DOI (``10.…``), optionally introduced by a ``doi:`` label, gets
      ``https://doi.org/`` prepended;
    * anything else (e.g. a non-DOI dataset_id) is returned stripped but
      otherwise unchanged.

    Only the leading prefix is rewritten, so text inside the DOI suffix is
    never altered.
    """
    doi = str(doi).strip()

    # Resolver URLs: slice off the matched prefix instead of using
    # str.replace, which would also rewrite any "http://" that happens to
    # occur later inside the identifier.
    for resolver in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    ):
        if doi[:len(resolver)].lower() == resolver:
            return "https://doi.org/" + doi[len(resolver):]

    # Bare DOI, with or without a "doi:" label in front of it.
    label = "doi:"
    bare = doi[len(label):].lstrip() if doi[:len(label)].lower() == label else doi
    if bare.startswith("10."):
        return "https://doi.org/" + bare

    return doi  # fallback for non-DOI dataset_id

def get_prefix(doi):
    """
    Return the text preceding the first "/" in ``doi``.

    For a bare DOI such as ``10.5061/dryad.abc`` this is the prefix
    (``10.5061``). Returns ``None`` when ``doi`` contains no slash.
    """
    # Guard clause: nothing to split on, so there is no prefix to report.
    if "/" not in doi:
        return None
    prefix, _remainder = doi.split("/", 1)
    return prefix

# Bring every dataset_id into the canonical DOI-URL form
test_df = test_df.assign(dataset_id=test_df["dataset_id"].map(normalize_doi))

# Derive the two prefixes that get compared: the part of article_id before
# its first underscore, and the part of the bare DOI before its first slash
test_df = test_df.assign(
    article_prefix=test_df["article_id"].map(lambda article_id: article_id.split("_")[0]),
    dataset_prefix=test_df["dataset_id"].map(
        lambda dataset_id: get_prefix(dataset_id.replace("https://doi.org/", ""))
    ),
)

# Label a mention "Primary" when both prefixes match, "Secondary" otherwise
test_df["type"] = (
    test_df["article_prefix"]
    .eq(test_df["dataset_prefix"])
    .map({True: "Primary", False: "Secondary"})
)

# Expose the positional index as an explicit row_id column
test_df = test_df.reset_index().rename(columns={"index": "row_id"})

# Keep only the submission columns and write them out
submission_df = test_df.loc[:, ["row_id", "article_id", "dataset_id", "type"]]
submission_df.to_csv("submission.csv", index=False)

print("✅ Submission file saved as 'submission.csv'")


✅ Submission file saved as 'submission.csv'
