In [5]:
# !pip install scikit-learn
import ast
import pandas as pd
import numpy as np
from bh24_literature_mining.machine_learning_tools import (
    convert_to_IOB_format_from_df,
    check_integrity_of_files,
)
from bh24_literature_mining.biotools import get_biotools
from bh24_literature_mining.europepmc_api import identify_tool_mentions_using_europepmc
from datetime import datetime

from sklearn.model_selection import train_test_split
from pathlib import Path

In [6]:
# Fixed seed so every shuffle driven by `random_state` is reproducible after a
# kernel restart. The previous `np.random.seed(None)` call returned None, which
# seeded the generator from OS entropy and made each run produce different files.
SEED = 42
random_state = np.random.RandomState(SEED)
# Base directory for the input data: the current working directory.
p = Path().cwd()

In [7]:
# biotools = get_biotools("biotoolspub/biotoolspub_with_topic.tsv")

# tool_occurrences_df = identify_tool_mentions_using_europepmc(
#     biotools[0:2000], article_limit=3
# )
# p_out = p / "data"
# current_date = datetime.now().strftime("%y%m%d")
# tool_occurrences_df.to_csv(
#     p_out / f"{current_date}_mentions_with_topics.csv", index=False
# )

Some of the data was annotated to be used for training. biotools[0:100] were used for annotation as of 07-08-2025 ("data/annotated/250227mentions.csv" and "data/annotated/250227mentions_extra.csv"), and biotools[100:2000] for further model testing and additional annotation (annotation in progress as of 07-08-2025).

In [8]:
# Location of the annotated mentions sheet, resolved against the base directory `p`.
path_sheet_mentions = p.joinpath("data/annotated/251105_annotated.csv")
# Read the whole sheet into a DataFrame with pandas' default CSV settings.
df = pd.read_csv(filepath_or_buffer=path_sheet_mentions)

In [9]:
# Print column names, non-null counts, and dtypes of the loaded annotation sheet.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7664 entries, 0 to 7663
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PMCID            7664 non-null   object 
 1   Sentence         7664 non-null   object 
 2   NER_Tags         7664 non-null   object 
 3   True?            7663 non-null   object 
 4   False?           7662 non-null   object 
 5   NER_Model_Found  5972 non-null   object 
 6   Topics           7605 non-null   object 
 7   Unnamed: 7       0 non-null      float64
 8   Unnamed: 8       0 non-null      float64
 9   Unnamed: 9       0 non-null      float64
 10  Checked?         7664 non-null   object 
dtypes: float64(3), object(8)
memory usage: 658.8+ KB


In [10]:
# Preview the first five rows of the loaded annotation sheet.
df.head(5)

Unnamed: 0,PMCID,Sentence,NER_Tags,True?,False?,NER_Model_Found,Topics,Unnamed: 7,Unnamed: 8,Unnamed: 9,Checked?
0,PMC11127412,"Here, we present adaptations of MaBoSS [15, 16...","(32, 38, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 32-38,"Systems biology, Statistics and probability",,,,True
1,PMC11127412,"For optimizing the statistics aggregation, MaB...","(43, 49, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 43-49; MaBoSS (BT) at 143-149,"Systems biology, Statistics and probability",,,,True
2,PMC11127412,"For optimizing the statistics aggregation, MaB...","(143, 149, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 43-49; MaBoSS (BT) at 143-149,"Systems biology, Statistics and probability",,,,True
3,PMC11127412,MaBoSS was initially developed as a single-cor...,"(0, 6, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 0-6,"Systems biology, Statistics and probability",,,,True
4,PMC12084383,UPMaBoSS40 produces dynamic populations of int...,"(2, 8, 'MaBoSS', 'maboss')",True,False,UPMaBoSS40 (BT) at 0-10,"Systems biology, Statistics and probability",,,,True


In [11]:
# Rows marked "False?" get their NER_Tags cleared so they carry no tag, then only
# rows that were marked either "True?" or "False?" are kept.
is_marked_true = df["True?"].eq(True)
is_marked_false = df["False?"].eq(True)

df.loc[is_marked_false, "NER_Tags"] = None
true_checked_df = df[is_marked_true | is_marked_false]

len(true_checked_df)

2525

In [12]:
# Keep only the columns needed from here on. `.copy()` yields an independent frame,
# so the column assignment below does not write into a slice of `df`
# (which would raise SettingWithCopyWarning).
true_checked_df = true_checked_df[["PMCID", "Sentence", "NER_Tags"]].copy()

# NER_Tags arrives from the CSV as the string form of a tuple, e.g.
# "(32, 38, 'MaBoSS', 'maboss')"; parse strings back into Python objects and
# leave non-string values (cleared tags) untouched.
true_checked_df["NER_Tags"] = true_checked_df["NER_Tags"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Collect all tags of the same sentence within the same article into one list.
# Filtering by type drops cleared tags whether pandas stored them as None or as
# NaN, so a sentence without any valid tag ends up with an empty list.
grouped_df = (
    true_checked_df.groupby(["Sentence", "PMCID"])["NER_Tags"]
    .apply(lambda tags: [tag for tag in tags if isinstance(tag, (tuple, list))])
    .reset_index()
)
grouped_df.head(5)

Unnamed: 0,Sentence,PMCID,NER_Tags
0,"Edited by: Qingqing Wei, Augusta University, ...",PMC12081422,[]
1,REAGENT or RESOURCESOURCEIDENTIFIERAntibodies...,PMC10066595,[]
2,"References Bińkowski J, Taryma-Leśniak O, Sok...",PMC11555854,"[(101, 106, eDAVE, edave)]"
3,StarPep toolbox is an open-source and user-fr...,PMC10469104,"[(1, 16, StarPep Toolbox, starpep)]"
4,Supplemental Figure S1CLIPPER 2.0 flowchart i...,PMC11192779,"[(23, 34, CLIPPER 2.0, clipper_2.0)]"


In [13]:
def _as_bt_spans(tags):
    """Rewrite each tag as [tag[0], tag[1], tag[2], "BT"]; return None if `tags` is empty."""
    if not tags:
        return None
    return [[tag[0], tag[1], tag[2], "BT"] for tag in tags]


grouped_df["NER_Tags"] = grouped_df["NER_Tags"].apply(_as_bt_spans)
# Replace the index with a fresh 0..n-1 range, discarding the old one.
grouped_df.reset_index(inplace=True, drop=True)

In [14]:
# Split the sentences into train / validation / test (0.6 / 0.2 / 0.2 before
# boundary clean-up) such that no PMCID occurs in more than one split.
#
# Sorting by PMCID and splitting without shuffling keeps each article's sentences
# contiguous, so only the article sitting on a split boundary can straddle two
# splits. `random_state` is not passed because it has no effect with shuffle=False.
grouped_df = grouped_df.sort_values(by="PMCID")

train_df, val_test_df = train_test_split(grouped_df, test_size=0.4, shuffle=False)
# Drop held-out sentences whose article is already part of the training split.
val_test_df = val_test_df[~val_test_df["PMCID"].isin(train_df["PMCID"])]

val_df, test_df = train_test_split(val_test_df, test_size=0.5, shuffle=False)
# Drop test sentences whose article is already part of the validation split.
# (The previous filter compared against train_df again, which was a no-op and
# allowed the boundary article to appear in both validation and test.)
test_df = test_df[~test_df["PMCID"].isin(val_df["PMCID"])]

# Guard against leakage: every pair of splits must have disjoint PMCIDs.
assert set(train_df["PMCID"]).isdisjoint(val_df["PMCID"]), "PMCID shared by train/val"
assert set(train_df["PMCID"]).isdisjoint(test_df["PMCID"]), "PMCID shared by train/test"
assert set(val_df["PMCID"]).isdisjoint(test_df["PMCID"]), "PMCID shared by val/test"

In [15]:
def _drop_ids_and_shuffle(split_df):
    """Return `split_df` without the PMCID column, rows shuffled, index reset.

    `errors="ignore"` makes the call safe when PMCID was already removed, so
    re-running this cell does not raise a KeyError. Rows are shuffled with the
    notebook-level `random_state` generator.
    """
    return (
        split_df.drop(columns=["PMCID"], errors="ignore")
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )


# Apply the same clean-up to all three splits. Previously val_df was skipped and
# kept its PMCID column and PMCID-sorted order, unlike train_df and test_df.
train_df = _drop_ids_and_shuffle(train_df)
val_df = _drop_ids_and_shuffle(val_df)
test_df = _drop_ids_and_shuffle(test_df)

In [16]:
# Show the last five rows of train_df.
train_df.tail(5)

Unnamed: 0,Sentence,NER_Tags
1330,"However, as a result of extensive cytokine rel...",
1331,Imputed genotypes for 635 animals were accesse...,"[[53, 61, GLIMPSE2, BT]]"
1332,"The results predicted by DeepProSite (a), the ...","[[25, 36, DeepProSite, BT]]"
1333,elegans strain with available WGS data in CaeN...,"[[42, 48, CaeNDR, BT]]"
1334,"In contrast, Ursa stands out by encompassing a...","[[13, 17, Ursa, BT]]"


In [17]:
# Output files are written under the parent of the current working directory.
path = Path.cwd().parent
output_folder = path.joinpath("data/IOB/")
# Create the folder together with any missing parents; do nothing if it exists.
output_folder.mkdir(exist_ok=True, parents=True)

In [None]:
# Call convert_to_IOB_format_from_df once per split, in train / dev / test order,
# each with its own target file name inside output_folder.
for split_df, file_name in (
    (train_df, "train_IOB.tsv"),
    (val_df, "dev_IOB.tsv"),
    (test_df, "test_IOB.tsv"),
):
    convert_to_IOB_format_from_df(split_df, output_folder, file_name)

Processing batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Build one single-element list of paths per split and hand all three to
# check_integrity_of_files.
train_files, dev_files, test_files = (
    [output_folder / file_name]
    for file_name in ("train_IOB.tsv", "dev_IOB.tsv", "test_IOB.tsv")
)

check_integrity_of_files(train_files, dev_files, test_files)


Checking Dataset 1:
/root/biohackathon2024/data/IOB/train_IOB.tsv is valid.
/root/biohackathon2024/data/IOB/dev_IOB.tsv is valid.
/root/biohackathon2024/data/IOB/dev_IOB.tsv is valid.
