In [1]:
# !pip install scikit-learn
import ast
import pandas as pd
import numpy as np
from bh24_literature_mining.machine_learning_tools import (
    convert_to_IOB_format_from_df,
    check_integrity_of_files,
)
from bh24_literature_mining.biotools import get_biotools
from bh24_literature_mining.europepmc_api import identify_tool_mentions_using_europepmc
from datetime import datetime

from sklearn.model_selection import train_test_split
from pathlib import Path

[nltk_data] Downloading package punkt to
[nltk_data]     /home/t.afanasyeva/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Fixed seed so the row shuffling done later in the notebook is reproducible
# across kernel restarts. (The previous `np.random.seed(None)` returned None,
# which seeded the generator from OS entropy and gave a new order every run.)
SEED = 42
random_state = np.random.RandomState(SEED)

# Directory the notebook is run from; input paths below are built relative to it.
p = Path.cwd()

In [5]:
# Read the annotated tool-mention sheet (CSV) into a DataFrame.
path_sheet_mentions = p.joinpath("data/annotated/251105_annotated.csv")
df = pd.read_csv(filepath_or_buffer=path_sheet_mentions)

In [6]:
# Print column names, non-null counts and dtypes of the loaded sheet.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7664 entries, 0 to 7663
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PMCID            7664 non-null   object 
 1   Sentence         7664 non-null   object 
 2   NER_Tags         7664 non-null   object 
 3   True?            7663 non-null   object 
 4   False?           7662 non-null   object 
 5   NER_Model_Found  5972 non-null   object 
 6   Topics           7605 non-null   object 
 7   Unnamed: 7       0 non-null      float64
 8   Unnamed: 8       0 non-null      float64
 9   Unnamed: 9       0 non-null      float64
 10  Checked?         7664 non-null   object 
dtypes: float64(3), object(8)
memory usage: 658.8+ KB


In [7]:
# Preview the first five rows of the loaded sheet.
df.head(5)

Unnamed: 0,PMCID,Sentence,NER_Tags,True?,False?,NER_Model_Found,Topics,Unnamed: 7,Unnamed: 8,Unnamed: 9,Checked?
0,PMC11127412,"Here, we present adaptations of MaBoSS [15, 16...","(32, 38, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 32-38,"Systems biology, Statistics and probability",,,,True
1,PMC11127412,"For optimizing the statistics aggregation, MaB...","(43, 49, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 43-49; MaBoSS (BT) at 143-149,"Systems biology, Statistics and probability",,,,True
2,PMC11127412,"For optimizing the statistics aggregation, MaB...","(143, 149, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 43-49; MaBoSS (BT) at 143-149,"Systems biology, Statistics and probability",,,,True
3,PMC11127412,MaBoSS was initially developed as a single-cor...,"(0, 6, 'MaBoSS', 'maboss')",True,False,MaBoSS (BT) at 0-6,"Systems biology, Statistics and probability",,,,True
4,PMC12084383,UPMaBoSS40 produces dynamic populations of int...,"(2, 8, 'MaBoSS', 'maboss')",True,False,UPMaBoSS40 (BT) at 0-10,"Systems biology, Statistics and probability",,,,True


In [8]:
# Boolean masks for the two review flags; missing flags compare as False.
marked_false = df["False?"].eq(True)
marked_true = df["True?"].eq(True)

# Rows flagged "False?" keep their sentence but lose their tag.
df.loc[marked_false, "NER_Tags"] = None

# Keep only rows that carry one of the two flags; unflagged rows are dropped.
true_checked_df = df[marked_true | marked_false]

len(true_checked_df)

2525

In [9]:
def _parse_tag(raw_tag):
    """Turn a stringified tag into a Python literal; pass non-strings through unchanged."""
    if isinstance(raw_tag, str):
        return ast.literal_eval(raw_tag)
    return raw_tag


def _collect_tags(tags):
    """Return the tags of one group as a list, leaving out the None entries."""
    return [tag for tag in tags if tag is not None]


# Only these three columns are needed from here on.
true_checked_df = true_checked_df[["PMCID", "Sentence", "NER_Tags"]]
true_checked_df["NER_Tags"] = true_checked_df["NER_Tags"].apply(_parse_tag)

# One row per (Sentence, PMCID) pair, holding the list of all its tags
# (an empty list when every tag of the pair was None).
tags_per_sentence = true_checked_df.groupby(["Sentence", "PMCID"])["NER_Tags"]
grouped_df = tags_per_sentence.apply(_collect_tags).reset_index()
grouped_df.head(5)

Unnamed: 0,Sentence,PMCID,NER_Tags
0,"Edited by: Qingqing Wei, Augusta University, ...",PMC12081422,[]
1,REAGENT or RESOURCESOURCEIDENTIFIERAntibodies...,PMC10066595,[]
2,"References Bińkowski J, Taryma-Leśniak O, Sok...",PMC11555854,"[(101, 106, eDAVE, edave)]"
3,StarPep toolbox is an open-source and user-fr...,PMC10469104,"[(1, 16, StarPep Toolbox, starpep)]"
4,Supplemental Figure S1CLIPPER 2.0 flowchart i...,PMC11192779,"[(23, 34, CLIPPER 2.0, clipper_2.0)]"


In [10]:
def _to_bt_spans(tags):
    """Rewrite each tag as [tag[0], tag[1], tag[2], "BT"]; an empty or missing tag list becomes None."""
    if not tags:
        return None
    spans = []
    for tag in tags:
        # Keep the first three fields of the tag and replace the rest with the "BT" label.
        spans.append([tag[0], tag[1], tag[2], "BT"])
    return spans


grouped_df["NER_Tags"] = grouped_df["NER_Tags"].apply(_to_bt_spans)
grouped_df.reset_index(inplace=True, drop=True)

In [11]:
# Avoid having the same PMCID in both the train and the test set.
# Sorting by PMCID keeps all rows of an article adjacent, so an ordered 80/20
# cut can split at most one article across the boundary.
grouped_df = grouped_df.sort_values(by="PMCID")

# shuffle=False keeps the sorted order. `random_state` has no effect without
# shuffling, so it is not passed.
train_df, test_df = train_test_split(grouped_df, test_size=0.2, shuffle=False)

# Rows on the test side whose PMCID already appears on the train side belong
# to the article that straddles the cut. Move them to train instead of
# discarding them, so no annotated sentence is lost and no PMCID is shared.
straddles_cut = test_df["PMCID"].isin(train_df["PMCID"])
train_df = pd.concat([train_df, test_df[straddles_cut]])
test_df = test_df[~straddles_cut]

Reshuffle the rows of each split using the `.sample` function.


In [12]:
def _drop_pmcid_and_shuffle(split_df):
    """Return split_df without its PMCID column, rows in random order, index renumbered from 0."""
    without_ids = split_df.drop(columns=["PMCID"])
    # frac=1 samples every row, i.e. a full reshuffle of the split.
    shuffled = without_ids.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


# Both calls draw from the same `random_state` object, so the order
# (train first, then test) is kept.
train_df = _drop_pmcid_and_shuffle(train_df)
test_df = _drop_pmcid_and_shuffle(test_df)

In [13]:
# Preview the last five rows of the shuffled training split.
train_df.tail(5)

Unnamed: 0,Sentence,NER_Tags
1775,The disgust reactions were automatically track...,"[[55, 65, DeepLabCut, BT]]"
1776,"However, upon further examination, we observed...","[[75, 83, Meta-SNP, BT]]"
1777,"For instance, discrete multirate clocks and lo...","[[91, 98, physher, BT]]"
1778,The Nucleic Acid InfraRed Data Bank (NAIRDB) s...,
1779,"To create Prophage-DB, we identified prophages...","[[368, 373, SPIRE, BT]]"


In [14]:
# The IOB files go under the parent of the current working directory.
path = Path.cwd().parent
output_folder = path.joinpath("data/IOB/")
# Create the folder (and any missing parents); do nothing if it already exists.
output_folder.mkdir(exist_ok=True, parents=True)

In [15]:
# Convert each split to IOB format under its own file name in output_folder.
# Train is processed first, then the held-out split (saved as the "dev" file).
for split_df, iob_filename in ((train_df, "train_IOB.tsv"), (test_df, "dev_IOB.tsv")):
    convert_to_IOB_format_from_df(split_df, output_folder, iob_filename)

Processing batches: 100%|██████████| 4/4 [00:00<00:00, 43.22it/s]
Processing batches: 100%|██████████| 1/1 [00:00<00:00, 43.00it/s]


In [16]:
# Paths of the two IOB files produced above, wrapped in single-element lists.
train_files = [output_folder.joinpath("train_IOB.tsv")]
dev_files = [output_folder.joinpath("dev_IOB.tsv")]

# The dev list is passed as both the second and the third argument.
# NOTE(review): presumably the third argument is a test set and dev stands in
# for it because this notebook produces no separate test split; confirm.
check_integrity_of_files(train_files, dev_files, dev_files)


Checking Dataset 1:
/home/t.afanasyeva/data/IOB/train_IOB.tsv is valid.
/home/t.afanasyeva/data/IOB/dev_IOB.tsv is valid.
/home/t.afanasyeva/data/IOB/dev_IOB.tsv is valid.
