In [1]:
# !pip install scikit-learn
import ast
import pandas as pd
import numpy as np
from bh24_literature_mining.machine_learning_tools import (convert_to_IOB_format_from_df, 
                                                           check_integrity_of_files)
from sklearn.model_selection import train_test_split
from pathlib import Path

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Shared random generator for the shuffles further down.
# The previous `np.random.RandomState(np.random.seed(None))` was equivalent to
# `RandomState(None)` (np.random.seed() returns None), i.e. seeded from OS
# entropy, so every run produced a different row order. A fixed seed keeps
# the notebook reproducible under Restart & Run All.
SEED = 42
random_state = np.random.RandomState(SEED)

In [3]:
# Locate the two annotated mention sheets relative to the repository root
# (the parent of the current working directory).
p = Path.cwd().parent
path_sheet_mentions = p / "data/annotated/250227mentions.csv"
path_sheet_mentions_extra = p / "data/annotated/250227mentions_extra.csv"

# Read both sheets and stack them into one frame with a fresh 0..n-1 index.
sheet_mentions, sheet_mentions_extra = (
    pd.read_csv(csv_path)
    for csv_path in (path_sheet_mentions, path_sheet_mentions_extra)
)
df = pd.concat([sheet_mentions, sheet_mentions_extra], ignore_index=True)

In [4]:
# Print column dtypes and non-null counts for the combined annotation frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3417 entries, 0 to 3416
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   PMCID     3417 non-null   object
 1   Task for  2425 non-null   object
 2   Sentence  3417 non-null   object
 3   True?     3417 non-null   bool  
 4   False?    3417 non-null   bool  
 5   NER_Tags  3417 non-null   object
 6   Topics    3314 non-null   object
dtypes: bool(2), object(5)
memory usage: 140.3+ KB


In [5]:
# Preview the first five rows of the combined annotation frame.
df.head()

Unnamed: 0,PMCID,Task for,Sentence,True?,False?,NER_Tags,Topics
0,PMC11286849,Ana,The identified proteins also included 52 (70%)...,True,False,"(120, 129, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
1,PMC11286849,Ana,The localization of 52 of these 445 proteins i...,True,False,"(72, 81, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
2,PMC11286849,Ana,The 445 identified proteins were searched agai...,True,False,"(54, 63, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
3,PMC11458576,Ana,Putative nucleases were identified by searchin...,True,False,"(48, 57, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
4,PMC11458576,Ana,Functional Enrichment Analysis (FEA) was perfo...,True,False,"(57, 66, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."


In [6]:
# Keep only rows where at least one of the "True?" / "False?" boxes is ticked
# (rows with both unchecked are excluded). Rows ticked "False?" have their
# tags cleared first, so they carry no entities from here on.
is_true = df["True?"] == True
is_false = df["False?"] == True
df.loc[is_false, "NER_Tags"] = None
true_checked_df = df[is_true | is_false]

len(true_checked_df)

3376

In [7]:
# Keep only the columns needed downstream. `.copy()` yields an independent
# frame, so the column assignment below does not write to a slice of `df`
# (chained assignment, which can raise SettingWithCopyWarning).
true_checked_df = true_checked_df[["PMCID", "Sentence", "NER_Tags"]].copy()

# Tags are stored in the CSV as the string form of a Python tuple; parse them
# back into tuples. Non-string values (cleared tags) are passed through.
true_checked_df["NER_Tags"] = true_checked_df["NER_Tags"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Collect all tags of the same sentence (within the same article) into one
# list. `dropna()` discards both None and NaN, so sentences without any tag
# end up with an empty list rather than a list holding a missing value.
grouped_df = (
    true_checked_df.groupby(["Sentence", "PMCID"])["NER_Tags"]
    .apply(lambda tags: tags.dropna().tolist())
    .reset_index()
)
grouped_df.head(5)

Unnamed: 0,Sentence,PMCID,NER_Tags
0,"Prana Jagannatha GN , Mendel B , Labi NPT...",PMC11317698,[]
1,"(A) Intersection of MAGMA, TWAS, PWAS in this...",PMC11443877,"[(21, 26, MAGMA, magma-pipeline)]"
2,"- ""We identify some limitations of MarkerScan...",PMC11016177,"[(36, 46, MarkerScan, markerscan), (128, 138, ..."
3,Algorithm 1A single iteration of the MaBoSS s...,PMC11127412,"[(38, 44, MaBoSS, maboss)]"
4,Author contributions: Conceptualization: Shuc...,PMC11340858,[]


In [8]:
def _to_bt_spans(tags):
    """Turn a list of tags into [first, second, third, "BT"] lists.

    The first three fields of every tag are kept and the fixed label "BT" is
    appended. An empty (or otherwise falsy) input yields None.
    """
    if not tags:
        return None
    return [[tag[0], tag[1], tag[2], "BT"] for tag in tags]


grouped_df["NER_Tags"] = grouped_df["NER_Tags"].apply(_to_bt_spans)
grouped_df = grouped_df.reset_index(drop=True)

In [9]:
# Avoid PMCID duplicates in both train and test sets.
# Sorting by PMCID makes each article's sentences contiguous, so the
# positional 80/20 cut below (shuffle=False) can split at most one article.
# `random_state` is not passed: it has no effect when shuffle=False.
grouped_df = grouped_df.sort_values(by="PMCID")
train_df, test_df = train_test_split(grouped_df, test_size=0.2, shuffle=False)

# Sentences of the article that straddles the cut are moved to the training
# set (their PMCID is already there) instead of being discarded. The test set
# is the same as before; the training set simply keeps the extra rows.
straddles_cut = test_df["PMCID"].isin(train_df["PMCID"])
train_df = pd.concat([train_df, test_df[straddles_cut]])
test_df = test_df[~straddles_cut]

# Invariants: no article on both sides, and no sentence lost by the split.
assert set(train_df["PMCID"]).isdisjoint(test_df["PMCID"])
assert len(train_df) + len(test_df) == len(grouped_df)

In [10]:
def _shuffled_without_pmcid(frame):
    """Drop the PMCID column, shuffle all rows, and renumber the index.

    The shuffle draws from the notebook-level `random_state`, so the order in
    which this helper is called determines which random numbers each frame
    receives.
    """
    without_id = frame.drop(columns=["PMCID"])
    shuffled = without_id.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


# Train first, then test: same order of draws from `random_state` as before.
train_df = _shuffled_without_pmcid(train_df)
test_df = _shuffled_without_pmcid(test_df)

In [11]:
# Preview the last five rows of the shuffled training split.
train_df.tail()

Unnamed: 0,Sentence,NER_Tags
2239,By incorporating several multi-omics integrati...,"[[65, 75, DriverDBv4, BT]]"
2240,vcfdist segments contigs into independent supe...,"[[113, 118, BiWFA, BT]]"
2241,Some of the rodent-borne hantaviruses have bee...,
2242,The EPIK-P1 study did not collect data on spec...,
2243,"In rnaCrosslinkOO, an adjacency matrix is crea...","[[3, 17, rnaCrosslinkOO, BT]]"


In [12]:
# Output directory for the generated IOB files, relative to the parent of the
# current working directory. Created (with parents) if it does not exist yet.
path = Path.cwd().parent
output_folder = path.joinpath("data/IOB/")
output_folder.mkdir(parents=True, exist_ok=True)

In [13]:
# Write each split to its own IOB file in the output folder
# (training split first, then the held-out split as the dev file).
for split_df, filename in ((train_df, "train_IOB.tsv"), (test_df, "dev_IOB.tsv")):
    convert_to_IOB_format_from_df(split_df, output_folder, filename)

Processing batches: 100%|██████████| 5/5 [00:00<00:00, 66.83it/s]
Processing batches: 100%|██████████| 2/2 [00:00<00:00, 181.57it/s]


In [14]:
# Each argument is a one-element list holding the path of a generated file.
train_files, dev_files = (
    [output_folder / filename] for filename in ("train_IOB.tsv", "dev_IOB.tsv")
)

# NOTE(review): the dev files are passed a second time as the third argument,
# presumably the test-file slot, since no separate test split is written by
# this notebook. Confirm against check_integrity_of_files.
check_integrity_of_files(train_files, dev_files, dev_files)


Checking Dataset 1:
/root/biohackathon2024/data/IOB/train_IOB.tsv is valid.
/root/biohackathon2024/data/IOB/dev_IOB.tsv is valid.
/root/biohackathon2024/data/IOB/dev_IOB.tsv is valid.
