In [71]:
# !pip install scikit-learn
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
from pathlib import Path

# Add the directory to sys.path
sys.path.append('../../biohackathon2024/src/bh24_literature_mining')

# Now import ml_tools
import ml_tools

In [72]:
# Load the data
p = Path().cwd().parent
path_sheet_mentions = p / 'data/annotated/250227mentions.csv'
path_sheet_mentions_extra = p / 'data/annotated/250227mentions_extra.csv'

# Read the sheet into a DataFrame
sheet_mentions = pd.read_csv(path_sheet_mentions)
sheet_mentions_extra = pd.read_csv(path_sheet_mentions_extra)


In [73]:
len(sheet_mentions), len(sheet_mentions_extra)

(2097, 1320)

In [74]:
# Combine the two sheets
df = pd.concat([sheet_mentions, sheet_mentions_extra], ignore_index=True)
len(df)

3417

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3417 entries, 0 to 3416
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   PMCID     3417 non-null   object
 1   Task for  2425 non-null   object
 2   Sentence  3417 non-null   object
 3   True?     3417 non-null   bool  
 4   False?    3417 non-null   bool  
 5   NER_Tags  3417 non-null   object
 6   Topics    3314 non-null   object
dtypes: bool(2), object(5)
memory usage: 140.3+ KB


In [76]:
# Display the DataFrame
df.head(5)

Unnamed: 0,PMCID,Task for,Sentence,True?,False?,NER_Tags,Topics
0,PMC11286849,Ana,The identified proteins also included 52 (70%)...,True,False,"(120, 129, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
1,PMC11286849,Ana,The localization of 52 of these 445 proteins i...,True,False,"(72, 81, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
2,PMC11286849,Ana,The 445 identified proteins were searched agai...,True,False,"(54, 63, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
3,PMC11458576,Ana,Putative nucleases were identified by searchin...,True,False,"(48, 57, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."
4,PMC11458576,Ana,Functional Enrichment Analysis (FEA) was perfo...,True,False,"(57, 66, 'SubtiWiki', 'subtiwiki')","Molecular interactions, pathways and networks,..."


In [77]:
df.loc[df['False?'] == True, 'NER_Tags'] = None
true_checked_df = df[(df['True?'] == True) | (df['False?'] == True)]

len(true_checked_df)


3376

In [78]:
true_checked_df[true_checked_df['False?'] == True]

Unnamed: 0,PMCID,Task for,Sentence,True?,False?,NER_Tags,Topics
37,PMC11473683,Ana,"Equally, 30/82 (37%) of SLiM-positive patients...",False,True,,"Ecology, Molecular interactions, pathways and ..."
38,PMC11473683,Ana,SLiM-CRAB-positive patients presented with eit...,False,True,,"Ecology, Molecular interactions, pathways and ..."
39,PMC11473683,Ana,43 patients had more than one MDE with 3 (7%) ...,False,True,,"Ecology, Molecular interactions, pathways and ..."
40,PMC11473683,Ana,43 patients had more than one MDE with 3 (7%) ...,False,True,,"Ecology, Molecular interactions, pathways and ..."
41,PMC11491431,Ana,KM curves of groups of OS from OS-all cohort (...,False,True,,"Ecology, Molecular interactions, pathways and ..."
...,...,...,...,...,...,...,...
3412,PMC11377344,,The proposed BLR model follows a Variational A...,False,True,,"Workflows, Sequencing, Sequence assembly, DNA ..."
3413,PMC11377344,,Additional details of the BLR model and VAE ap...,False,True,,"Workflows, Sequencing, Sequence assembly, DNA ..."
3414,PMC11299494,,GWAS on this panel identified 13 QTLs signific...,False,True,,"Workflows, Sequencing, Sequence assembly, DNA ..."
3415,PMC11299494,,(2018) assembled an international barley panel...,False,True,,"Workflows, Sequencing, Sequence assembly, DNA ..."


In [79]:
true_checked_df = true_checked_df[['PMCID', 'Sentence','NER_Tags']]

transformed_df = true_checked_df.rename(columns={'PMCID': 'PMCID',
    'Sentence': 'sentence',
    'NER_Tags': 'ner_ines'
})

transformed_df['ner_ines'] = transformed_df['ner_ines'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

transformed_df

Unnamed: 0,PMCID,sentence,ner_ines
0,PMC11286849,The identified proteins also included 52 (70%)...,"(120, 129, SubtiWiki, subtiwiki)"
1,PMC11286849,The localization of 52 of these 445 proteins i...,"(72, 81, SubtiWiki, subtiwiki)"
2,PMC11286849,The 445 identified proteins were searched agai...,"(54, 63, SubtiWiki, subtiwiki)"
3,PMC11458576,Putative nucleases were identified by searchin...,"(48, 57, SubtiWiki, subtiwiki)"
4,PMC11458576,Functional Enrichment Analysis (FEA) was perfo...,"(57, 66, SubtiWiki, subtiwiki)"
...,...,...,...
3412,PMC11377344,The proposed BLR model follows a Variational A...,
3413,PMC11377344,Additional details of the BLR model and VAE ap...,
3414,PMC11299494,GWAS on this panel identified 13 QTLs signific...,
3415,PMC11299494,(2018) assembled an international barley panel...,


In [80]:
# Group by 'sentence' and aggregate 'ner_ines'
grouped_df = transformed_df.groupby(['sentence', 'PMCID'])['ner_ines'].apply(lambda x: [i for i in x if i is not None]).reset_index()
grouped_df.head(5)


Unnamed: 0,sentence,PMCID,ner_ines
0,"Prana Jagannatha GN , Mendel B , Labi NPT...",PMC11317698,[]
1,"(A) Intersection of MAGMA, TWAS, PWAS in this...",PMC11443877,"[(21, 26, MAGMA, magma-pipeline)]"
2,"- ""We identify some limitations of MarkerScan...",PMC11016177,"[(36, 46, MarkerScan, markerscan), (128, 138, ..."
3,Algorithm 1A single iteration of the MaBoSS s...,PMC11127412,"[(38, 44, MaBoSS, maboss)]"
4,Author contributions: Conceptualization: Shuc...,PMC11340858,[]


In [None]:
grouped_df['ner_ines'] = grouped_df['ner_ines'].apply(
    lambda x: [[item[0], item[1], item[2], 'BT'] for item in x] if x else None
)
grouped_df.reset_index(drop=True, inplace=True)


Unnamed: 0,sentence,PMCID,ner_ines
0,"Prana Jagannatha GN , Mendel B , Labi NPT...",PMC11317698,
1,"(A) Intersection of MAGMA, TWAS, PWAS in this...",PMC11443877,"[[21, 26, MAGMA, BT]]"
2,"- ""We identify some limitations of MarkerScan...",PMC11016177,"[[36, 46, MarkerScan, BT], [128, 138, MarkerSc..."
3,Algorithm 1A single iteration of the MaBoSS s...,PMC11127412,"[[38, 44, MaBoSS, BT]]"
4,Author contributions: Conceptualization: Shuc...,PMC11340858,


In [92]:
# Avoid PMCDID leak
grouped_df.sort_values(by='PMCID', inplace=True)

# Split the data into train and test sets
train_df, test_df = train_test_split(grouped_df, test_size=0.2, random_state=42, shuffle=False)
print(len(test_df))

# Check if PMCID from train is in test, then drop those rows from test
test_df = test_df[~test_df['PMCID'].isin(train_df['PMCID'])]
print(len(test_df))


561
560


In [93]:
train_df = train_df.drop(columns=['PMCID']).sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.drop(columns=['PMCID']).sample(frac=1, random_state=42).reset_index(drop=True)

In [94]:
train_df.tail(5)

Unnamed: 0,sentence,ner_ines
2239,The CadA protein 3D structure model quality wa...,"[[63, 68, PyMOL, BT]]"
2240,We performed simulations to evaluate power and...,"[[63, 72, trans-PCO, BT]]"
2241,The exclusion of indels by bcftools also leads...,"[[27, 35, BCFtools, BT]]"
2242,The five genomes for benchmarking are availabl...,"[[78, 85, MerCat2, BT]]"
2243,Response: We appreciate the reviewer’s insight...,"[[73, 80, KinCytE, BT]]"


In [None]:
test_df.head()


In [None]:
# Example usage:
output_folder = '../data/IOB/'

# Convert train, dev, and test dataframes to IOB format
ml_tools.convert_to_IOB_format_from_df(train_df, output_folder, 'train_IOB.tsv')
ml_tools.convert_to_IOB_format_from_df(test_df, output_folder, 'dev_IOB.tsv')

In [None]:
# Example usage:
train_files = [output_folder+'train_IOB.tsv']
dev_files = [output_folder+'dev_IOB.tsv']

ml_tools.check_integrity_of_files(train_files, dev_files,dev_files)