In [1]:
import pandas as pd
import ast  
df = pd.read_csv(r'/home/ayushk/making-ms2-dataset/dataset/new_validation_PICO_0.15.csv')


def _expand_review_row(review_row):
    """Yield one copy of ``review_row`` per (PMID, candidate document) pair.

    The 'PMID' and 'Candidate document' cells hold Python list literals stored
    as strings; they are parsed with ``ast.literal_eval`` and paired with
    ``zip``, so pairing stops at the shorter of the two lists.
    """
    id_list = ast.literal_eval(review_row['PMID'])
    doc_list = ast.literal_eval(review_row['Candidate document'])
    for single_id, single_doc in zip(id_list, doc_list):
        expanded = review_row.copy()
        expanded['PMID'] = single_id
        expanded['Candidate document'] = single_doc
        yield expanded


# Flatten every review row into its per-document rows, preserving input order.
new_rows = [
    expanded
    for _, review_row in df.iterrows()
    for expanded in _expand_review_row(review_row)
]

df_exploded = pd.DataFrame(new_rows).reset_index(drop=True)

print(df_exploded.head())
df_exploded.to_csv(r'/home/ayushk/making-ms2-dataset/dataset/validation-preprocessing/validation_PICO_0.15_seperated.csv', index=False)

   ReviewID      PMID                                         Background  \
0  28514886   4274545  Necrotizing enterocolitis ( NEC ) is one of th...   
1  28514886  17460489  Necrotizing enterocolitis ( NEC ) is one of th...   
2  28514886   3113290  Necrotizing enterocolitis ( NEC ) is one of th...   
3  18842808  15614200  BACKGROUND Several clinical trials have invest...   
4  18842808  18041436  BACKGROUND Several clinical trials have invest...   

                                  Candidate document  \
0  Background : Necrotizing enterocolitis ( NEC )...   
1  Background : Premature birth results in a dela...   
2  There is some evidence that early colonization...   
3  BACKGROUND Fiber supplements added to a calori...   
4  OBJECTIVE To evaluate the benefits of glucoman...   

                                   Masked Background  \
0  necrotizing enterocolitis nec is one of the mo...   
1  necrotizing enterocolitis nec is one of the mo...   
2  necrotizing enterocolitis nec is on

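<strong>Next, compute the ROUGE-1, ROUGE-2 and ROUGE-L recall and precision scores (r1_r, r1_p, r2_r, r2_p, rl_r and rl_p) of each candidate document against its target.</strong>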

In [2]:
import pandas as pd
from rouge import Rouge 
def get_rouge(candidate, reference):
    """Compute ROUGE recall and precision for one candidate/reference pair.

    Args:
        candidate: Text passed as the first argument to ``Rouge.get_scores``.
        reference: Text passed as the second argument to ``Rouge.get_scores``.

    Returns:
        dict: Keys are ``R<x>_<stat>``, where ``<x>`` is the last character of
        each metric name returned by the library and ``<stat>`` is every
        per-metric entry except ``'f'``. The caller reads ``R1_r``, ``R1_p``,
        ``R2_r``, ``R2_p``, ``Rl_r`` and ``Rl_p``. If scoring fails for any
        reason, exactly those six keys are returned, each mapped to ``None``.
    """
    rouge = Rouge()
    try:
        # avg='avg' is passed through unchanged from the original call
        # (presumably any truthy value requests averaged scores; confirm
        # against the rouge package docs).
        scores = rouge.get_scores(candidate, reference, avg='avg')
        return {
            f"R{k1[-1]}_{k2}": v2
            for k1, v1 in scores.items()
            for k2, v2 in v1.items()
            if k2 != 'f'
        }
    except Exception:
        # Best-effort fallback: `scores` may never have been bound here, so
        # the placeholder result is built from the fixed key set instead of
        # from the failed call's output.
        return {
            f"R{metric}_{stat}": None
            for metric in ('1', '2', 'l')
            for stat in ('r', 'p')
        }
# Score each row's 'Candidate document' against its 'Target' text, in row order.
row_scores = [
    get_rouge(record['Candidate document'], record['Target'])
    for _, record in df_exploded.iterrows()
]

# One list per metric; a key missing from a row's score dict becomes None.
r1_r = [entry.get('R1_r', None) for entry in row_scores]
r1_p = [entry.get('R1_p', None) for entry in row_scores]
r2_r = [entry.get('R2_r', None) for entry in row_scores]
r2_p = [entry.get('R2_p', None) for entry in row_scores]
rl_r = [entry.get('Rl_r', None) for entry in row_scores]
rl_p = [entry.get('Rl_p', None) for entry in row_scores]

# Attach the metric lists as new columns, in the same order as before.
for column_name, column_values in (
    ('R1_r', r1_r),
    ('R1_p', r1_p),
    ('R2_r', r2_r),
    ('R2_p', r2_p),
    ('Rl_r', rl_r),
    ('Rl_p', rl_p),
):
    df_exploded[column_name] = column_values

df_exploded.to_csv(r'/home/ayushk/making-ms2-dataset/dataset/validation-preprocessing/validation_PICO_0.15_seperated_rouge_score.csv', index=False)


In [4]:
def cal_score(row, lamb=0.15):
    """Combine the ROUGE-1 and ROUGE-2 F1 values of a row into one score.

    Args:
        row: Mapping-like object indexable by 'R1_p', 'R1_r', 'R2_p', 'R2_r'.
        lamb: Weight applied to the ROUGE-1 F1 term.

    Returns:
        ``lamb * F1(R1) + F1(R2)``, where each F1 is ``2*p*r / (p + r)`` and is
        taken as 0 when ``p + r == 0``.
    """
    def _f1(metric):
        # Harmonic mean of precision and recall for one metric prefix.
        precision = row[f'{metric}_p']
        recall = row[f'{metric}_r']
        total = precision + recall
        if total == 0:
            return 0
        return 2*(precision*recall) / total

    r1_f1 = _f1('R1')
    r2_f1 = _f1('R2')
    return (lamb*r1_f1) + r2_f1
# Row-wise combined score (cal_score with its default lamb), stored as a new column.
final_scores = df_exploded.apply(cal_score, axis=1)
df_exploded['final_score'] = final_scores

df_exploded.to_csv(r'/home/ayushk/making-ms2-dataset/dataset/validation-preprocessing/validation_PICO_0.15_0.15_final.csv',index=False)

In [5]:
# Remove the intermediate ROUGE precision/recall columns (final_score was
# derived from the R1/R2 ones in the previous cell). A single call with
# errors='ignore' lets this cell be re-run without raising KeyError once the
# columns are already gone; inplace=True keeps mutating the same DataFrame
# object, as the original six calls did.
df_exploded.drop(
    columns=['R1_r', 'R2_r', 'R1_p', 'R2_p', 'Rl_p', 'Rl_r'],
    errors='ignore',
    inplace=True,
)

In [6]:
# Write the current df_exploded to the masked-background CSV, omitting the index.
# In notebook order this runs after the cell that drops the ROUGE columns.
df_exploded.to_csv('/home/ayushk/making-ms2-dataset/dataset/validation-preprocessing/masked-background_PICO_0.15_final.csv',index=False)

In [7]:
# Show the first rows of df_exploded as this cell's output.
df_exploded.head()

Unnamed: 0,ReviewID,PMID,Background,Candidate document,Masked Background,Target,final_score
0,28514886,4274545,Necrotizing enterocolitis ( NEC ) is one of th...,Background : Necrotizing enterocolitis ( NEC )...,necrotizing enterocolitis nec is one of the mo...,Current evidence from systematic review and me...,0.067314
1,28514886,17460489,Necrotizing enterocolitis ( NEC ) is one of th...,Background : Premature birth results in a dela...,necrotizing enterocolitis nec is one of the mo...,Current evidence from systematic review and me...,0.040107
2,28514886,3113290,Necrotizing enterocolitis ( NEC ) is one of th...,There is some evidence that early colonization...,necrotizing enterocolitis nec is one of the mo...,Current evidence from systematic review and me...,0.053074
3,18842808,15614200,BACKGROUND Several clinical trials have invest...,BACKGROUND Fiber supplements added to a calori...,background several clinical trials have invest...,The use of glucomannan did not appear to signi...,0.043907
4,18842808,18041436,BACKGROUND Several clinical trials have invest...,OBJECTIVE To evaluate the benefits of glucoman...,background several clinical trials have invest...,The use of glucomannan did not appear to signi...,0.050837


In [8]:
import pandas as pd
import nltk
from transformers import AutoTokenizer

# Download the NLTK 'punkt' resource; combine_text below calls
# nltk.sent_tokenize, which presumably relies on it — confirm against NLTK docs.
nltk.download('punkt')  

# Load the bert-base-uncased tokenizer. NOTE(review): `tokenizer` is not
# referenced by any cell visible here — confirm it is used later or remove it.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
def combine_text(row):
    """Build the marker-delimited input string for one row.

    The result is ``[CLS]`` + ``row['Masked Background']`` + ``[SEP]`` followed
    by the sentences of ``row['Candidate document']`` (split with
    ``nltk.sent_tokenize``) joined by ``[SEP]``, plus a trailing ``[SEP]``.
    No whitespace is inserted around the markers.
    """
    background = row['Masked Background']
    sentences = nltk.sent_tokenize(row['Candidate document'])

    # The f-string formats the background exactly as the original did.
    prefix = f"[CLS]{background}[SEP]"
    return prefix + '[SEP]'.join(sentences) + "[SEP]"

# Build the marker-delimited input string for every row and store it as 'input'.
combined_inputs = df_exploded.apply(combine_text, axis=1)
df_exploded['input'] = combined_inputs

print(df_exploded.head())
df_exploded.to_csv('/home/ayushk/making-ms2-dataset/dataset/validation-preprocessing/PICO_0.15_validation.csv', index=False)  


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ayushk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   ReviewID      PMID                                         Background  \
0  28514886   4274545  Necrotizing enterocolitis ( NEC ) is one of th...   
1  28514886  17460489  Necrotizing enterocolitis ( NEC ) is one of th...   
2  28514886   3113290  Necrotizing enterocolitis ( NEC ) is one of th...   
3  18842808  15614200  BACKGROUND Several clinical trials have invest...   
4  18842808  18041436  BACKGROUND Several clinical trials have invest...   

                                  Candidate document  \
0  Background : Necrotizing enterocolitis ( NEC )...   
1  Background : Premature birth results in a dela...   
2  There is some evidence that early colonization...   
3  BACKGROUND Fiber supplements added to a calori...   
4  OBJECTIVE To evaluate the benefits of glucoman...   

                                   Masked Background  \
0  necrotizing enterocolitis nec is one of the mo...   
1  necrotizing enterocolitis nec is one of the mo...   
2  necrotizing enterocolitis nec is on