In [6]:
import pandas as pd
import ast  
# Load the review-level table; 'PMID' and 'Candidate document' hold
# stringified Python lists that are parsed below with ast.literal_eval.
df = pd.read_csv(r'/home/ayushk/making-ms2-dataset/dataset/masked-background_PICO_0.15.csv')

# Emit one row per (PMID, candidate document) pair, copying every other
# column of the originating review row unchanged.
new_rows = []
for _, review in df.iterrows():
    # NOTE(review): zip stops at the shorter list, so a length mismatch
    # between the two columns silently drops the surplus entries.
    pairs = zip(
        ast.literal_eval(review['PMID']),
        ast.literal_eval(review['Candidate document']),
    )
    for single_pmid, single_doc in pairs:
        exploded = review.copy()
        exploded['PMID'] = single_pmid
        exploded['Candidate document'] = single_doc
        new_rows.append(exploded)

# The copied rows keep the index label of their source row; renumber them.
df_exploded = pd.DataFrame(new_rows)
df_exploded = df_exploded.reset_index(drop=True)

print(df_exploded.head())
df_exploded.to_csv(
    r'/home/ayushk/making-ms2-dataset/dataset/result-preprocessing/masked-background_PICO_0.15_seperated.csv',
    index=False,
)

   ReviewID      PMID                                  Masked Background  \
0  30760312  23984728   background despite significant progress in dr...   
1  30760312  27400308   background despite significant progress in dr...   
2  30760312   4230647   background despite significant progress in dr...   
3  19588356   8532025   background pulmonary arterial hypertension is...   
4  19588356  10790348   background pulmonary arterial hypertension is...   

                                  Candidate document  \
0  BACKGROUND Current therapies for pulmonary art...   
1  BACKGROUND High pulmonary vascular resistance ...   
2  Background Systematic Review s ( SRs ) of expe...   
3  BACKGROUND Primary pulmonary hypertension is a...   
4  BACKGROUND Patients with precapillary pulmonar...   

                                              Target  \
0  Conclusions SC therapy is effective for PAH in...   
1  Conclusions SC therapy is effective for PAH in...   
2  Conclusions SC therapy is effective

<strong>Now compute the ROUGE-1, ROUGE-2 and ROUGE-L recall and precision (R1_r, R1_p, R2_r, R2_p, Rl_r and Rl_p) of each candidate document against its target.</strong>

In [None]:
import pandas as pd
from rouge import Rouge 
def get_rouge(candidate, reference):
    """Return ROUGE recall and precision of `candidate` against `reference`.

    Parameters
    ----------
    candidate : str
        Hypothesis text passed as the first argument to `Rouge.get_scores`.
    reference : str
        Reference text passed as the second argument to `Rouge.get_scores`.

    Returns
    -------
    dict
        Keys 'R1_r', 'R1_p', 'R2_r', 'R2_p', 'Rl_r' and 'Rl_p'. The key is
        built from the last character of each metric name returned by the
        scorer plus the statistic name; the 'f' statistic is dropped.
        Every value is None when scoring fails.
    """
    rouge = Rouge()
    try:
        scores = rouge.get_scores(candidate, reference, avg=True)
        return {
            f"R{k1[-1]}_{k2}": v2
            for k1, v1 in scores.items()
            for k2, v2 in v1.items()
            if k2 != 'f'
        }
    except Exception:
        # Best-effort scoring: a failing pair (presumably an empty or
        # non-string text) yields None for every metric instead of aborting
        # the whole run. The keys are spelled out because `scores` is not
        # bound when get_scores itself raised.
        return {
            f"R{metric}_{stat}": None
            for metric in ('1', '2', 'l')
            for stat in ('r', 'p')
        }
# Score every candidate document against the target of its row.
row_scores = [
    get_rouge(record['Candidate document'], record['Target'])
    for _, record in df_exploded.iterrows()
]

# Split the per-row score dicts into one list per metric; a key that is
# absent from a dict becomes None.
r1_r = [entry.get('R1_r', None) for entry in row_scores]
r1_p = [entry.get('R1_p', None) for entry in row_scores]
r2_r = [entry.get('R2_r', None) for entry in row_scores]
r2_p = [entry.get('R2_p', None) for entry in row_scores]
rl_r = [entry.get('Rl_r', None) for entry in row_scores]
rl_p = [entry.get('Rl_p', None) for entry in row_scores]

# Attach the metric columns in the same order as before.
for column_name, column_values in (
    ('R1_r', r1_r),
    ('R1_p', r1_p),
    ('R2_r', r2_r),
    ('R2_p', r2_p),
    ('Rl_r', rl_r),
    ('Rl_p', rl_p),
):
    df_exploded[column_name] = column_values

df_exploded.to_csv(
    r'/home/ayushk/making-ms2-dataset/dataset/result-preprocessing/validation__PICO_0.15_value_seperated_rouge_score.csv',
    index=False,
)


In [8]:
# Remove the 'input_txt' column from df_exploded in place.
# NOTE(review): no visible cell creates 'input_txt'; presumably it comes from
# the source CSV - confirm. Re-running this cell raises KeyError once the
# column is gone.
df_exploded.drop(columns='input_txt', inplace=True)

In [10]:
def cal_score(row, lamb=0.15):
    """Combine the ROUGE-1 and ROUGE-2 F1 values of one row into a score.

    Parameters
    ----------
    row : mapping
        Must provide 'R1_p', 'R1_r', 'R2_p' and 'R2_r' (precision and recall).
    lamb : float, default 0.15
        Weight applied to the ROUGE-1 F1 value.

    Returns
    -------
    number
        ``lamb * F1(R1) + F1(R2)``, where an F1 value is 0 when precision
        and recall sum to 0.
    """
    def _f1(prefix):
        # Harmonic mean of precision and recall, guarded against 0/0.
        denominator = row[f'{prefix}_p'] + row[f'{prefix}_r']
        if denominator == 0:
            return 0
        return 2*(row[f'{prefix}_p']*row[f'{prefix}_r']) / denominator

    rouge1_f1 = _f1('R1')
    rouge2_f1 = _f1('R2')
    return (lamb*rouge1_f1) + rouge2_f1
# Score each row with cal_score (default lamb) and store it as 'final_score'.
final_scores = df_exploded.apply(cal_score, axis=1)
df_exploded['final_score'] = final_scores

df_exploded.to_csv(
    '/home/ayushk/making-ms2-dataset/dataset/result-preprocessing/masked-background_PICO_0.15_final.csv',
    index=False,
)

In [11]:
# Remove the intermediate ROUGE recall/precision columns in a single call, so
# a missing column raises before anything is dropped instead of leaving the
# frame partially modified.
df_exploded.drop(
    columns=['R1_r', 'R2_r', 'R1_p', 'R2_p', 'Rl_p', 'Rl_r'],
    inplace=True,
)

In [12]:
# Write the current df_exploded to the "final" CSV without the index column.
df_exploded.to_csv(
    path_or_buf='/home/ayushk/making-ms2-dataset/dataset/result-preprocessing/masked-background_PICO_0.15_final.csv',
    index=False,
)

In [13]:
# Display the first rows of df_exploded as the cell output.
df_exploded.head()

Unnamed: 0,ReviewID,PMID,Masked Background,Candidate document,Target,final_score
0,30760312,23984728,background despite significant progress in dr...,BACKGROUND Current therapies for pulmonary art...,Conclusions SC therapy is effective for PAH in...,0.01399
1,30760312,27400308,background despite significant progress in dr...,BACKGROUND High pulmonary vascular resistance ...,Conclusions SC therapy is effective for PAH in...,0.015341
2,30760312,4230647,background despite significant progress in dr...,Background Systematic Review s ( SRs ) of expe...,Conclusions SC therapy is effective for PAH in...,0.045759
3,19588356,8532025,background pulmonary arterial hypertension is...,BACKGROUND Primary pulmonary hypertension is a...,There was a trend for endothelin receptor anta...,0.068713
4,19588356,10790348,background pulmonary arterial hypertension is...,BACKGROUND Patients with precapillary pulmonar...,There was a trend for endothelin receptor anta...,0.064015


In [14]:
import pandas as pd
import nltk
from transformers import AutoTokenizer

# Fetch the NLTK 'punkt' data package (a no-op when it is already up to date).
nltk.download('punkt')  

# Load the 'bert-base-uncased' tokenizer.
# NOTE(review): `tokenizer` is not referenced by any other visible cell -
# confirm whether it is still needed.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
def combine_text(row):
    """Build the '[CLS]...[SEP]...' input string for one row.

    Parameters
    ----------
    row : mapping
        Must provide 'Masked Background' and 'Candidate document'.

    Returns
    -------
    str
        '[CLS]' + masked background + '[SEP]' + the candidate document's
        sentences (split with nltk.sent_tokenize) joined by '[SEP]' + '[SEP]'.
    """
    background = row['Masked Background']
    document = row['Candidate document']

    # Separate consecutive sentences of the candidate document with '[SEP]'.
    joined_sentences = '[SEP]'.join(nltk.sent_tokenize(document))

    return "[CLS]{}[SEP]{}[SEP]".format(background, joined_sentences)

# Build the combined input string for every row and store it as 'input'.
combined_inputs = df_exploded.apply(combine_text, axis=1)
df_exploded['input'] = combined_inputs

print(df_exploded.head())
df_exploded.to_csv(
    '/home/ayushk/making-ms2-dataset/dataset/result-preprocessing/PICO_0.15_training.csv',
    index=False,
)


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ayushk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   ReviewID      PMID                                  Masked Background  \
0  30760312  23984728   background despite significant progress in dr...   
1  30760312  27400308   background despite significant progress in dr...   
2  30760312   4230647   background despite significant progress in dr...   
3  19588356   8532025   background pulmonary arterial hypertension is...   
4  19588356  10790348   background pulmonary arterial hypertension is...   

                                  Candidate document  \
0  BACKGROUND Current therapies for pulmonary art...   
1  BACKGROUND High pulmonary vascular resistance ...   
2  Background Systematic Review s ( SRs ) of expe...   
3  BACKGROUND Primary pulmonary hypertension is a...   
4  BACKGROUND Patients with precapillary pulmonar...   

                                              Target  final_score  \
0  Conclusions SC therapy is effective for PAH in...     0.013990   
1  Conclusions SC therapy is effective for PAH in...     0.015341   