In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt models that sent_tokenize relies on.
nltk.download('punkt')

INPUT_CSV = r'C:\Users\ayush\OneDrive\Desktop\preprocessing\PICO_0.15_validation.csv'
# Columns the sentence-explosion loop below reads from every input row.
REQUIRED_COLUMNS = ['ReviewID', 'PMID', 'Candidate document', 'Masked Background', 'Target']

df = pd.read_csv(INPUT_CSV)

# Later cells in this notebook write the sentence-level frame back to this very
# path, after which 'Candidate document' no longer exists. Fail early with an
# explicit message instead of an opaque KeyError from inside the loop.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"{INPUT_CSV} is missing required column(s) {missing_columns}; "
        "the file may already have been overwritten by the sentence-level "
        "output of a previous run."
    )

# Explode every candidate document into one record per sentence.
sentence_records = []
for _, row in df.iterrows():
    for j, sent in enumerate(sent_tokenize(row['Candidate document'])):
        if not sent:
            continue
        sentence_records.append({
            'ReviewID': row['ReviewID'],
            'PMID': row['PMID'],
            # <ReviewID>-<PMID>-<sentence index, zero-padded to 2 digits>
            'sent_id': f"{row['ReviewID']}-{row['PMID']}-{str(j).zfill(2)}",
            'docs_sent': sent,
            'Masked Background': row['Masked Background'],
            'Target': row['Target'],
            # Background and sentence joined with [CLS]/[SEP] marker strings.
            'input': f"[CLS]{row['Masked Background']}[SEP]{sent}[SEP]",
        })

df_exploded = pd.DataFrame(sentence_records).reset_index(drop=True)

print(df_exploded.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   ReviewID     PMID              sent_id  \
0  28514886  4274545  28514886-4274545-00   
1  28514886  4274545  28514886-4274545-01   
2  28514886  4274545  28514886-4274545-02   
3  28514886  4274545  28514886-4274545-03   
4  28514886  4274545  28514886-4274545-04   

                                           docs_sent  \
0  Background : Necrotizing enterocolitis ( NEC )...   
1  Supplementation with enteral prebiotics may re...   
2  Therefore , we compared the efficacy and safet...   
3  Methods : In a single-center r and omized cont...   
4  The incidence of suspected NEC , feeding intol...   

                                   Masked Background  \
0  necrotizing enterocolitis nec is one of the mo...   
1  necrotizing enterocolitis nec is one of the mo...   
2  necrotizing enterocolitis nec is one of the mo...   
3  necrotizing enterocolitis nec is one of the mo...   
4  necrotizing enterocolitis nec is one of the mo...   

                                              Target  \

In [2]:
import pandas as pd
from rouge import Rouge 
def get_rouge(candidate, reference):
    """Score ``candidate`` against ``reference`` with ROUGE, keeping recall and precision.

    Args:
        candidate: Hypothesis text to be scored.
        reference: Reference text the candidate is compared with.

    Returns:
        A flat dict keyed ``R<x>_<metric>``, where ``<x>`` is the last character
        of each metric name returned by ``Rouge.get_scores`` and ``<metric>`` is
        every sub-score except ``'f'``. If scoring raises, the exception is
        printed and the dict maps ``R1_r``, ``R1_p``, ``R2_r``, ``R2_p``,
        ``Rl_r``, ``Rl_p`` to ``None`` so callers can keep going.
    """
    try:
        # `avg` is a boolean flag; the averaged form returns one dict per metric
        # name instead of a list with one entry per sentence pair.
        averaged = Rouge().get_scores(candidate, reference, avg=True)
        return {
            f"R{name[-1]}_{metric}": value
            for name, per_metric in averaged.items()
            for metric, value in per_metric.items()
            if metric != 'f'
        }
    except Exception as e:
        # Best-effort by design: one bad sentence must not abort the whole run.
        print('Exception:', e)
        return {
            f"R{kind}_{metric}": None
            for kind in ('1', '2', 'l')
            for metric in ('r', 'p')
        }

# ROUGE columns appended to df_exploded, in the order they are added.
rouge_columns = ['R1_r', 'R1_p', 'R2_r', 'R2_p', 'Rl_r', 'Rl_p']
rouge_values = {column: [] for column in rouge_columns}

for _, sentence_row in df_exploded.iterrows():
    # Progress trace: one line per scored sentence.
    print(sentence_row['ReviewID'], sentence_row['sent_id'])
    sentence_scores = get_rouge(sentence_row['docs_sent'], sentence_row['Target'])
    for column in rouge_columns:
        # A metric missing from the result is recorded as None.
        rouge_values[column].append(sentence_scores.get(column, None))

for column in rouge_columns:
    df_exploded[column] = rouge_values[column]

# NOTE(review): this is the same path the first cell reads its input from, so
# the raw file is replaced by the sentence-level frame once this line runs.
df_exploded.to_csv(r'C:\Users\ayush\OneDrive\Desktop\preprocessing\PICO_0.15_validation.csv', index=False)

28514886 28514886-4274545-00
28514886 28514886-4274545-01
28514886 28514886-4274545-02
28514886 28514886-4274545-03
28514886 28514886-4274545-04
28514886 28514886-4274545-05
28514886 28514886-4274545-06
28514886 28514886-4274545-07
28514886 28514886-4274545-08
28514886 28514886-4274545-09
28514886 28514886-4274545-10
28514886 28514886-17460489-00
28514886 28514886-17460489-01
28514886 28514886-17460489-02
28514886 28514886-17460489-03
28514886 28514886-17460489-04
28514886 28514886-17460489-05
28514886 28514886-17460489-06
28514886 28514886-17460489-07
28514886 28514886-17460489-08
28514886 28514886-17460489-09
28514886 28514886-17460489-10
28514886 28514886-3113290-00
28514886 28514886-3113290-01
28514886 28514886-3113290-02
28514886 28514886-3113290-03
28514886 28514886-3113290-04
28514886 28514886-3113290-05
28514886 28514886-3113290-06
28514886 28514886-3113290-07
28514886 28514886-3113290-08
18842808 18842808-15614200-00
18842808 18842808-15614200-01
18842808 18842808-15614200-02


In [3]:
def cal_score(row, lamb=0.15):
    """Combine ROUGE-1 and ROUGE-2 F1 into a single score.

    For each of R1 and R2 the F1 is ``2 * p * r / (p + r)``, taken as 0 when
    ``p + r`` is 0. The result is ``lamb * F1(R1) + F1(R2)``.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            ``R1_p``, ``R1_r``, ``R2_p`` and ``R2_r``.
        lamb: Weight applied to the ROUGE-1 F1 term.

    Returns:
        The combined score. NaN when a required metric is missing: NaN inputs
        propagate through the arithmetic, and ``None`` inputs (what
        ``get_rouge`` yields on failure) are mapped to NaN rather than raising
        ``TypeError``.
    """
    f1_score = {}
    for r in ['R1', 'R2']:
        precision = row[f'{r}_p']
        recall = row[f'{r}_r']
        if precision is None or recall is None:
            # Missing metric: report "no score", consistent with NaN inputs.
            return float('nan')
        div = precision + recall
        if div == 0:
            # Both precision and recall are zero; avoid dividing by zero.
            f1_score[r] = 0
        else:
            f1_score[r] = 2 * (precision * recall) / div
    return (lamb * f1_score['R1']) + f1_score['R2']
# Row-wise combined score; cal_score is called with its default lamb.
final_scores = df_exploded.apply(cal_score, axis=1)
df_exploded['final_score'] = final_scores


In [4]:
# Preview while the intermediate ROUGE columns are still present.
print(df_exploded.head())

# Remove the per-metric precision/recall columns in a single call.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
intermediate_columns = ['R1_r', 'R2_r', 'R1_p', 'R2_p', 'Rl_p', 'Rl_r']
df_exploded = df_exploded.drop(columns=intermediate_columns, errors='ignore')

# NOTE(review): this is the same path the first cell reads its input from, so
# the raw file is replaced by the sentence-level output. Consider writing to a
# separate file so the notebook survives Restart & Run All.
df_exploded.to_csv(r'C:\Users\ayush\OneDrive\Desktop\preprocessing\PICO_0.15_validation.csv', index=False)


   ReviewID     PMID              sent_id  \
0  28514886  4274545  28514886-4274545-00   
1  28514886  4274545  28514886-4274545-01   
2  28514886  4274545  28514886-4274545-02   
3  28514886  4274545  28514886-4274545-03   
4  28514886  4274545  28514886-4274545-04   

                                           docs_sent  \
0  Background : Necrotizing enterocolitis ( NEC )...   
1  Supplementation with enteral prebiotics may re...   
2  Therefore , we compared the efficacy and safet...   
3  Methods : In a single-center r and omized cont...   
4  The incidence of suspected NEC , feeding intol...   

                                   Masked Background  \
0  necrotizing enterocolitis nec is one of the mo...   
1  necrotizing enterocolitis nec is one of the mo...   
2  necrotizing enterocolitis nec is one of the mo...   
3  necrotizing enterocolitis nec is one of the mo...   
4  necrotizing enterocolitis nec is one of the mo...   

                                              Target  \