In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np

In [36]:
def get_new_papers(labelled_data, fetched_pages):
    """Return the rows of ``fetched_pages`` whose ``url`` is not in ``labelled_data``.

    ``fetched_pages`` is left-joined to ``labelled_data`` on ``url`` and only
    the rows that found no partner are returned.  Columns present only in
    ``labelled_data`` are carried over (holding missing values for the
    returned rows); every column whose name ends in ``_labelled`` -- i.e. the
    labelled-side copy of a clashing column -- is dropped.

    Parameters
    ----------
    labelled_data : pandas.DataFrame
        Frame with a ``url`` column.
    fetched_pages : pandas.DataFrame
        Frame with a ``url`` column.

    Returns
    -------
    pandas.DataFrame
        The unmatched rows, indexed by their position in the merged frame.
    """
    merged = fetched_pages.merge(
        labelled_data,
        how="left",
        on="url",
        indicator=True,
        suffixes=("", "_labelled"),
    )
    # Clashing right-hand columns were renamed with "_labelled"; leave them out.
    kept_columns = [name for name in merged.columns if not name.endswith("_labelled")]
    unmatched = merged["_merge"] == "left_only"
    return merged.loc[unmatched, kept_columns].drop(columns=["_merge"])

def get_already_labelled_papers(original_dataset, fetched_pages):
    """Return the rows of ``fetched_pages`` whose ``url`` also appears in ``original_dataset``.

    ``fetched_pages`` is left-joined to ``original_dataset`` on ``url`` and
    only the rows matched on both sides are kept.  Columns ending in
    ``_labelled`` (the dataset-side copies of clashing columns) and the
    ``relevance`` column are removed from the result.

    Parameters
    ----------
    original_dataset : pandas.DataFrame
        Frame with a ``url`` column.
    fetched_pages : pandas.DataFrame
        Frame with a ``url`` column.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If the merged frame has no ``relevance`` column to drop.
    """
    merged = fetched_pages.merge(
        original_dataset,
        how="left",
        on="url",
        indicator=True,
        suffixes=("", "_labelled"),
    )
    kept_columns = [name for name in merged.columns if not name.endswith("_labelled")]
    matched = merged["_merge"] == "both"
    # The merge indicator and the label are both stripped from the output.
    return merged.loc[matched, kept_columns].drop(columns=["_merge", "relevance"])

def get_topk_threshold(fetched_pages, top_k):
    """Return the score of the ``top_k``-th best page (the top-k cut-off).

    Missing scores are ignored.  When fewer than ``top_k`` scored pages are
    available, the lowest available score is returned so that every scored
    page passes the cut-off.

    Parameters
    ----------
    fetched_pages : pandas.DataFrame
        Frame with a ``score`` column.
    top_k : int
        Number of best-scored pages the threshold should let through
        (ties at the cut-off score may let more through).

    Returns
    -------
    float
        The cut-off score, or ``-inf`` when there is no scored page at all
        (so a ``score >= threshold`` filter on an empty frame stays empty
        instead of raising).

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # top_k == 0 used to index iloc[-1] and silently return the *lowest* score.
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")
    # NaN scores would otherwise sort last and become the threshold, which
    # makes every ``score >= threshold`` comparison False.
    ranked_scores = fetched_pages['score'].dropna().sort_values(ascending=False)
    if ranked_scores.empty:
        return float('-inf')
    cutoff_position = min(top_k, len(ranked_scores)) - 1
    return ranked_scores.iloc[cutoff_position]

def keep_above_threshold(thresholds, fetched_pages):
    """Return the rows of ``fetched_pages`` whose ``score`` is at least ``thresholds``.

    Parameters
    ----------
    thresholds
        Value compared against the ``score`` column with ``>=``.
    fetched_pages : pandas.DataFrame
        Frame with a ``score`` column.

    Returns
    -------
    pandas.DataFrame
        The passing rows, with their original index.
    """
    passes_cutoff = fetched_pages['score'] >= thresholds
    return fetched_pages.loc[passes_cutoff]

def remove_duplicates(lst):
    """Return the items of ``lst`` without repeats, keeping first-occurrence order.

    Items must be hashable.
    """
    # A dict keeps insertion order and holds each key once, so its keys are
    # exactly the first occurrences in their original order.
    return list(dict.fromkeys(lst))

def sep_by_query(session_infos, fetched_pages):
    """Group the fetched pages by the query that retrieved them.

    The queries are read from the first value of ``session_infos['all_queries']``,
    a ``';'``-separated string; repeated queries are collapsed while keeping
    their first-seen order.

    Parameters
    ----------
    session_infos : pandas.DataFrame
        Frame with an ``all_queries`` column.
    fetched_pages : pandas.DataFrame
        Frame with a ``get_with_query`` column.

    Returns
    -------
    dict
        Maps each query string to the rows of ``fetched_pages`` whose
        ``get_with_query`` equals that query.
    """
    raw_queries = list(session_infos['all_queries'])[0]
    unique_queries = remove_duplicates(raw_queries.split(';'))
    return {
        query: fetched_pages[fetched_pages['get_with_query'] == query]
        for query in unique_queries
    }

def get_cumulative_papers(query_dict):
    """Build, for each query, the set of papers seen up to and including it.

    The queries are visited in the iteration order of ``query_dict``.  After
    each one, its papers are appended to a running pool, fully identical rows
    are removed and the index is reset; a copy of the pool at that point is
    stored under the query.

    Parameters
    ----------
    query_dict : dict
        Maps a query to a ``pandas.DataFrame`` of papers.

    Returns
    -------
    dict
        Maps each query to the cumulative ``pandas.DataFrame``.
    """
    snapshots = {}
    pool = pd.DataFrame()  # running union of every paper seen so far
    for query, batch in query_dict.items():
        # Append this query's papers, then collapse rows that are exact repeats.
        stacked = pd.concat([pool, batch])
        pool = stacked.drop_duplicates().reset_index(drop=True)
        # Store a copy so later iterations cannot alter this snapshot.
        snapshots[query] = pool.copy()
    return snapshots

def keep_topk_by_query(session_infos, fetched_pages, top_k):
    """Keep, after each successive query, the pages scoring at or above the top-k cut-off.

    The fetched pages are split per query, accumulated query after query, and
    for each cumulative pool the ``top_k`` score threshold is computed and
    applied to that same pool.

    Parameters
    ----------
    session_infos : pandas.DataFrame
        Passed to ``sep_by_query`` to obtain the list of queries.
    fetched_pages : pandas.DataFrame
        Pages to split and filter.
    top_k : int
        Rank whose score serves as the cut-off in each cumulative pool.

    Returns
    -------
    dict
        Maps each query to the filtered cumulative ``pandas.DataFrame``.
    """
    pages_per_query = sep_by_query(session_infos, fetched_pages)
    cumulative_pools = get_cumulative_papers(pages_per_query)
    return {
        query: keep_above_threshold(get_topk_threshold(pool, top_k), pool)
        for query, pool in cumulative_pools.items()
    }

def recall_precision(fetched_pages, labelled_dataset):
    """Compute recall and precision of ``fetched_pages`` against the labels.

    Every URL is counted once.  When ``labelled_dataset`` lists a URL several
    times, a row carrying a relevance label is preferred over an unlabelled
    one, then the first occurrence wins.  Repeated URLs in ``fetched_pages``
    (e.g. the same page retrieved by several queries) do not inflate the
    counts.  Only ``url`` is read from ``fetched_pages``.

    Parameters
    ----------
    fetched_pages : pandas.DataFrame
        Frame with a ``url`` column.
    labelled_dataset : pandas.DataFrame
        Frame with ``url`` and ``relevance`` columns; ``relevance == 1`` marks
        a relevant page, a missing value counts as not relevant.

    Returns
    -------
    tuple
        ``(recall, precision)`` where recall is the share of relevant labelled
        URLs that were fetched, and precision is the share of relevant URLs
        among the fetched URLs that have an entry in ``labelled_dataset``.
        Either value is ``0`` when its denominator is zero.
    """
    labels = labelled_dataset[["url", "relevance"]]
    # One row per URL: a stable ascending sort on "is unlabelled" moves labelled
    # rows ahead of unlabelled ones without otherwise reordering them.
    labels = (
        labels.assign(_unlabelled=labels["relevance"].isna())
        .sort_values("_unlabelled", kind="stable")
        .drop_duplicates(subset="url", keep="first")
        .drop(columns="_unlabelled")
    )
    is_relevant = labels["relevance"] == 1  # NaN compares False: not relevant
    # Membership test instead of a merge: duplicates in fetched_pages cannot
    # multiply rows, and its other columns cannot clash with ``relevance``.
    is_fetched = labels["url"].isin(fetched_pages["url"])

    total_relevant = int(is_relevant.sum())
    fetched_labelled = int(is_fetched.sum())
    fetched_relevant = int((is_relevant & is_fetched).sum())

    recall = fetched_relevant / total_relevant if total_relevant > 0 else 0
    precision = fetched_relevant / fetched_labelled if fetched_labelled > 0 else 0
    return (recall, precision)

In [48]:
# Evaluation configuration: seed query, input/output locations and the crawler
# runs to compare.
# Alternative seed queries, kept commented out for reference:
# initial_query = 'RAG_AND_"code_generation"/'
# initial_query = '"Machine_Learning"_AND_(diffusion_OR_diffusivity)_AND_(MOFs_OR_ZIFs_OR_"metal-organic_frameworks"_OR_COFs_OR_"covalent-organic_frameworks)/'
# Active seed query; it ends with "/" because it is interpolated into
# base_path as the last directory component.
initial_query = '"metal-organic_frameworks"_AND_"material_design"_AND_"properties"/'

# Root folder for this seed query; every path below is built by string
# concatenation, so directory strings must keep their trailing "/".
base_path = f'/Users/apollineguerineau/Documents/ENSAI/3A/Greece/internship/eval/results/{initial_query}'
# Reference dataset, read from a ';'-separated CSV.
original_dataset = pd.read_csv(base_path +'original_dataset.csv', sep=';')
# NOTE(review): this file lives in a different folder (ML_MOF_Diffusion/crawlers_v1)
# than base_path, where new_papers_to_label.csv is written further down in
# this notebook -- confirm this is the file that should be read back.
labelled_new_papers_path = "/Users/apollineguerineau/Documents/ENSAI/3A/Greece/internship/eval/ML_MOF_Diffusion/crawlers_v1/new_papers_to_label.csv"
# One output folder per crawler configuration.
path_baseline = base_path + 'baseline/'
path_seed_query_expand = base_path + 'SeedQueryBasedTemplate__/'
path_most_relevant_expand = base_path + 'MostRelevantPagesBasedTemplate_MostRelevantPagesPromptBasedTemplate_HydeBasedTemplate/'
# Maps a crawler label to its output folder; the cells below read
# session_infos.csv / session.csv / fetched_pages.csv from each folder.
# The 'baseline' label is special-cased by those cells.
crawlers = {'baseline':path_baseline,
            'seed_query_expand_sim_cos':path_seed_query_expand,
            # 'seed_query_expand_sim_cos':path_seed_query_expand+'sim_cos/',
            # 'seed_query_expand_hyde_sim_cos':path_seed_query_expand+'hyde_sim_cos/',
            'most_relevant_expand':path_most_relevant_expand}

# Rank used for the per-query score cut-off (passed to keep_topk_by_query).
top_k = 30

In [49]:
def reform_sessions_infos(crawler_path):
    """Read ``session_infos.csv`` and rebuild it as a one-row, relabelled frame.

    Only the first row of the file is used and every value is converted to
    ``str``.  Several output columns are filled from a differently named input
    column (see the mapping in the body).

    NOTE(review): the shifted mapping presumably compensates for a header that
    is misaligned with the data in ``session_infos.csv`` -- confirm against
    the code that writes that file.

    Parameters
    ----------
    crawler_path : str
        Folder of the crawler run; the file name is appended by plain string
        concatenation, so the path must end with a separator.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``session_name``, ``searcher``,
        ``query_expansion``, ``classifier``, ``all_queries``,
        ``nb_seed_pages``, ``duration``, ``hyde`` and ``nb_fetched_pages``.
    """
    raw = pd.read_csv(crawler_path +'session_infos.csv')
    # output column -> column of session_infos.csv that holds its value
    source_column_for = {
        'session_name': 'session_name',
        'searcher': 'searcher',
        'query_expansion': 'query_expansion',
        'classifier': 'classifier',
        'all_queries': 'nb_pages_per_request',
        'nb_seed_pages': 'stop_criteria',
        'duration': 'all_queries',
        'hyde': 'threshold',
        'nb_fetched_pages': 'hyde',
    }
    # Read column by column (not row-wise) so each value keeps its own dtype
    # before being turned into text.
    record = {
        target: str(raw[source].iloc[0])
        for target, source in source_column_for.items()
    }
    return pd.DataFrame([record], columns=list(source_column_for))

# Rewrite each crawler's session_infos.csv into a relabelled session.csv in the same folder.
for path in crawlers.values():
    session_df = reform_sessions_infos(path)
    session_df.to_csv(path+'session.csv')

In [40]:
## get all new fetched papers

# For every crawler, select the pages to evaluate and keep those whose url is
# not in the original dataset yet.
all_new_papers = []
for crawler, path in crawlers.items():
    print(crawler)
    fetched_pages = pd.read_csv(path + 'fetched_pages.csv')
    session_infos = pd.read_csv(path + 'session.csv')
    if crawler != 'baseline':
        # Per-query top-k selection over the cumulative pages, stacked together.
        kept_papers_by_query = keep_topk_by_query(session_infos, fetched_pages, top_k)
        all_kept_papers = pd.concat(kept_papers_by_query.values(), ignore_index=True)
    else:
        # The baseline has no per-query selection: take its first 100 rows.
        all_kept_papers = fetched_pages.iloc[:100]
    all_new_papers.append(get_new_papers(original_dataset, all_kept_papers))

# Pool the crawlers' new papers, one row per url.
combined_new_papers = (
    pd.concat(all_new_papers, ignore_index=True)
    .drop_duplicates(subset=["url"], keep="first")
)

if os.path.exists(labelled_new_papers_path):
    # Fold in previously labelled papers; sorting labelled rows first makes
    # drop_duplicates keep the labelled copy of a url.
    labelled_new_papers = pd.read_csv(labelled_new_papers_path)
    combined_new_papers = (
        pd.concat([combined_new_papers, labelled_new_papers], ignore_index=True)
        .assign(has_relevance=lambda frame: frame["relevance"].notna())
        .sort_values(by="has_relevance", ascending=False)
        .drop_duplicates(subset=["url"])
    )

columns_to_keep = ["url", "title", "description", "relevance"]
combined_new_papers = combined_new_papers[columns_to_keep]
combined_new_papers.to_csv(base_path + "new_papers_to_label.csv", index=False)

# Rows without a relevance value still have to be labelled by hand.
missing_relevance = combined_new_papers[combined_new_papers['relevance'].isna()]
print(f'{len(missing_relevance)} papers to label')
    

most_relevant_expand
39 papers to label


In [None]:
## recall and precision evolution by query

# Labels = newly labelled papers + the original dataset.
labelled_new_papers = pd.read_csv(labelled_new_papers_path)
labelled_dataset = pd.concat([labelled_new_papers,original_dataset], ignore_index=True)
# .items() yields (label, folder) pairs; iterating .values() would try to
# unpack each path string into two names and raise ValueError.
for crawler, path in crawlers.items() : 
    print(crawler)
    fetched_pages = pd.read_csv(path + 'fetched_pages.csv')
    session_infos = pd.read_csv(path + 'session.csv')
    if crawler == 'baseline':
        # The baseline is evaluated as a single "query": its first 100 rows.
        query = list(session_infos['all_queries'])[0]
        kept_papers_by_query = {query:fetched_pages.iloc[:100]}
    else : 
        kept_papers_by_query = keep_topk_by_query(session_infos, fetched_pages, top_k)
    # Iterate over (query, pages) pairs, not over the bare dict keys.
    for query, papers in kept_papers_by_query.items() : 
        print(f'query : {query} : {recall_precision(papers, labelled_dataset)}')
    print('----------------------------------------------------')