In [None]:
import csv
import os

import pandas as pd
from tqdm.std import tqdm

In [None]:
# Location of the tab-separated candidate file that is read with pd.read_csv below
# (relative to the notebook's working directory).
data_path = "document/top1000.dev"

In [None]:
# Load the candidate file: tab-separated, no header row, one (query, passage) pair per line.
data = pd.read_csv(
    data_path,
    sep="\t",
    names=["qid", "pid", "query", "passage"],
    # The text columns are raw text: with the default quoting a stray '"' would open a
    # quoted field that can swallow tabs/newlines and merge several rows into one.
    quoting=csv.QUOTE_NONE,
    # Keep literal texts such as "null", "NA" or "nan" as strings instead of NaN.
    keep_default_na=False,
)

In [None]:
# Preview the first 10 rows of the loaded candidates.
data.head(10)

In [None]:
# Total number of rows loaded into `data`.
len(data)

In [None]:
# Number of rows per qid (a Series indexed by qid).
data_groups = data.groupby("qid").size()

In [None]:
# Display the per-qid row counts.
data_groups

In [None]:
# Collect the qids that have more than 100 rows in `data`.
# NOTE(review): the previous comment here spoke of 1000 instances per qid, but the
# threshold in the code is 100 — confirm which is intended. Despite the name, these
# are the qids the notebook keeps further down (through `large_qid`); qids with 100
# rows or fewer are left out of the final output.

invalid_qids = data_groups[data_groups > 100]
print("The following qids do not meet the criteria：")
print(invalid_qids)

In [None]:
# Index holding the qid values selected in `invalid_qids` (row count > 100).
large_qid = invalid_qids.index

In [None]:
# Display the selected qids.
large_qid

## qrels

In [None]:
# Read the dev qrels: tab-separated, no header row, four columns per line.
dev_qrels = pd.read_csv(
    "document/qrels.dev.tsv",
    names=["qid", "iteration", "pid", "relevance_score"],
    header=None,
    sep="\t",
)

In [None]:
# Display the loaded qrels table.
dev_qrels

In [None]:
# Group the qrels rows by qid.
dev_qrels_group = dev_qrels.groupby("qid")

In [None]:
# Number of qrels rows per qid.
dev_qrels_group.size()

## Filter all positive instances

In [None]:
# Left-join the qrels onto the candidates: every row of `data` is kept, and rows
# without a matching (pid, qid) entry in `dev_qrels` get NaN in the qrels columns.
filter_positive_df = pd.merge(
    data,
    dev_qrels,
    how="left",
    on=["pid", "qid"],
)

In [None]:
# Inspect the merged frame.
filter_positive_df

In [None]:
# Keep only the columns needed downstream (drops the qrels "iteration" column).
# The explicit .copy() makes new_data an independent frame, so later modifications
# of new_data neither touch filter_positive_df nor raise SettingWithCopyWarning.
new_data = filter_positive_df[["qid", "pid", "query", "passage", "relevance_score"]].copy()

In [None]:
# Rows without a qrels match have NaN relevance after the left merge; mark them as
# non-relevant (0). Only relevance_score is filled, so missing text in the other
# columns is not silently replaced by 0, and the result is reassigned instead of
# mutating a column selection in place.
new_data = new_data.fillna({"relevance_score": 0})

In [None]:
# Inspect the frame after filling the missing values.
new_data

In [None]:
# Restrict the rows to the qids collected in `large_qid`.
new_data = new_data.loc[new_data["qid"].isin(large_qid)]

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
new_data = new_data.reset_index(drop=True)

In [None]:
# Inspect the filtered, re-indexed frame.
new_data

In [None]:
# Down-sample every qid to a fixed-size candidate list: keep all rows with
# relevance_score == 1.0 and fill the remainder with randomly drawn rows whose
# relevance_score == 0.0.
GROUP_SIZE = 100   # target number of rows per qid
SAMPLE_SEED = 42   # fixed seed so a re-run draws the same negatives

filter_results = []

for qid, group in tqdm(new_data.groupby("qid")):

    ones = group[group["relevance_score"] == 1.0]
    zeros = group[group["relevance_score"] == 0.0]

    # Clamp the request to [0, len(zeros)]: DataFrame.sample raises ValueError when n
    # is negative (more than GROUP_SIZE positives) or, with replace=False, larger than
    # the number of available negatives. In those cases the group ends up with more
    # or fewer than GROUP_SIZE rows instead of aborting the whole loop.
    required_zeros = max(0, min(GROUP_SIZE - len(ones), len(zeros)))
    sampled_zeros = zeros.sample(n=required_zeros, random_state=SAMPLE_SEED, replace=False)

    # Positives first, followed by the sampled negatives.
    new_group = pd.concat([ones, sampled_zeros])
    filter_results.append(new_group)

In [None]:
# Stack the per-qid samples into one frame with a fresh 0..n-1 index.
final_df = pd.concat(filter_results, ignore_index=True)

In [None]:
# Inspect the combined, down-sampled frame.
final_df

In [None]:
# Write the down-sampled frame to a tab-separated file, leaving the index out.
final_df.to_csv(
    "document/filtered_passage_reranking_data.tsv",
    index=False,
    sep="\t",
)