In [None]:
from modelscope.models import Model
from modelscope.pipelines import pipeline
# With modelscope versions older than 1.1, use TextRankingPreprocessor instead.
from modelscope.preprocessors import TextRankingTransformersPreprocessor
from modelscope.utils.constant import Tasks

In [None]:
from tqdm.std import tqdm
import json
import pandas as pd
import os

In [None]:
# Export CUDA_LAUNCH_BLOCKING=1 into this process's environment.
# NOTE(review): this flag is normally a debugging aid and may slow GPU
# execution -- confirm it is still wanted for regular runs.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

In [None]:
# --- Run configuration -------------------------------------------------------
# Identifier of the ranking model handed to Model.from_pretrained.
model_name = "damo/nlp_corom_passage-ranking_english-base"

# Tab-separated input table and the destination of the ranked output.
data_path = "../passage_ranking_input_true_data/passage_ranking_query.tsv"
output_path = "../passage_output_result/CoRom_result.tsv"

# Number of passages sent to the pipeline per call.
batch_size = 100

## loading data

In [None]:
# Read the tab-separated table at `data_path` into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer=data_path, sep="\t")

In [None]:
# Show the loaded frame as this cell's output for a quick visual check.
dataset

## loading model

In [None]:
# Resolve the model named by `model_name`
# (presumably fetched from the ModelScope hub or a local cache -- confirm).
model = Model.from_pretrained(model_name)

# The preprocessor is built from the directory of the loaded model.
preprocessor = TextRankingTransformersPreprocessor(model.model_dir)

# Text-ranking pipeline that ties the model and its preprocessor together.
pipeline_ins = pipeline(
    task=Tasks.text_ranking,
    model=model,
    preprocessor=preprocessor,
)

## ranking

In [None]:
def get_scores(input_df, batch_size):
    """Score every row's passage against that same row's query.

    Rows are grouped by their query text (in first-seen order) and each group
    is sent to the module-level ``pipeline_ins`` in batches, always with
    exactly one query in ``source_sentence``. This keeps the result correct
    and deterministic even when ``input_df`` holds more than one distinct
    query; for a single-query frame the pipeline calls are the same as a
    plain sequential batching of the passages.

    Args:
        input_df: DataFrame with a ``"query"`` and a ``"passage"`` column.
        batch_size: Maximum number of passages per pipeline call; must be
            a positive integer.

    Returns:
        A list with one score per row of ``input_df``, in row order.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the pipeline
            returns a different number of scores than passages in a batch.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    queries = input_df["query"].tolist()
    candidates = input_df["passage"].tolist()

    # Row positions per distinct query; dicts keep insertion order, so the
    # processing order is reproducible (unlike iterating over a set).
    positions_by_query = {}
    for position, query_text in enumerate(queries):
        positions_by_query.setdefault(query_text, []).append(position)

    # Pre-sized so every score can be written back to its own row position.
    scores = [None] * len(candidates)
    for query_text, positions in positions_by_query.items():
        for start in range(0, len(positions), batch_size):
            batch_positions = positions[start:start + batch_size]
            payload = {
                "source_sentence": [query_text],
                "sentences_to_compare": [candidates[p] for p in batch_positions],
            }
            result = pipeline_ins(input=payload)
            batch_scores = list(result.get('scores'))
            if len(batch_scores) != len(batch_positions):
                raise ValueError(
                    f"pipeline returned {len(batch_scores)} scores "
                    f"for {len(batch_positions)} passages"
                )
            for position, value in zip(batch_positions, batch_scores):
                scores[position] = value
    return scores
        

In [None]:
# Rank the candidate passages of each query by descending score.
#
# Rows are grouped by their `qid` (in first-seen order) rather than by
# fixed-size row windows, so passages of different queries are never ranked
# against each other, whatever the number of candidates per query.
row_positions_by_qid = {}
for row_pos, qid in enumerate(dataset["qid"].tolist()):
    row_positions_by_qid.setdefault(qid, []).append(row_pos)

# Pre-sized so each query's ranked list is written onto that query's own rows;
# both lists therefore stay aligned with the rows of `dataset`.
ranked_docids = [None] * len(dataset)
ranked_scores = [None] * len(dataset)
for qid, row_positions in tqdm(row_positions_by_qid.items()):
    input_df = dataset.iloc[row_positions]
    scores = get_scores(input_df, batch_size)

    docids = input_df["pid"].tolist()
    if len(scores) != len(docids):
        raise ValueError(
            f"qid {qid!r}: got {len(scores)} scores for {len(docids)} passages"
        )

    # Highest score first. Only the score is used as the sort key, so ties
    # keep their original relative order (the sort is stable).
    sorted_scores_docids = sorted(
        zip(scores, docids), key=lambda pair: pair[0], reverse=True
    )
    for row_pos, (score, docid) in zip(row_positions, sorted_scores_docids):
        ranked_docids[row_pos] = docid
        ranked_scores[row_pos] = score


In [None]:
# Passage ranking: attach the ranked ids and their scores to the frame.
# (For document ranking the id column would be named "ranked_docid" instead
# of "ranked_pid".)
for column_name, ranked_values in (
    ("ranked_pid", ranked_docids),
    ("scores", ranked_scores),
):
    dataset[column_name] = ranked_values

In [None]:
# Keep only the columns that are written to the result file.
# (For document ranking these would be "qid", "docid", "ranked_docid", "scores".)
output_columns = ["qid", "pid", "ranked_pid", "scores"]
dataset = dataset[output_columns]

In [None]:
# Create the destination folder first: to_csv does not create missing
# directories, and the write would otherwise fail only after the whole
# ranking run has finished.
output_dir = os.path.dirname(output_path)
if output_dir:  # empty when output_path has no directory component
    os.makedirs(output_dir, exist_ok=True)

# Write the result as a tab-separated file without the index column.
dataset.to_csv(output_path, sep="\t", index=False)