In [None]:
from sentence_transformers import SentenceTransformer

import torch

from tqdm.std import tqdm
import json
import pandas as pd
import numpy as np

In [None]:
# Pick the compute device once: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice; on GPU also show the name of device index 0.
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

In [None]:
# Load the embedding model onto the selected device.
# `device` is passed by keyword because the second positional parameter of
# SentenceTransformer is `modules`, not `device`; it is converted to a string
# because the constructor documents `device` as a string such as "cuda" or "cpu".
# trust_remote_code=True lets the checkpoint's custom modelling code run.
model = SentenceTransformer(
    "Alibaba-NLP/gte-Qwen2-7B-instruct",
    device=str(device),
    trust_remote_code=True,
)

In [None]:
# Set the model's maximum sequence length to 512
# (presumably longer inputs are truncated to this many tokens — confirm in the library docs).
model.max_seq_length = 512

## Loading the dataset

In [None]:
# Read the tab-separated query/passage table that serves as the ranking input.
data = pd.read_csv(
    "../document_ranking_input_true_data/document_ranking_query.tsv",
    sep="\t",
)

In [None]:
# Preview only the first rows instead of emitting the whole frame as cell output.
data.head()

In [None]:
def get_score(query, documents):
    """Score one query against a list of documents.

    The query is encoded with the model's "query" prompt, the documents are
    encoded without a prompt, both with normalized embeddings, and the two are
    compared with ``model.similarity``.

    Args:
        query: The query text (a single string).
        documents: The candidate document texts to score against the query.

    Returns:
        A flattened NumPy array holding one similarity score per document,
        in the same order as ``documents``.
    """
    with torch.no_grad():
        q_vec = model.encode(
            [query],
            prompt_name="query",
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        doc_vecs = model.encode(
            documents,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        similarity = model.similarity(q_vec, doc_vecs)

        # Drop the embedding references first so that the cache flush below
        # can actually hand their memory back.
        del q_vec, doc_vecs
        torch.cuda.empty_cache()

    flat_scores = similarity.cpu().numpy().flatten()
    return flat_scores

In [None]:
# Flat, row-aligned accumulators that the ranking loop appends to.
qids = []
pids = []
ranked_pids = []
ranked_scores = []

In [None]:
# Rank the passages of every query, one fixed-size block of rows at a time.
# Assumes the rows of `data` are ordered so that each consecutive block of
# `batch_size` rows belongs to a single query — TODO confirm against the input file.
batch_size = 100
num_samples = len(data)

# Start from empty accumulators so that re-running this cell does not append
# a second copy of the results to lists left over from an earlier run.
qids, pids, ranked_pids, ranked_scores = [], [], [], []

for i in tqdm(range(0, num_samples, batch_size)):
    temp_df = data.iloc[i:i+batch_size]

    # Every row of the block carries the same query, so read it from the first row only
    query = temp_df["query"].iloc[0]
    passages = temp_df["passage"].tolist()
    query_id = temp_df["qid"].iloc[0]
    passage_ids = temp_df["docid"].to_numpy()

    scores = get_score(query, passages)

    # argsort of the negated scores gives indices from best to worst match
    sorted_indices = np.argsort(-scores)
    sorted_pids_batch = passage_ids[sorted_indices]
    sorted_scores_batch = scores[sorted_indices]

    # Repeat the qid once per row actually present in this block: the final
    # block is shorter than batch_size whenever len(data) is not a multiple
    # of it, and all four lists must stay the same length.
    qids.extend([query_id] * len(temp_df))
    pids.extend(passage_ids)
    ranked_pids.extend(sorted_pids_batch)
    ranked_scores.extend(sorted_scores_batch)

In [None]:
# Assemble the result table from the accumulated lists.
# "pid" keeps the passages in input order, while "ranked_pid" and "scores"
# hold the same block of passages reordered by descending score.
df = pd.DataFrame(
    {
        "qid": qids,
        "pid": pids,
        "ranked_pid": ranked_pids,
        "scores": ranked_scores,
    }
)

In [None]:
# Write the ranking table as a tab-separated file, without the DataFrame index column.
df.to_csv(
    "../passage_output_result/qwen2_result.tsv",
    sep="\t",
    index=False,
)