In [1]:
import json
from tqdm import tqdm
import torch
import transformers
import os

Load all data

In [13]:
source_path = '/content/drive/MyDrive/fintech_esun/reference/'

with open(os.path.join(source_path, 'finance_data.json'), 'rb') as f_s:
    key_to_source_dict = json.load(f_s)  # 讀取參考資料文件
    corpus_dict_finance = {int(key): value for key, value in key_to_source_dict.items()}

with open(os.path.join(source_path, 'insurance_data.json'), 'rb') as f_s:
    key_to_source_dict = json.load(f_s)  # 讀取參考資料文件
    corpus_dict_insurance = {int(key): value for key, value in key_to_source_dict.items()}

with open(os.path.join(source_path, 'faq/pid_map_content.json'), 'rb') as f_s:
    key_to_source_dict = json.load(f_s)  # 讀取參考資料文件
    corpus_dict_faq = {int(key): value for key, value in key_to_source_dict.items()}

Tokenize text:


1.   Jieba
1.   reranker model/llm model
  1. ckiplab/albert-tiny-chinese
  1. BAAI/bge-reranker-v2-m3 (faster)
  1. BAAI/bge-reranker-v2.5-gemma2-lightweight (maybe better)

remark: BAAI have a package call flagembedding, but its performance are worse

remark: still need to find a way to embbeding, in case the option may become larger in private set



In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
model = transformers.AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-v2-m3")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(8194, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_f

In [5]:
def retrieve(query, source, corpus, max_length=2048):
  # reranker to improve
  best_scores = -float('inf')
  best_ans = None

  for s in source:
    inputs = tokenizer(query, corpus[int(s)], return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
      outputs = model(**inputs)
    score = outputs.logits.squeeze().item()
    if score > best_scores:
      best_scores = score
      best_ans = s

  return best_ans

In [27]:
answer_dict = {"answers": []}  # 初始化字典
question_path = '/content/drive/MyDrive/fintech_esun/reference/questions.json'
with open(question_path, 'rb') as f:
    qs_ref = json.load(f)  # 讀取問題檔案

for q_dict in tqdm(qs_ref['questions']):
  if q_dict['category'] == 'finance':
    retrived = retrieve(q_dict['query'], q_dict['source'], corpus_dict_finance)
    answer_dict['answers'].append({'qid': q_dict['qid'], 'retrieve': retrived})
  elif q_dict['category'] == 'insurance':
    retrived = retrieve(q_dict['query'], q_dict['source'], corpus_dict_insurance)
    answer_dict['answers'].append({'qid': q_dict['qid'], 'retrieve': retrived})
  elif q_dict['category'] == 'faq':
    faq_dict = {key: str(value) for key, value in corpus_dict_faq.items() if key in q_dict['source']}
    retrived = retrieve(q_dict['query'], q_dict['source'], faq_dict)
    answer_dict['answers'].append({'qid': q_dict['qid'], 'retrieve': retrived})
  else:
    raise ValueError('Invalid category')

with open('/content/drive/MyDrive/fintech_esun/reference/answer.json', 'w') as f:
  json.dump(answer_dict, f, ensure_ascii=False, indent=4)

100%|██████████| 150/150 [02:16<00:00,  1.10it/s]


## Compute accuracy

In [48]:
def compute_accuracy(ans_dict):
  predictions = ans_dict['answers']
  with open(source_path+'ground_truths_example.json', 'r') as ground_file:
      ground_truths = json.load(ground_file)['ground_truths']

  # Create a dictionary for easy lookup of ground truths
  ground_truth_dict = {item['qid']: item['retrieve'] for item in ground_truths}

  # Compare the predictions with ground truths
  correct_predictions = 0
  total_questions = len(predictions)

  # insurance: 1~50, finance: 51~100, faq: 101~150
  correct_insurance = 0
  correct_finance = 0
  correct_faq = 0

  wrong_retrived = []

  for pred in predictions:
      qid = pred['qid']
      if qid in ground_truth_dict and pred['retrieve'] == ground_truth_dict[qid]:
          correct_predictions += 1
          if qid<51:
            correct_insurance += 1
          elif qid<101:
            correct_finance += 1
          else:
            correct_faq += 1
      else:
        wrong_retrived.append(qid)

  # Compute accuracy
  accuracy = correct_predictions / total_questions

  print(f"Accuracy: {accuracy:.2%} (Correct predictions: {correct_predictions}, Total questions: {total_questions})")
  print(f"Insurance: {correct_insurance/50:.2%}, Finance: {correct_finance/50:.2%} , FAQ: {correct_faq/50:.2%} " )
  return accuracy, wrong_retrived

In [28]:
acc, wrong_retrived = compute_accuracy(answer_dict)

Accuracy: 90.67% (Correct predictions: 136, Total questions: 150)
Insurance: 94.00%, Finance: 80.00% , FAQ: 98.00% 


## Show wrong predict

In [29]:
print(wrong_retrived)

[4, 11, 47, 61, 64, 67, 70, 76, 86, 92, 93, 99, 100, 135]


Wrong ground truth: 135

has two answer: 4 (186,179 both correct)

# Result

**Method:** BBAAI/bge-reranker-v2-m3 for reranking

**parameter:**
* max_length = 2048 (best result)
  * Accuracy: 90.67% (Correct predictions: 136, Total questions: 150)
  * Insurance: 94.00%, Finance: 80.00% , FAQ: 98.00%
---



## Check the rank

In [8]:
def compute_score(query, source, corpus, max_length=2048):
  # return score
  rank = {}
  for s in source:
    inputs = tokenizer(query, corpus[int(s)], return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
      outputs = model(**inputs)
    score = outputs.logits.squeeze().item()
    rank[s] = score
  rank = sorted(rank.items(), key=lambda x: x[1], reverse=True)
  return rank

In [30]:
ranked_dict = {"answers": []}  # 初始化字典
question_path = '/content/drive/MyDrive/fintech_esun/reference/questions.json'
with open(question_path, 'rb') as f:
    qs_ref = json.load(f)  # 讀取問題檔案

for q_dict in tqdm(qs_ref['questions']):
  if q_dict['qid'] in wrong_retrived:
    if q_dict['category'] == 'finance':
      ranked = compute_score(q_dict['query'], q_dict['source'], corpus_dict_finance)
      ranked_dict['answers'].append({'qid': q_dict['qid'], 'retrieve': ranked})
    elif q_dict['category'] == 'insurance':
      ranked = compute_score(q_dict['query'], q_dict['source'], corpus_dict_insurance)
      ranked_dict['answers'].append({'qid': q_dict['qid'], 'retrieve': ranked})
    elif q_dict['category'] == 'faq':
      faq_dict = {key: str(value) for key, value in corpus_dict_faq.items() if key in q_dict['source']}
      ranked = compute_score(q_dict['query'], q_dict['source'], faq_dict)
      ranked_dict['answers'].append({'qid': q_dict['qid'], 'retrieve': ranked})
    else:
      raise ValueError('Invalid category')
  else:
    continue


100%|██████████| 150/150 [00:17<00:00,  8.77it/s]


In [31]:
with open(source_path+'ground_truths_example.json', 'r') as ground_file:
    ground_truths = json.load(ground_file)['ground_truths']

# Create a dictionary for easy lookup of ground truths
ground_truth_dict = {item['qid']: item['retrieve'] for item in ground_truths}
for ans in ranked_dict['answers']:
  if ans['qid'] in wrong_retrived:
    print(ground_truth_dict[ans['qid']])
    print(ans)

186
{'qid': 4, 'retrieve': [(179, 3.330322504043579), (186, 3.2995710372924805), (627, -1.3853188753128052), (178, -2.1294572353363037), (536, -2.2300963401794434), (174, -2.291411876678467)]}
7
{'qid': 11, 'retrieve': [(141, 1.4223743677139282), (7, 0.7831827402114868), (258, 0.17798812687397003), (325, -3.582822322845459), (298, -4.175981521606445), (357, -5.052556037902832), (241, -5.927553176879883)]}
620
{'qid': 47, 'retrieve': [(243, 1.9690613746643066), (620, 0.9763119220733643), (596, -2.7213735580444336), (337, -3.5743308067321777), (182, -3.5955076217651367), (536, -5.404465675354004), (476, -7.05393648147583), (179, -7.193615913391113)]}
900
{'qid': 61, 'retrieve': [(359, 1.412253737449646), (900, 0.23233062028884888), (41, 0.016947224736213684), (870, -0.10359202325344086), (951, -1.222031593322754), (70, -2.9000186920166016), (59, -3.074003219604492)]}
124
{'qid': 64, 'retrieve': [(706, 3.570023536682129), (124, 3.441195011138916), (872, 1.4133670330047607), (808, 1.213257

all in top3 (only two in 3 other in 2) except 67, 86

## Possible ways to improve perfomance:
1. try other model or method (we can combine several model (by adding scores or vote)
1. chunking top 3 and rerank again (maybe it can decrease some noise)
(Need embedding, since it have to many sentences...)
1. prepare embedding model in case the test data grows larger.



In [41]:
import re
def rerank_in_sentence(query, source, corpus, max_length=2048, top_n=3, sentence_length=1024): # chunk don't need large token size
  ranked = compute_score(query, source, corpus, max_length)
  top_ranked = ranked[:top_n]
  best_source = None
  best_score = -float('inf')
  for s in top_ranked:
    s = s[0]
    c = corpus[s]
    sentences = re.split(r'(?<=[。？！])', c)
    for chunk in sentences:
      inputs = tokenizer(query, corpus[int(s)], return_tensors="pt", padding=True, truncation=True, max_length=sentence_length).to(device)
      with torch.no_grad():
        outputs = model(**inputs)
      score = outputs.logits.squeeze().item()
      if score > best_score:
        best_score = score
        best_source = s
  return best_source


In [47]:
sententence_result = {"answers": []}  # 初始化字典
question_path = '/content/drive/MyDrive/fintech_esun/reference/questions.json'
with open(question_path, 'rb') as f:
    qs_ref = json.load(f)  # 讀取問題檔案

for q_dict in tqdm(qs_ref['questions']):
  if q_dict['category'] == 'finance':
    ranked = rerank_in_sentence(q_dict['query'], q_dict['source'], corpus_dict_finance)
    sententence_result['answers'].append({'qid': q_dict['qid'], 'retrieve': ranked})
  elif q_dict['category'] == 'insurance':
    ranked = rerank_in_sentence(q_dict['query'], q_dict['source'], corpus_dict_insurance)
    sententence_result['answers'].append({'qid': q_dict['qid'], 'retrieve': ranked})
  elif q_dict['category'] == 'faq':
    faq_dict = {key: str(value) for key, value in corpus_dict_faq.items() if key in q_dict['source']}
    ranked = rerank_in_sentence(q_dict['query'], q_dict['source'], faq_dict)
    sententence_result['answers'].append({'qid': q_dict['qid'], 'retrieve': ranked})
  else:
    raise ValueError('Invalid category')


100%|██████████| 150/150 [18:12<00:00,  7.28s/it]


In [50]:
compute_accuracy(sententence_result)

Accuracy: 90.00% (Correct predictions: 135, Total questions: 150)
Insurance: 90.00%, Finance: 82.00% , FAQ: 98.00% 


(0.9, [4, 10, 11, 35, 47, 61, 64, 67, 70, 76, 86, 90, 93, 99, 135])