In [1]:
!cp /kaggle/input/datasets-wheel/datasets-2.14.4-py3-none-any.whl /kaggle/working
!pip install  /kaggle/working/datasets-2.14.4-py3-none-any.whl

Processing ./datasets-2.14.4-py3-none-any.whl
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.1.0
    Uninstalling datasets-2.1.0:
      Successfully uninstalled datasets-2.1.0
Successfully installed datasets-2.14.4


In [2]:
# installing offline dependencies
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers
!pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl

!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

Processing /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Processing ./sentence-transformers
  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=126134 sha256=aaef9bb5b59e10e13958327c98293bbb584c7cb5ed952a5d2b1e1463fdb81180
  Stored in directory: /root/.cache/pip/wheels/6c/ea/76/d9a930b223b1d3d5d6aff69458725316b0fe205b854faf1812
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
Processing /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl
Installing collected packages: blingfir

In [3]:
!mkdir -p /kaggle/working/retrive

In [4]:
# DEBUG switches the question CSV: the gte-base validation-context file when
# True, the competition test.csv otherwise. `csv_path` is interpolated into
# the `!python ...` retrieval commands in later cells.
DEBUG = False
csv_path = (
    "/kaggle/input/gte-base-context/gte-base-valid-context-970.csv"
    if DEBUG
    else "/kaggle/input/kaggle-llm-science-exam/test.csv"
)

print(f'csv_path is {csv_path}')

csv_path is /kaggle/input/kaggle-llm-science-exam/test.csv


In [5]:
%%writefile /kaggle/working/retrive/search_document.py
import warnings
warnings.simplefilter('ignore')
import sys
import argparse
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable
import faiss
from faiss import write_index, read_index, read_VectorTransform
from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

def main():
    """
    Retrieve the best-matching Wikipedia articles for every question.

    Steps:
      1. Read the question CSV and build one query string per row from the
         prompt followed by the five answer options (columns A-E).
      2. Embed the queries with a half-precision SentenceTransformer.
      3. Search a FAISS index (moved to GPU 0) for the `--num-titles`
         nearest entries of each query, `--query-size` queries at a time.
      4. Map the returned index positions to article ids through the wiki
         index parquet and write `id`, `prompt_id`, `score` to
         `/kaggle/working/retrive/doc_<index file stem>.parquet`.

    `prompt_id` is the 0-based row position of the question in the CSV.

    NOTE: `--save-doc-path` and `--weight` are accepted for command-line
    compatibility but do not influence the written file; the output only
    ever contained `id`, `prompt_id` and `score`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-id', type=str, default='model',)
    parser.add_argument('--sim-model', default='/kaggle/input/all-minilm-l12-v2', type=str)
    parser.add_argument('--sentence-index-path', 
                        default="/kaggle/input/wikipedia-202307-minilm/wikipedia_202307_MiniLM-L12_seq512_title_neg4096.index", 
                        type=str)
    parser.add_argument('--wiki-index-path', 
                        default="/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", 
                        type=str)
    parser.add_argument('--test-csv-path', 
                        default='/kaggle/input/kaggle-llm-science-exam/test.csv', 
                        type=str)
    parser.add_argument('--save-doc-path', 
                        default="/kaggle/working/retrive/wikipedia_file_data0.parquet", 
                        type=str)
    parser.add_argument('--pca-file', 
                        default=None, 
                        type=str)
    parser.add_argument('--device', default=0, type=int)
    parser.add_argument('--max-length', default=512, type=int)
    parser.add_argument('--batch-size', default=8, type=int)
    parser.add_argument('--num-titles', default=5, type=int)
    parser.add_argument('--query-size', default=8, type=int)
    parser.add_argument('--weight', default=1.0, type=float)
    args = parser.parse_args()
    print(f"parsed document args: {args}")

    trn = pd.read_csv(args.test_csv_path)
    trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)
    ## Search using the prompt and answers to guide the search
    trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']

    print(f"{args.model_id}-01 load sentence model dir is {args.sim_model}!")
    model = SentenceTransformer(args.sim_model, device='cuda')
    model.max_seq_length = args.max_length
    model = model.half()

    ### load sentence index
    print(f"{args.model_id}-02 load sentence index path is {args.sentence_index_path}!")
    res = faiss.StandardGpuResources()
    sentence_index = read_index(args.sentence_index_path)
    sentence_index_gpu = faiss.index_cpu_to_gpu(res, 0, sentence_index)
    ## Save memory - the CPU copy of the index is no longer necessary
    del sentence_index
    _ = gc.collect()
    libc.malloc_trim(0)

    ### extract prompt embedding
    print(f"{args.model_id}-03 extract prompt embedding!")
    prompt_embeddings = model.encode(
        trn.prompt_answer_stem.values, 
        batch_size=args.batch_size, 
        device=args.device, 
        show_progress_bar=True, 
        convert_to_tensor=True, 
        normalize_embeddings=True)
    prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
    # The model runs in half precision; the search below is fed float32.
    prompt_embeddings = np.asarray(prompt_embeddings.astype('float32'))
    _ = gc.collect()
    if args.pca_file:
        print('use pca.')
        pca_mat = read_VectorTransform(args.pca_file)
        prompt_embeddings = pca_mat.apply_py(prompt_embeddings)
    else:
        print('No pca.')

    search_score = []
    search_index = []
    total = prompt_embeddings.shape[0]
    for i in tqdm(range(0, total, args.query_size)):
        ss, si = sentence_index_gpu.search(prompt_embeddings[i:i+args.query_size], args.num_titles)
        search_score.append(ss)
        search_index.append(si)
    search_score = np.concatenate(search_score)
    search_index = np.concatenate(search_index)
    ## Save memory - the GPU index and the embeddings are no longer necessary
    del sentence_index_gpu
    del prompt_embeddings
    _ = gc.collect()
    libc.malloc_trim(0)

    df = pd.read_parquet(args.wiki_index_path,columns=['id', 'file'])
    ## Get the article ids using the index
    print(f"{args.model_id}-04 get the article and associated file location using the index!")
    num_prompts, num_hits = search_index.shape
    flat_index = search_index.ravel()
    # FAISS pads a result row with -1 when it finds fewer than k neighbours;
    # -1 is not a label of `df`, so those slots are dropped instead of
    # letting the lookup raise KeyError.
    valid = flat_index >= 0
    wikipedia_file_data = df.loc[flat_index[valid], ['id']].reset_index(drop=True)
    # prompt_id = sample id (row position of the question in the CSV)
    wikipedia_file_data['prompt_id'] = np.repeat(np.arange(num_prompts), num_hits)[valid]
    wikipedia_file_data['score'] = search_score.ravel()[valid]
    wikipedia_file_data = wikipedia_file_data[['id', 'prompt_id', 'score']]

    # The output name is derived from the index file name (text before its
    # first '.'); merge_document.py reads the files back by these names.
    index_stem = args.sentence_index_path.split('/')[-1].split('.')[0]
    output_path = f"/kaggle/working/retrive/doc_{index_stem}.parquet"
    wikipedia_file_data.to_parquet(output_path)
    print(output_path)
    
if __name__ == '__main__':
    main()

Writing /kaggle/working/retrive/search_document.py


In [6]:
# Document retrieval with all-MiniLM-L12-v2 (neg4096 index, single unsplit file): top 3 articles per question.
!python /kaggle/working/retrive/search_document.py \
--model-id 'all-minilm-l12-v2-neg4096' \
--sim-model "/kaggle/input/all-minilm-l12-v2" \
--sentence-index-path "/kaggle/input/wikipedia-202307-minilm/wikipedia_202307_MiniLM-L12_seq512_title_neg4096.index" \
--wiki-index-path "/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.77075

parsed document args: Namespace(model_id='all-minilm-l12-v2-neg4096', sim_model='/kaggle/input/all-minilm-l12-v2', sentence_index_path='/kaggle/input/wikipedia-202307-minilm/wikipedia_202307_MiniLM-L12_seq512_title_neg4096.index', wiki_index_path='/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet', test_csv_path='/kaggle/input/kaggle-llm-science-exam/test.csv', save_doc_path='/kaggle/working/retrive/wikipedia_file_data0.parquet', pca_file=None, device=0, max_length=512, batch_size=8, num_titles=3, query_size=8, weight=0.77075)
all-minilm-l12-v2-neg4096-01 load sentence model dir is /kaggle/input/all-minilm-l12-v2!
all-minilm-l12-v2-neg4096-02 load sentence index path is /kaggle/input/wikipedia-202307-minilm/wikipedia_202307_MiniLM-L12_seq512_title_neg4096.index!
all-minilm-l12-v2-neg4096-03 extract prompt embedding!
Batches: 100%|██████████████████████████████████| 25/25 [00:02<00:00,  9.48it/s]
No pca.
100%|███████████████████████████████████████████| 25/25 [00:03<00:00, 

In [7]:
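# Document retrieval with gte-large: one run per index part (part1-part4), top 3 articles per question from each part.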
!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-large' \
--sim-model "/kaggle/input/gte-large" \
--sentence-index-path "/kaggle/input/wikipedia-gte-large-index/wikipedia_gte-large_seq512_title_pos768_part1.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part1.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.90725

!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-large' \
--sim-model "/kaggle/input/gte-large" \
--sentence-index-path "/kaggle/input/wikipedia-gte-large-index/wikipedia_gte-large_seq512_title_pos768_part2.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part2.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.90725

!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-large' \
--sim-model "/kaggle/input/gte-large" \
--sentence-index-path "/kaggle/input/wikipedia-gte-large-index/wikipedia_gte-large_seq512_title_pos768_part3.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part3.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.90725

!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-large' \
--sim-model "/kaggle/input/gte-large" \
--sentence-index-path "/kaggle/input/wikipedia-gte-large-index/wikipedia_gte-large_seq512_title_pos768_part4.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part4.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.90725

parsed document args: Namespace(model_id='gte-large', sim_model='/kaggle/input/gte-large', sentence_index_path='/kaggle/input/wikipedia-gte-large-index/wikipedia_gte-large_seq512_title_pos768_part1.index', wiki_index_path='/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part1.parquet', test_csv_path='/kaggle/input/kaggle-llm-science-exam/test.csv', save_doc_path='/kaggle/working/retrive/wikipedia_file_data0.parquet', pca_file=None, device=0, max_length=512, batch_size=8, num_titles=3, query_size=8, weight=0.90725)
gte-large-01 load sentence model dir is /kaggle/input/gte-large!
gte-large-02 load sentence index path is /kaggle/input/wikipedia-gte-large-index/wikipedia_gte-large_seq512_title_pos768_part1.index!
gte-large-03 extract prompt embedding!
Batches: 100%|██████████████████████████████████| 25/25 [00:06<00:00,  4.02it/s]
No pca.
100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 28.60it/s]
gte-large-04 get the article and associated file location 

In [8]:
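# Document retrieval with all-roberta-large-v1: one run per index part (part1-part4), top 3 articles per question from each part.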
!python /kaggle/working/retrive/search_document.py \
--model-id 'all-roberta-large-v1' \
--sim-model "/kaggle/input/all-roberta-large-v1" \
--sentence-index-path "/kaggle/input/wikipedia-all-roberta-large-v1-index/wikipedia_all-roberta-large-v1_seq512_title_pos768_part1.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part1.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.84375

!python /kaggle/working/retrive/search_document.py \
--model-id 'all-roberta-large-v1' \
--sim-model "/kaggle/input/all-roberta-large-v1" \
--sentence-index-path "/kaggle/input/wikipedia-all-roberta-large-v1-index/wikipedia_all-roberta-large-v1_seq512_title_pos768_part2.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part2.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.84375

!python /kaggle/working/retrive/search_document.py \
--model-id 'all-roberta-large-v1' \
--sim-model "/kaggle/input/all-roberta-large-v1" \
--sentence-index-path "/kaggle/input/wikipedia-all-roberta-large-v1-index/wikipedia_all-roberta-large-v1_seq512_title_pos768_part3.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part3.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.84375

!python /kaggle/working/retrive/search_document.py \
--model-id 'all-roberta-large-v1' \
--sim-model "/kaggle/input/all-roberta-large-v1" \
--sentence-index-path "/kaggle/input/wikipedia-all-roberta-large-v1-index/wikipedia_all-roberta-large-v1_seq512_title_pos768_part4.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part4.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.84375

parsed document args: Namespace(model_id='all-roberta-large-v1', sim_model='/kaggle/input/all-roberta-large-v1', sentence_index_path='/kaggle/input/wikipedia-all-roberta-large-v1-index/wikipedia_all-roberta-large-v1_seq512_title_pos768_part1.index', wiki_index_path='/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part1.parquet', test_csv_path='/kaggle/input/kaggle-llm-science-exam/test.csv', save_doc_path='/kaggle/working/retrive/wikipedia_file_data0.parquet', pca_file=None, device=0, max_length=512, batch_size=8, num_titles=3, query_size=8, weight=0.84375)
all-roberta-large-v1-01 load sentence model dir is /kaggle/input/all-roberta-large-v1!
all-roberta-large-v1-02 load sentence index path is /kaggle/input/wikipedia-all-roberta-large-v1-index/wikipedia_all-roberta-large-v1_seq512_title_pos768_part1.index!
all-roberta-large-v1-03 extract prompt embedding!
Batches: 100%|██████████████████████████████████| 25/25 [00:06<00:00,  3.99it/s]
No pca.
100%|█████████████████████████

In [9]:
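# Document retrieval with gte-base (pos1280 index): one run per index part (part1-part4), top 3 articles per question from each part.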
!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-base' \
--sim-model "/kaggle/input/gte-base" \
--sentence-index-path "/kaggle/input/wikipedia-gte-base-pos1280-index/wikipedia_gte-base_seq512_title_pos1280_part1.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part1.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.91225

!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-base' \
--sim-model "/kaggle/input/gte-base" \
--sentence-index-path "/kaggle/input/wikipedia-gte-base-pos1280-index/wikipedia_gte-base_seq512_title_pos1280_part2.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part2.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.91225

!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-base' \
--sim-model "/kaggle/input/gte-base" \
--sentence-index-path "/kaggle/input/wikipedia-gte-base-pos1280-index/wikipedia_gte-base_seq512_title_pos1280_part3.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part3.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.91225

!python /kaggle/working/retrive/search_document.py \
--model-id 'gte-base' \
--sim-model "/kaggle/input/gte-base" \
--sentence-index-path "/kaggle/input/wikipedia-gte-base-pos1280-index/wikipedia_gte-base_seq512_title_pos1280_part4.index" \
--wiki-index-path "/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part4.parquet" \
--test-csv-path {csv_path} \
--num-titles 3 \
--weight 0.91225

parsed document args: Namespace(model_id='gte-base', sim_model='/kaggle/input/gte-base', sentence_index_path='/kaggle/input/wikipedia-gte-base-pos1280-index/wikipedia_gte-base_seq512_title_pos1280_part1.index', wiki_index_path='/kaggle/input/wikipedia-index-quarters/wiki_2023_index_part1.parquet', test_csv_path='/kaggle/input/kaggle-llm-science-exam/test.csv', save_doc_path='/kaggle/working/retrive/wikipedia_file_data0.parquet', pca_file=None, device=0, max_length=512, batch_size=8, num_titles=3, query_size=8, weight=0.91225)
gte-base-01 load sentence model dir is /kaggle/input/gte-base!
gte-base-02 load sentence index path is /kaggle/input/wikipedia-gte-base-pos1280-index/wikipedia_gte-base_seq512_title_pos1280_part1.index!
gte-base-03 extract prompt embedding!
Batches: 100%|██████████████████████████████████| 25/25 [00:01<00:00, 15.30it/s]
No pca.
100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 31.95it/s]
gte-base-04 get the article and associated file lo

In [10]:
%%writefile /kaggle/working/retrive/merge_document.py
#### merge document
# Combine the per-retriever document hits written by search_document.py into
# one candidate list per question.
import os

import pandas as pd

EACH_TOPK = 5    # hits kept per question for each retriever
TOTAL_TOPK = 15  # hits kept per question after merging all retrievers

SAVE_DIR = '/kaggle/working/retrive/'
OUTPUT_PATH = "/kaggle/working/retrive/ensemble_document.parquet"

# One entry per retriever: (result files to concatenate, retriever weight).
RETRIEVERS = [
    (['doc_wikipedia_202307_MiniLM-L12_seq512_title_neg4096.parquet'], 0.77075),
    ([f'doc_wikipedia_gte-large_seq512_title_pos768_part{part}.parquet'
      for part in range(1, 5)], 0.90725),
    ([f'doc_wikipedia_all-roberta-large-v1_seq512_title_pos768_part{part}.parquet'
      for part in range(1, 5)], 0.84375),
    ([f'doc_wikipedia_gte-base_seq512_title_pos1280_part{part}.parquet'
      for part in range(1, 5)], 0.91225),
]


def top_k_per_prompt(df, k):
    """
    Keep the `k` highest-`score` rows of every `prompt_id` group.

    :param df: DataFrame with `prompt_id` and `score` columns and an unnamed
        index (the second level of the grouped result is read back as
        `level_1`, which is the name pandas gives an unnamed index level)
    :param k: Number of rows to keep per `prompt_id`
    :return: Selected rows, grouped by `prompt_id` with scores descending,
        re-indexed from 0
    """
    idx = df.groupby(['prompt_id'])['score'].nlargest(k).reset_index()['level_1']
    return df.loc[idx].reset_index(drop=True)


def load_retriever_docs(file_names, weight, save_dir=SAVE_DIR, k=EACH_TOPK):
    """
    Load one retriever's result files and keep its top `k` hits per question.

    :param file_names: Parquet file names (relative to `save_dir`) holding
        this retriever's hits, one file per index part
    :param weight: Value stored in the `weight` column of the result
    :param save_dir: Directory containing the parquet files
    :param k: Number of hits to keep per `prompt_id`
    :return: DataFrame with the columns of the input files plus `weight`
    """
    parts = [pd.read_parquet(os.path.join(save_dir, name)) for name in file_names]
    df = top_k_per_prompt(pd.concat(parts, ignore_index=True), k)
    df['weight'] = weight
    return df


def merge_docs(docs, k=TOTAL_TOPK):
    """
    Merge the hits of all retrievers into one ranked list per question.

    Scores of the same (`prompt_id`, `id`) pair are summed across retrievers
    and the `k` largest sums per `prompt_id` are kept.

    NOTE(review): the `weight` column is carried along but not applied to
    the summed score - confirm whether a weighted sum was intended.

    :param docs: List of DataFrames with `id`, `prompt_id`, `score` columns
    :param k: Number of documents to keep per `prompt_id`
    :return: DataFrame with the columns `id`, `prompt_id`, `score`
    """
    data = pd.concat(docs, ignore_index=True)
    data_with_final_score = data.groupby(['prompt_id', 'id'])['score'].agg('sum').reset_index()
    topk = top_k_per_prompt(data_with_final_score, k)
    return topk[['id', 'prompt_id', 'score']]


def main():
    """Build the ensemble document list and write it to `OUTPUT_PATH`."""
    docs = [load_retriever_docs(file_names, weight) for file_names, weight in RETRIEVERS]
    merge_docs(docs).to_parquet(OUTPUT_PATH)


if __name__ == '__main__':
    main()

Writing /kaggle/working/retrive/merge_document.py


In [11]:
!python /kaggle/working/retrive/merge_document.py

In [12]:
%%writefile /kaggle/working/retrive/split_sentence.py
from __future__ import annotations
import os
import sys
import argparse
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable
import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Convert raw documents into a DataFrame of text spans.

    Each document first becomes one whole-document row via
    `sectionize_documents`; unless `split_sentences` is False those rows are
    then broken into sentences with `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to split each document into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    sections = sectionize_documents(documents, document_ids, disable_progress_bar)
    if not split_sentences:
        return sections

    return sentencize(sections.text.values,
                      sections.document_id.values,
                      sections.offset.values,
                      filter_len,
                      disable_progress_bar)

def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap every document in a single row that spans its whole text.

    No actual section splitting takes place: each row's offset is
    `(0, len(document))`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset` (an empty frame when there is no input)
    """
    id_document_pairs = tqdm(zip(document_ids, documents),
                             total=len(documents),
                             disable=disable_progress_bar)
    rows = [
        {'document_id': doc_id, 'text': doc, 'offset': (0, len(doc))}
        for doc_id, doc in id_document_pairs
    ]

    sections = pd.DataFrame(rows)
    if sections.shape[0] == 0:
        # Nothing to sort; an empty frame has no columns to sort by.
        return sections
    return sections.sort_values(['document_id', 'offset']).reset_index(drop=True)
    
def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split documents into sentences with BlingFire.

    Can be used with `sectionize_documents` to further split documents into
    more manageable pieces. The start of each input offset is added to the
    sentence offsets so that sentences can be matched to their location in
    the original documents.

    Splitting is best-effort: if a document raises an exception it is
    skipped (sentences already collected from it are kept) and processing
    continues with the next one. Only `Exception` subclasses are swallowed,
    so Ctrl-C / `SystemExit` still stop the run.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                # Keep only sentences strictly longer than filter_len.
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    # Shift from document-relative to absolute offsets.
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    row = {}
                    row['document_id'] = document_id
                    row['text'] = sentence
                    row['offset'] = abs_offsets
                    document_sentences.append(row)
        except Exception:
            # Was a bare `except:`, which also caught KeyboardInterrupt and
            # SystemExit and made the loop impossible to interrupt.
            continue
    return pd.DataFrame(document_sentences)

def main():
    """
    Load the text of every retrieved article and split it into sentences.

    Reads the merged document list, translates article ids into row numbers
    of the full wiki index, loads only those rows from the full-text parquet,
    sentence-splits them and writes two files to /kaggle/working/retrive/:
    `processed_wiki_text_data.parquet` (sentences with a 1-based `location`
    counter per document) and `wikipedia_file_data.parquet` (unique
    `id`/`prompt_id` pairs sorted by `id`).
    """
    wikipedia_file_data = pd.read_parquet('/kaggle/working/retrive/ensemble_document.parquet')

    # Row number of each article inside the full index; used below as the
    # value of the `index` column filter on the full-text parquet.
    wiki_2023_index = pd.read_parquet('/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet',
                                      columns=['id'])
    wiki_2023_index['num_idx'] = range(len(wiki_2023_index))
    is_retrieved = wiki_2023_index['id'].isin(wikipedia_file_data['id'])
    wiki_2023_index = wiki_2023_index[is_retrieved].reset_index(drop=True)

    id_to_num_idx = wiki_2023_index.set_index(['id'])['num_idx']
    wikipedia_file_data['num_idx'] = wikipedia_file_data['id'].map(id_to_num_idx)
    all_indeces = list(wikipedia_file_data['num_idx'])

    id_prompt_pairs = wikipedia_file_data[['id', 'prompt_id']].drop_duplicates()
    wikipedia_file_data = id_prompt_pairs.sort_values(['id']).reset_index(drop=True)

    wiki_text_data = pd.read_parquet('/kaggle/input/wiki-2023-index-partition/wiki_2023_all.parquet',
                                     engine='pyarrow',
                                     filters=[('index', 'in', all_indeces)],
                                     columns=['index', 'id', 'text'])
    del wiki_text_data['index']
    gc.collect()
    libc.malloc_trim(0)

    ## Parse documents into sentences
    processed_wiki_text_data = process_documents(wiki_text_data.text.values,
                                                 wiki_text_data.id.values)

    # Running count of rows within each document, starting at 1.
    processed_wiki_text_data['loc'] = 1
    per_document = processed_wiki_text_data.groupby(['document_id'])['loc']
    processed_wiki_text_data['location'] = per_document.transform('cumsum')
    del processed_wiki_text_data['loc']

    processed_wiki_text_data.to_parquet('/kaggle/working/retrive/processed_wiki_text_data.parquet',
                                        index=False)
    wikipedia_file_data.to_parquet('/kaggle/working/retrive/wikipedia_file_data.parquet',
                                   index=False)
    
if __name__ == '__main__':
    main()

Writing /kaggle/working/retrive/split_sentence.py


In [13]:
!python /kaggle/working/retrive/split_sentence.py

100%|███████████████████████████████████| 1921/1921 [00:00<00:00, 772878.46it/s]
100%|██████████████████████████████████████| 1921/1921 [00:09<00:00, 193.04it/s]


In [14]:
%%writefile /kaggle/working/retrive/search_sentence.py
import warnings
warnings.simplefilter('ignore')
import os
import sys
import argparse
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable
import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

def main():
    """
    Pick, for every question, the sentences closest to it among the
    sentences of the articles retrieved for that question.

    Reads the question CSV plus the two parquet files written by
    split_sentence.py, embeds all sentences and all questions with a
    half-precision SentenceTransformer, and for each question searches a
    temporary flat FAISS index built from its candidate sentences. The
    result (`prompt_id`, `prompt_answer_stem`, `text`; up to
    `--num-sentences` rows per question, best match first) is written to
    `--save-sentence-path`.

    `prompt_id` is the row label of the question in the CSV and is also used
    as its row position in the embedding matrix, i.e. the CSV is expected to
    carry the default 0..n-1 index. `--wiki-path` is accepted but unused.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--sim-model', default='/kaggle/input/multi-qa-mpnet-base-cos-v1', type=str)
    parser.add_argument('--wiki-path', default='/kaggle/input/wikipedia-20230701', type=str)
    parser.add_argument('--test-csv-path', 
                        default='/kaggle/input/kaggle-llm-science-exam/test.csv', 
                        type=str)
    parser.add_argument('--save-sentence-path', 
                        default="/kaggle/working/retrive/sentence.parquet", 
                        type=str)
    parser.add_argument('--device', default=0, type=int)
    parser.add_argument('--max-length', default=384, type=int)
    parser.add_argument('--batch-size', default=8, type=int)
    parser.add_argument('--num-sentences', default=60, type=int)
    args = parser.parse_args()
    print(f"parsed sentence args: {args}")
    
    trn = pd.read_csv(args.test_csv_path)
    ## Combine all answers
    trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)
    ## Search using the prompt and answers to guide the search
    trn['prompt_answer_stem'] = trn['prompt']+ " " + trn['answer_all']

    model = SentenceTransformer(args.sim_model, device='cuda')
    model.max_seq_length = args.max_length
    model = model.half()
    
    ## Sentences of the retrieved articles and the article/question pairing
    processed_wiki_text_data = pd.read_parquet('/kaggle/working/retrive/processed_wiki_text_data.parquet')
    wikipedia_file_data = pd.read_parquet('/kaggle/working/retrive/wikipedia_file_data.parquet') 
    ## Get embeddings of the wiki text data
    wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                        batch_size=args.batch_size,
                                        device=args.device,
                                        show_progress_bar=True,
                                        convert_to_tensor=True,
                                        normalize_embeddings=True)
    wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()
    wiki_data_embeddings = np.asarray(wiki_data_embeddings.astype('float32'))
    _ = gc.collect()

    question_embeddings = model.encode(
        trn.prompt_answer_stem.values, 
        batch_size=args.batch_size, 
        device=args.device, 
        show_progress_bar=True, 
        convert_to_tensor=True, 
        normalize_embeddings=True)
    question_embeddings = question_embeddings.detach().cpu().numpy()
    question_embeddings = np.asarray(question_embeddings.astype('float32'))

    sentences = []
    for r in tqdm(trn.itertuples(), total=len(trn)):
        prompt_id = r.Index
        prompt_answer_stem = r.prompt_answer_stem
        # Articles retrieved for this question, then the rows holding their sentences.
        prompt_doc_ids = wikipedia_file_data[wikipedia_file_data['prompt_id']==prompt_id]['id'].values
        prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(prompt_doc_ids)].index.values
        if prompt_indices.shape[0] == 0:
            continue

        # Looked up once per question instead of once per returned hit.
        prompt_texts = processed_wiki_text_data.loc[prompt_indices]['text']

        prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
        prompt_index.add(wiki_data_embeddings[prompt_indices])
        ## Get the top matches for this question only; searching with every
        ## question and keeping one row made the loop quadratic in the
        ## number of questions.
        _, ii = prompt_index.search(question_embeddings[prompt_id:prompt_id + 1], args.num_sentences)
        for _i in ii[0]:
            # FAISS pads with -1 when the index holds fewer than
            # num_sentences vectors; `.iloc[-1]` would silently return the
            # last sentence, so padded slots are skipped.
            if _i < 0:
                continue
            sentences.append([prompt_id, prompt_answer_stem, prompt_texts.iloc[_i]])
    pd.DataFrame(sentences, columns=['prompt_id', 'prompt_answer_stem', 'text']).to_parquet(args.save_sentence_path, index=False)
    
if __name__ == '__main__':
    main()

Writing /kaggle/working/retrive/search_sentence.py


In [15]:
!python /kaggle/working/retrive/search_sentence.py \
--sim-model "/kaggle/input/sentencetransformer-hubs/all-MiniLM-L6-v2" \
--save-sentence-path "/kaggle/working/retrive/sentence_all-MiniLM-L6-v2.parquet"

parsed sentence args: Namespace(sim_model='/kaggle/input/sentencetransformer-hubs/all-MiniLM-L6-v2', wiki_path='/kaggle/input/wikipedia-20230701', test_csv_path='/kaggle/input/kaggle-llm-science-exam/test.csv', save_sentence_path='/kaggle/working/retrive/sentence_all-MiniLM-L6-v2.parquet', device=0, max_length=384, batch_size=8, num_sentences=60)
Batches: 100%|███████████████████████████| 13666/13666 [02:01<00:00, 112.28it/s]
Batches: 100%|██████████████████████████████████| 25/25 [00:00<00:00, 71.09it/s]
100%|█████████████████████████████████████████| 200/200 [00:09<00:00, 21.42it/s]


In [16]:
%%writefile /kaggle/working/retrive/rerank.py
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf


from collections.abc import Iterable

import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

import torch
import torch.nn as nn

from torch.utils.data import Dataset
from transformers import AutoConfig, AutoModel
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding


# --- Reranker configuration -------------------------------------------------
# Directory of the backbone whose tokenizer/config/weights RankModel loads.
RERANK_MODEL_NAME = '/kaggle/input/all-mpnet-base-v2'
# Checkpoint (state dict) loaded into RankModel further down in this script.
SEN_RERANK_MODEL = '/kaggle/input/rank-cls/si_all_mpnet_base_v2_len_512_bin_cls/model.pth'
# Probability cut-off passed to get_rank_id_context (spelling kept as-is because
# the name is referenced later in this script).
RANK_THRESOLD = 0.65
# NOTE(review): DEVICE is not referenced anywhere in the visible part of this
# script; the model and batches are moved to 'cuda' explicitly instead.
DEVICE = 0
# Maximum token length handed to the tokenizer for each (question, sentence) pair.
MAX_LENGTH = 512
# Batch size of the inference DataLoader.
BATCH_SIZE = 8

tokenizer = AutoTokenizer.from_pretrained(RERANK_MODEL_NAME)
# Flags the "pad a fast tokenizer" notice as already shown — presumably to keep
# the collator's tokenizer.pad() calls from logging it; confirm against transformers.
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
# Pads every batch dynamically with the tokenizer defined above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

class TextPairDataset(Dataset):
    """Map-style dataset that tokenizes aligned text pairs lazily.

    Args:
        text1: Indexable collection holding the first text of every pair.
        text2: Indexable collection holding the second text of every pair.
        tokenizer: Callable invoked as
            ``tokenizer(a, b, truncation=True, max_length=max_len)``.
        target: Optional indexable collection of labels; when omitted every
            item is labelled ``-1``.
        max_len: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, text1, text2, tokenizer, target=None, max_len=512) -> None:
        self.text1, self.text2 = text1, text2
        self.target = target
        # Attribute keeps its historical spelling; it is part of the class surface.
        self.tokenzer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of pairs, taken from the length of ``text1``."""
        return len(self.text1)

    def __getitem__(self, index):
        """Tokenize pair ``index`` and attach its label under ``"labels"``."""
        first = self.text1[index]
        second = self.text2[index]
        label = -1 if self.target is None else self.target[index]
        encoded = self.tokenzer(
            first,
            second,
            truncation=True,
            max_length=self.max_len,
        )
        encoded["labels"] = label
        return encoded
    
class RankModel(nn.Module):
    """Transformer encoder with attention pooling and a single-logit head.

    The backbone is loaded from ``model_path`` with dropout-related config
    entries set to zero and hidden states exposed. The last hidden layer is
    pooled with learned weights (softmax over dimension 1) and projected to
    one output value per example.

    Args:
        model_path: Path or identifier accepted by ``AutoConfig`` / ``AutoModel``.
    """

    def __init__(self, model_path) -> None:
        super().__init__()
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout": 0.0,
            "hidden_dropout_prob": 0.0,
            "attention_dropout": 0.0,
            "attention_probs_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        self.config = AutoConfig.from_pretrained(model_path)
        self.config.update(config_overrides)

        self.model = AutoModel.from_pretrained(model_path, config=self.config)

        # Submodule names and the Sequential layout determine the state-dict
        # keys, so they must stay exactly as they are for checkpoints to load.
        width = self.config.hidden_size
        self.attention = nn.Sequential(
            nn.Linear(width, width // 2),
            nn.Tanh(),
            nn.Linear(width // 2, 1),
            nn.Softmax(dim=1),
        )
        self.fc = nn.Linear(width, 1)

    def forward(self, inputs):
        """Return the head output for a dict of backbone keyword inputs."""
        token_states = self.model(**inputs).hidden_states[-1]
        token_weights = self.attention(token_states)
        pooled = (token_weights * token_states).sum(dim=1)
        return self.fc(pooled)
    
# Build the reranker, restore the fine-tuned checkpoint and move it to the GPU.
rank_model = RankModel(RERANK_MODEL_NAME)
rank_model.load_state_dict(torch.load(SEN_RERANK_MODEL))
rank_model.to('cuda')
print("rank model name: ", RERANK_MODEL_NAME)
print("load rank model from: ", SEN_RERANK_MODEL)

# Candidate (question stem, sentence) pairs written by search_sentence.py.
text_pair_for_rerank_df = pd.read_parquet('/kaggle/working/retrive/sentence_all-MiniLM-L6-v2.parquet')

# No targets are passed, so every item carries the placeholder label -1.
test_ds = TextPairDataset(text_pair_for_rerank_df['prompt_answer_stem'].tolist(), text_pair_for_rerank_df['text'].tolist(), tokenizer, max_len=MAX_LENGTH)
# shuffle=False keeps predictions in the same row order as the DataFrame,
# which the later positional assignment of scores relies on.
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, 
                     shuffle=False, num_workers=4, 
                     collate_fn=data_collator)

def valid_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients.

    Args:
        data_loader: Sized iterable of dict batches; each batch must contain a
            ``'labels'`` entry and every value must support ``.to(device)``.
        model: Object with ``eval()`` that is called with the batch dict
            (labels removed) and returns a tensor.
        device: Device the batch tensors are moved to.

    Returns:
        ``(outputs, targets)``: sigmoid-activated model outputs and the batch
        labels, both flattened over batches into Python lists.
    """
    model.eval()
    all_probs, all_targets = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Eval: "):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}

            # The model only receives the feature tensors, never the labels.
            labels = on_device.pop('labels')

            logits = model(on_device)
            all_targets += labels.detach().cpu().numpy().tolist()
            all_probs += torch.sigmoid(logits).detach().cpu().numpy().tolist()

    return all_probs, all_targets

# Score every candidate pair; the returned targets are placeholders and are discarded.
rank_preds, _ = valid_fn(test_dl, rank_model, 'cuda')
# valid_fn yields one single-element list per row; unwrap to a flat list of scores.
preds = [rp[0] for rp in rank_preds]


def get_rank_id_context(preds, val_df, thresold):
    """Build one context string per prompt from reranker scores.

    For every ``prompt_id`` the context is the space-joined set of unique
    sentences whose score exceeds ``thresold`` (in their original row order).
    If no sentence passes, it falls back to the five highest-scoring
    sentences of that prompt, joined in descending score order.

    Args:
        preds: One score per row of ``val_df``, in row order.
        val_df: DataFrame with at least ``prompt_id`` and ``text`` columns.
            It is not modified.
        thresold: Score cut-off (parameter name kept for compatibility).

    Returns:
        A Series named ``"context"``, sorted by and indexed by ``prompt_id``.
        Indexing by ``prompt_id`` (rather than by position) means that
        assigning the result to a frame whose index is the prompt id stays
        aligned even when some prompt has no candidate rows at all; such
        prompts simply receive no value.
    """
    # Work on a copy so the caller's frame does not gain helper columns.
    scored = val_df.assign(pred_probs=preds)

    # Fallback text: top-5 sentences per prompt, best first.
    fallback = (
        scored.sort_values("pred_probs", ascending=False)
        .groupby("prompt_id")
        .head(5)
        .groupby("prompt_id")["text"]
        .agg(" ".join)
    )

    # Preferred text: every distinct sentence scoring above the threshold.
    confident = (
        scored[scored["pred_probs"] > thresold]
        .groupby("prompt_id")["text"]
        .unique()
        .apply(" ".join)
    )

    context = confident.reindex(fallback.index)
    context = context.where(context.notna(), fallback).fillna(" ")
    return context.sort_index().rename("context")

# Load the questions that the retrieved context is attached to.
trn = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")
# trn = pd.read_csv("/kaggle/input/gte-base-context/gte-base-valid-context-970.csv")
# NOTE(review): this assignment aligns on the index, so it relies on the Series
# returned by get_rank_id_context lining up with trn's row index — verify for
# the case where a prompt has no candidate sentences.
trn['context'] = get_rank_id_context(preds, text_pair_for_rerank_df, RANK_THRESOLD)
# Persist question, context and the five options for the following notebook cells.
trn[["prompt", "context", "A", "B", "C", "D", "E"]].to_csv("/kaggle/working/retrive/test_context.csv", index=False)

Writing /kaggle/working/retrive/rerank.py


In [17]:
###rerank
!python /kaggle/working/retrive/rerank.py 

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
rank model name:  /kaggle/input/all-mpnet-base-v2
load rank model from:  /kaggle/input/rank-cls/si_all_mpnet_base_v2_len_512_bin_cls/model.pth
Eval: 100%|█████████████████████████████████| 1500/1500 [02:19<00:00, 10.75it/s]


In [18]:
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf

from collections.abc import Iterable

import faiss
from faiss import write_index, read_index, read_VectorTransform

from sentence_transformers import SentenceTransformer

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")

class CFG:
    """Settings for the embedding-based retrieval performed in this cell."""
    # SentenceTransformer checkpoint used to embed the question stems.
    EMB_MODEL = "/kaggle/input/gte-base"
    # Prebuilt FAISS index that the embedded stems are searched against.
    INDEX_PATH = "/kaggle/input/wikipedia-stem-index/parsed_gte-base.index"
    # Two parquet files whose `text` column is looked up with the search results.
    WIKI_PARSED_PLAINTEXT_PATH = "/kaggle/input/llm-models2/content/wikipedia-stem-plaintext/parsed.parquet"
    WIKI_COHERE_PLAINTEXT_PATH = "/kaggle/input/llm-models2/content/wikipedia-stem-plaintext/cohere.parquet"
    
    # Assigned to the embedding model's max_seq_length.
    MAX_LENGTH = 512
    # Batch size for model.encode.
    BATCH_SIZE = 32
    # Number of nearest neighbours requested per query from the index.
    MAX_DOC_NUM = 10
    
#     DEBUG = False
import pandas as pd
# Questions plus the reranked context produced by rerank.py.
trn = pd.read_csv("/kaggle/working/retrive/test_context.csv")
## Combine all answers
trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)

## Search using the prompt and answers to guide the search
# The prompt is concatenated three times ahead of the answers — presumably to
# weight the question more heavily than the options in the embedding.
trn['prompt_answer_stem'] = trn['prompt'] + " " +trn['prompt'] + " " +trn['prompt'] + " " + trn['answer_all']
model = SentenceTransformer(CFG.EMB_MODEL, device='cuda')
model.max_seq_length = CFG.MAX_LENGTH


# Embeddings are L2-normalised by encode() and then moved to a NumPy array for FAISS.
prompt_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=CFG.BATCH_SIZE, device=0, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()


# Load the CPU index from disk and copy it to GPU 0 for searching.
res = faiss.StandardGpuResources()
sentence_index = read_index(CFG.INDEX_PATH)
sentence_index_gpu = faiss.index_cpu_to_gpu(res, 0, sentence_index)

# Query the index in small chunks and collect scores / row ids per chunk.
QUERY_SIZE = 8
search_score = []
search_index = []
total = prompt_embeddings.shape[0]
for i in tqdm(range(0, total, QUERY_SIZE)):
    ss, si = sentence_index_gpu.search(prompt_embeddings[i:i+QUERY_SIZE], CFG.MAX_DOC_NUM)
    search_score.append(ss)
    search_index.append(si)
    
# Stack the per-chunk results into one array with a row per query.
search_score = np.concatenate(search_score)
search_index = np.concatenate(search_index)


## Save memory - delete sentence_index since it is no longer necessary
del sentence_index
del sentence_index_gpu
del prompt_embeddings
# NOTE(review): the embedding model is deliberately left alive here (the `del`
# and the CUDA cache flush are commented out), so it keeps its GPU memory
# until the name `model` is rebound later in the notebook.
#del model
_ = gc.collect()
#torch.cuda.empty_cache()
libc.malloc_trim(0)

def _gather_contexts(plaintext_path, doc_indices):
    """Return one space-joined context string per query.

    Args:
        plaintext_path: Parquet file whose ``text`` column is looked up.
        doc_indices: 2-D array-like; row ``q`` holds the index labels of the
            documents retrieved for query ``q``.

    Returns:
        List of strings, one per row of ``doc_indices``, in the same order.
    """
    text_df = pd.read_parquet(plaintext_path, columns=['text'])
    gathered = [
        " ".join(text_df.loc[row_ids].text.tolist())
        for row_ids in tqdm(doc_indices, total=len(doc_indices))
    ]

    ## Save memory - release the text table before the next one is loaded
    del text_df
    gc.collect()
    libc.malloc_trim(0)
    return gathered


# The same search results are resolved against both plaintext tables.
trn['context1'] = _gather_contexts(CFG.WIKI_PARSED_PLAINTEXT_PATH, search_index)
trn['context2'] = _gather_contexts(CFG.WIKI_COHERE_PLAINTEXT_PATH, search_index)

# Overwrite the context file with the two additional context columns.
save_cols = ["prompt", "context", "context1", "context2", "A", "B", "C", "D", "E"]
trn[save_cols].to_csv("/kaggle/working/retrive/test_context.csv", index=False)

del trn
_ = gc.collect()
libc.malloc_trim(0)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

1

In [19]:
import numpy as np
import pandas as pd
# Load the questions with their three context columns and add the bookkeeping
# columns the datasets below expect.
test_df = pd.read_csv("/kaggle/working/retrive/test_context.csv")
test_df.index = list(range(len(test_df)))
test_df['id'] = list(range(len(test_df)))
# Placeholder label: the dataset classes look up `answer` for every row.
test_df['answer'] = 'A'
# Missing text cells become the literal string 'none' so string concatenation
# in the datasets does not fail. `np.nan` is used because the `np.NaN` alias
# no longer exists in NumPy 2.0.
test_df = test_df.replace(np.nan, 'none')

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForMultipleChoice
import pytorch_lightning as pl
class CustomModel(nn.Module):
    """Thin wrapper around ``AutoModelForMultipleChoice`` that returns logits.

    Args:
        config: Model configuration used to build the transformer.
        dropout: Accepted for signature compatibility; not used by this class.
        pretrained: When true, weights are loaded with ``from_pretrained``;
            otherwise the architecture is created from ``config`` only.
    """

    def __init__(self, config, *, dropout=0.2, pretrained=False):
        super().__init__()

        # Transformer
        self.config = config
        # NOTE(review): the pretrained branch reads `model_dir` from the
        # enclosing module scope, not from an argument — confirm it is defined
        # before calling with pretrained=True.
        self.transformer = (
            AutoModelForMultipleChoice.from_pretrained(model_dir, config=self.config)
            if pretrained
            else AutoModelForMultipleChoice.from_config(self.config)
        )

    def _init_weights(self, module, config):
        """Normal-initialise ``module.weight`` and zero its bias if present."""
        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
        bias = module.bias
        if bias is not None:
            bias.data.zero_()

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the ``'logits'`` entry of the wrapped model's output."""
        return self.transformer(
            input_ids, attention_mask, token_type_ids=token_type_ids
        )['logits']
    
class CustomPLModel(pl.LightningModule):
    """Lightning module that holds a ``CustomModel`` under the attribute ``net``."""

    def __init__(self,model_dir):
        super().__init__()
        # NOTE(review): despite its name, `model_dir` is forwarded unchanged as
        # CustomModel's `config` argument.
        self.net = CustomModel(model_dir)

    def forward(self, input_ids,attention_mask,token_type_ids):
        """Delegate to the wrapped network and return its output."""
        net_inputs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.net(**net_inputs)
    
def load_weights(model_dir):
    """Build a ``CustomPLModel`` from files in ``model_dir`` and return it in eval mode.

    Reads ``deberta-v3-large-hf-weights_config.pth`` (a serialized config
    object) and ``deberta-v3-large-hf-weights_foldextra_swa.pth`` (a checkpoint
    with a ``'state_dict'`` entry) from ``model_dir``.

    Args:
        model_dir: Directory containing the two files above and a
            ``tokenizer`` sub-directory.

    Returns:
        The model, moved to CUDA and switched to evaluation mode.
    """
    config_path = f'{model_dir}/deberta-v3-large-hf-weights_config.pth'
    model_path = f'{model_dir}/deberta-v3-large-hf-weights_foldextra_swa.pth'
    # NOTE(review): torch.load unpickles arbitrary objects — only point this
    # at checkpoint files from a trusted source.
    config = torch.load(config_path)
    config._name_or_path = f'{model_dir}/tokenizer'
#     print(config)
    print(f'model_path is {model_path}')

    net = CustomPLModel(config).cuda()
    # Load tensors onto the CPU first; load_state_dict copies them into the model.
    state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)['state_dict']
    # strict=False tolerates key mismatches; report them so a checkpoint that
    # does not actually match the architecture cannot go unnoticed.
    load_result = net.load_state_dict(state_dict, strict=False)  # True
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f'WARNING: state dict mismatch for {model_path}: '
            f'missing={list(load_result.missing_keys)} '
            f'unexpected={list(load_result.unexpected_keys)}'
        )
    return net.eval()
import gc
# Device used for inference batches later in the notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_dir_path1 = '/kaggle/input/llm-models2/deberta-v3-large-max-len1024-3e-311297-val971/best'#lb=0.860
#/kaggle/input/llm-models2/deberta-v3-large-max-len1024-3e-311297-val971/best/deberta-v3-large-hf-weights_config.pth
model_dir_path2 = "/kaggle/input/llm-models2/deberta-v3-large-max-len1024-lr5-3e-322538-val970/swa" #lb=0.853
# A single tokenizer (from the first model directory) is shared by all models.
tokenizer = AutoTokenizer.from_pretrained(f'{model_dir_path1}/tokenizer')
# Models to ensemble, in loading order.
model_dir_list = [
    model_dir_path1,
    model_dir_path2,
]
models_list = []
for index, model_dir in enumerate(model_dir_list):
    print(f'Index {index} model dir is {model_dir}')
    model = load_weights(model_dir)
    models_list.append(model)
    # Only the loop-local name is dropped; the model stays alive in models_list.
    del model
    gc.collect()
print(f'ensemble {len(models_list)} models!')

Index 0 model dir is /kaggle/input/llm-models2/deberta-v3-large-max-len1024-3e-311297-val971/best
model_path is /kaggle/input/llm-models2/deberta-v3-large-max-len1024-3e-311297-val971/best/deberta-v3-large-hf-weights_foldextra_swa.pth
Index 1 model dir is /kaggle/input/llm-models2/deberta-v3-large-max-len1024-lr5-3e-322538-val970/swa
model_path is /kaggle/input/llm-models2/deberta-v3-large-max-len1024-lr5-3e-322538-val970/swa/deberta-v3-large-hf-weights_foldextra_swa.pth
ensemble 2 models!


In [20]:
# Maximum number of tokens per (context, question + option) pair.
MAX_INPUT = 1536

class LlmseDataset(torch.utils.data.Dataset):
    """Multiple-choice dataset pairing one context with each of the five options.

    Args:
        df: DataFrame with the context column, ``prompt``, the option columns
            ``A``-``E`` and ``answer``.
        context_col: Name of the column used as context. Defaults to
            ``'context'`` so existing single-argument callers are unaffected.
    """

    def __init__(self, df, context_col='context'):
        self.df = df
        self.context_col = context_col
        self.option_to_index = {option: idx for idx, option in enumerate('ABCDE')}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` into five context/option pairs plus a label."""
        example = self.df.iloc[idx]
        # Special tokens are written by hand, hence add_special_tokens=False below.
        first_sentence = ["[CLS] " + example[self.context_col]] * 5
        second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
        # Only the context side is truncated so question and option stay intact.
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first',
                                    max_length=MAX_INPUT, add_special_tokens=False)
        tokenized_example['label'] = self.option_to_index[example['answer']]

        return tokenized_example

In [21]:
class LlmseDataset1(torch.utils.data.Dataset):
    """Multiple-choice dataset that uses the ``context1`` column as context.

    Each item holds five tokenized (context, question + option) pairs and the
    index of the row's ``answer`` letter under ``'label'``.
    """

    def __init__(self, df):
        self.df = df
        self.option_to_index = dict(zip('ABCDE', range(5)))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # The same context is repeated once per answer option.
        context_side = ["[CLS] " + row['context1']] * 5
        question_part = " #### " + row['prompt'] + " [SEP] "
        option_side = [question_part + row[letter] + " [SEP]" for letter in 'ABCDE']
        # Special tokens are inserted manually; only the context is truncated.
        encoded = tokenizer(
            context_side,
            option_side,
            truncation='only_first',
            max_length=MAX_INPUT,
            add_special_tokens=False,
        )
        encoded['label'] = self.option_to_index[row['answer']]
        return encoded

In [22]:
class LlmseDataset2(torch.utils.data.Dataset):
    """Multiple-choice dataset that uses the ``context2`` column as context.

    Each item holds five tokenized (context, question + option) pairs and the
    index of the row's ``answer`` letter under ``'label'``.
    """

    def __init__(self, df):
        self.df = df
        self.option_to_index = {letter: position for position, letter in enumerate('ABCDE')}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        context_text = "[CLS] " + record['context2']
        # One copy of the context per answer option.
        contexts = [context_text for _ in range(5)]
        candidates = []
        for letter in 'ABCDE':
            candidates.append(" #### " + record['prompt'] + " [SEP] " + record[letter] + " [SEP]")
        # Special tokens are inserted manually; only the context is truncated.
        features = tokenizer(contexts, candidates, truncation='only_first',
                             max_length=MAX_INPUT, add_special_tokens=False)
        features['label'] = self.option_to_index[record['answer']]
        return features

In [23]:
from typing import Optional, Union
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples into padded ``(batch, choices, seq)`` tensors.

    Each incoming feature maps every key to a list with one entry per choice
    and carries its target under ``'label'`` or ``'labels'``. The label is
    removed from the feature (the input dicts are modified), the per-choice
    entries are padded together through ``tokenizer.pad`` and reshaped, and
    the labels are returned as an int64 tensor under ``'labels'``.
    """
    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Padding strategy forwarded to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Optional maximum length forwarded to ``tokenizer.pad``.
    max_length: Optional[int] = None
    # Optional multiple forwarded to ``tokenizer.pad``.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_key = "label" if 'label' in features[0].keys() else 'labels'
        labels = [example.pop(label_key) for example in features]
        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One flat record per (example, choice), example-major order.
        per_choice = [
            {key: values[choice] for key, values in example.items()}
            for example in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            per_choice,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure that was flattened for padding.
        batch = {key: tensor.view(n_examples, n_choices, -1) for key, tensor in padded.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [24]:
import os
from torch.utils.data import DataLoader
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)


def _build_test_loader(dataset):
    """Return the inference DataLoader used for every context variant.

    One question per batch, original row order, all CPU cores as workers.
    """
    return DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=data_collator,
        num_workers=os.cpu_count(),
        pin_memory=True,
        drop_last=False)


#### wiki document data
test_ds = LlmseDataset(test_df)
test_dl = _build_test_loader(test_ds)

###stem 270k parsed data
test_ds1 = LlmseDataset1(test_df)
test_dl1 = _build_test_loader(test_ds1)

###stem 270k cohere data
test_ds2 = LlmseDataset2(test_df)
test_dl2 = _build_test_loader(test_ds2)

In [25]:
from tqdm.auto import tqdm

def softmax(x):
    """Softmax over the last axis, shifted by the row maximum before exponentiating."""
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=-1, keepdims=True)
    return exponentials / normaliser

def do_inference(model,test_dl,device):
    """Run ``model`` over ``test_dl`` without gradients and stack its outputs.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword tensors and returning a tensor.
        test_dl: Iterable of dict batches containing those three keys.
        device: Device the batch tensors are moved to.

    Returns:
        The per-batch outputs, cast to float32, moved to the CPU and
        concatenated along the first dimension.
    """
    collected = []
    with tqdm(test_dl, leave=False) as pbar, torch.no_grad():
        for batch in pbar:
            model_inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
                'token_type_ids': batch['token_type_ids'].to(device),
            }
            logits = model(**model_inputs)
            # Cast before leaving the device, then keep only the CPU copy.
            collected.append(logits.to(torch.float).cpu())
    return torch.cat(collected)

In [26]:
# Accumulator for the per-model blended probabilities (becomes an array on first add).
all_preds = 0
for model in models_list:
    # Logits from the reranked-sentence context and from the `context1` column.
    preds0 = do_inference(model,test_dl,device)
    preds1 = do_inference(model,test_dl1,device)
    # The `context2` loader is built earlier but intentionally not used here.
    #preds2 = do_inference(model,test_dl2,device)
    # Weighted mean of the two probability vectors: weights 1 and 1.25, sum 2.25.
    preds = (softmax(preds0.numpy()) + softmax(preds1.numpy())*1.25  ) / 2.25
    all_preds += preds
    # Drops the loop-local names only; the model itself is still held by models_list.
    del model,preds0, preds1, preds
    gc.collect()
# Average over the ensemble members.
all_preds /= len(models_list)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/200 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/200 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/200 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/200 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [27]:
# Option indices per question, ordered from highest to lowest ensemble probability.
predictions_as_ids = np.argsort(-all_preds, 1)
# Map the indices 0-4 to the letters A-E.
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
# The submission holds the three best letters per question, separated by spaces.
test_df['prediction'] = [' '.join(row) for row in predictions_as_answer_letters[:, :3]]
submission = test_df[['id', 'prediction']]
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,prediction
0,0,D E B
1,1,A B E
2,2,A C D
3,3,C A E
4,4,D A C


In [28]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
import numpy as np
def precision_at_k(r, k):
    """Return the share of relevant entries among the first ``k`` items of ``r``.

    Args:
        r: Sequence of relevance flags (anything ``int()`` accepts).
        k: Cut-off; must be non-zero and at most ``len(r)``.

    Raises:
        AssertionError: If ``k`` exceeds ``len(r)`` or is zero.
    """
    assert k <= len(r)
    assert k != 0
    hits = 0
    for flag in r[:k]:
        hits += int(flag)
    return hits / k

def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3.

    Every question has exactly one correct label, so its score is
    ``1 / rank`` of the first correct prediction within the top three and
    ``0`` if there is none. Once the correct label has been scored, repeated
    predictions of it are ignored, so a row such as ``"A A A"`` cannot score
    more than 1.

    Args:
        predictions: Sequence of space-separated prediction strings.
        true_items: Correct label for each prediction, indexable by position
            ``0 .. len(predictions) - 1``.

    Returns:
        The mean score over all questions, or ``0.0`` for empty input.
    """
    U = len(predictions)
    if U == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    map_at_3 = 0.0
    for u in range(U):
        user_true = true_items[u]
        top_three = predictions[u].split()[:3]
        for rank, item in enumerate(top_three, start=1):
            if item == user_true:
                map_at_3 += 1.0 / rank
                # Only the first hit counts for a single-answer question.
                break
    return map_at_3 / U

In [29]:
# Local sanity check: only runs when the submission has exactly 200 rows —
# presumably the case where test.csv mirrors the labelled train.csv; confirm.
if len(submission) == 200:
    train = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')
    preds = [pred for pred in submission['prediction']]
    print(MAP_at_3(preds, train["answer"]))

### Earlier note recorded cv=0.9925; the output captured for this run is 0.995.

0.995


In [30]:
!rm -r /kaggle/working/datasets-2.14.4-py3-none-any.whl
!rm -r /kaggle/working/sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
