# Inference

In [1]:
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers
!pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl

!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

Processing /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Processing ./sentence-transformers
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=126134 sha256=be368231afe1403a4374877a9e392ebe39365facd8deea77306a4b250cb093e8
  Stored in directory: /root/.cache/pip/wheels/6c/ea/76/d9a930b223b1d3d5d6aff69458725316b0fe205b854faf1812
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
Processing /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl
Installing collected packages: blingfire
Successfully installed blingfir

In [2]:
import os
import gc
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm
import blingfire as bf
from __future__ import annotations
from collections.abc import Iterable
import faiss
from faiss import write_index, read_index
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

import torch
import ctypes
libc = ctypes.CDLL("libc.so.6")
from dataclasses import dataclass
from typing import Optional, Union
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader
import warnings
warnings.filterwarnings("ignore")

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 9,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Main helper function to process documents from the EMR.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param document_type: String denoting the document type to be processed
    :param document_sections: List of sections for a given document type to process
    :param split_sentences: Flag to determine whether to further split sections into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `section`, `offset`
    """
    
    df = sectionize_documents(documents, document_ids, disable_progress_bar)

    if split_sentences:
        df = sentencize(df.text.values, 
                        df.document_id.values,
                        df.offset.values, 
                        filter_len, 
                        disable_progress_bar)
    return df

In [4]:
def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Obtains the sections of the imaging reports and returns only the 
    selected sections (defaults to FINDINGS, IMPRESSION, and ADDENDUM).

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    processed_documents = []
    for document_id, document in tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar):
        row = {}
        text, start, end = (document, 0, len(document))
        row['document_id'] = document_id
        row['text'] = text
        row['offset'] = (start, end)

        processed_documents.append(row)

    _df = pd.DataFrame(processed_documents)
    if _df.shape[0] > 0:
        return _df.sort_values(['document_id', 'offset']).reset_index(drop=True)
    else:
        return _df

In [5]:
def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 9,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split a document into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    to ensure that after splitting, the sentences can be matched to the
    location in the original documents.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable tuple of the start and end indices
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `section`, `offset`
    """

    document_sentences = []
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    row = {}
                    row['document_id'] = document_id
                    row['text'] = sentence
                    row['offset'] = abs_offsets
                    document_sentences.append(row)
        except:
            continue
    return pd.DataFrame(document_sentences)

In [6]:
SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
DEVICE = 0
MAX_LENGTH = 384
BATCH_SIZE = 32
WIKI_PATH = "/kaggle/input/wikipedia-20230701"
wiki_files = os.listdir(WIKI_PATH)

In [7]:
trn = pd.concat([
    pd.read_csv('/kaggle/input/sim-data-2/extra_val.csv'),
    pd.read_csv('/kaggle/input/sim-data-2/extra_eval_mos.csv'),
    pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv'),
])

trn = trn[[ 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer']]

In [8]:
model = SentenceTransformer(SIM_MODEL, device='cuda')
model.max_seq_length = MAX_LENGTH
model = model.half()

In [9]:
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")


In [10]:
prompt_embeddings = model.encode(trn.prompt.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
_ = gc.collect()

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
search_score, search_index = sentence_index.search(prompt_embeddings, 6)

In [12]:
del sentence_index
del prompt_embeddings
_ = gc.collect()
libc.malloc_trim(0)

1

In [13]:
df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet",
                     columns=['id', 'file'])

In [14]:
wikipedia_file_data = []

for i, (scr, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score)):
    scr_idx = idx
    _df = df.loc[scr_idx].copy()
    _df['prompt_id'] = i
    wikipedia_file_data.append(_df)
wikipedia_file_data = pd.concat(wikipedia_file_data).reset_index(drop=True)
wikipedia_file_data = wikipedia_file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)

## Save memory - delete df since it is no longer necessary
del df
_ = gc.collect()
libc.malloc_trim(0)

  0%|          | 0/298 [00:00<?, ?it/s]

1

In [15]:
wiki_text_data = []

for file in tqdm(wikipedia_file_data.file.unique(), total=len(wikipedia_file_data.file.unique())):
    _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file']==file]['id'].tolist()]
    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])

    _df_temp = _df[_df['id'].isin(_id)].copy()
    del _df
    _ = gc.collect()
    libc.malloc_trim(0)
    wiki_text_data.append(_df_temp)
wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

  0%|          | 0/28 [00:00<?, ?it/s]

In [16]:
processed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)


  0%|          | 0/1674 [00:00<?, ?it/s]

  0%|          | 0/1674 [00:00<?, ?it/s]

In [17]:
wiki_data_embeddings = model.encode(processed_wiki_text_data.text,
                                    batch_size=BATCH_SIZE,
                                    device=DEVICE,
                                    show_progress_bar=True,
                                    convert_to_tensor=True,
                                    normalize_embeddings=True)#.half()
wiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/2716 [00:00<?, ?it/s]

In [18]:
_ = gc.collect()


In [19]:
## Combine all answers
trn['answer_all'] = trn.apply(lambda x: " ".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)


## Search using the prompt and answers to guide the search
trn['prompt_answer_stem'] = trn['prompt'] + " " + trn['answer_all']

In [20]:
question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
question_embeddings = question_embeddings.detach().cpu().numpy()

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
## Parameter to determine how many relevant sentences to include
NUM_SENTENCES_INCLUDE = 22

## List containing just Context
contexts = []

for r in tqdm(trn.itertuples(), total=len(trn)):

    prompt_id = r.Index

    prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(wikipedia_file_data[wikipedia_file_data['prompt_id']==prompt_id]['id'].values)].index.values

    if prompt_indices.shape[0] > 0:
        prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
        prompt_index.add(wiki_data_embeddings[prompt_indices])

        context = ""
        
        ## Get the top matches
        ss, ii = prompt_index.search(question_embeddings, NUM_SENTENCES_INCLUDE)
        for _s, _i in zip(ss[prompt_id], ii[prompt_id]):
            context += processed_wiki_text_data.loc[prompt_indices]['text'].iloc[_i] + " "
        
    contexts.append(context)

  0%|          | 0/298 [00:00<?, ?it/s]

In [22]:
trn['context'] = contexts


In [23]:
trn[["prompt", "context", "A", "B", "C", "D", "E",'answer']].to_csv("./test_context.csv", index=False)

In [24]:
test_df = pd.read_csv("test_context.csv")
test_df.index = list(range(len(test_df)))
test_df['id'] = list(range(len(test_df)))
test_df["prompt"] = test_df["context"] + " #### " +  test_df["prompt"]
valid_label = test_df.loc[:, 'answer'].values

In [2]:
# test_df = pd.read_csv("/kaggle/input/sim-data-2/val298_context.csv")
# test_df.index = list(range(len(test_df)))
# test_df['id'] = list(range(len(test_df)))
# # test_df["prompt"] = test_df["context"].apply(lambda x: x[:1750]) + " #### " +  test_df["prompt"]
# valid_label = test_df.loc[:, 'answer'].values

In [25]:
test_df['prompt'][0]

'Methanol acquired the name wood alcohol because it was once produced chiefly by the destructive distillation of wood. Methanol (also called methyl alcohol and wood spirit, amongst other names) is an organic chemical and the simplest aliphatic alcohol, with the formula CH3OH (a methyl group linked to a hydroxyl group, often abbreviated as MeOH). Grade "AA" methanol contains trace amounts of ethanol as well. This addition of methanol exempts industrial ethanol (commonly known as "denatured alcohol" or "methylated spirit") from liquor excise taxation in the U.S. and other countries. ==See also== *Aminomethanol *Methanol (data page) *Trimethyl carbinol ==References== ==Further reading== *Robert Boyle, The Sceptical Chymist (1661) – contains account of distillation of wood alcohol. ==External links== * * Methyl Alcohol (Methanol) CDC/NIOSH, links to safety information * CDC – NIOSH Pocket Guide to Chemical Hazards – Methyl Alcohol * Methanol Fact Sheet – National Pollutant Inventory Catego

In [26]:
# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again
options = 'ABCDE'
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    # The AutoModelForMultipleChoice class expects a set of question/answer pairs
    # so we'll copy our question 5 times before tokenizing
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    for option in options:
        second_sentence.append(example[option])
    # Our tokenizer will turn our text into token IDs BERT can understand
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

In [27]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [28]:
model_dir = "/kaggle/input/llm-science-run-context-2"


In [29]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)


In [30]:
tokenized_test_dataset = Dataset.from_pandas(test_df[['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer']].drop(columns=['id'])).map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

Map:   0%|          | 0/298 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [31]:
model_dir = "/kaggle/input/llm-science-run-context-2"
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
test_807 = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_807.append(outputs.logits.cpu().detach())

test_807 = torch.cat(test_807)



model_dir = "/kaggle/input/llm-se-debertav3-large"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
test_771 = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_771.append(outputs.logits.cpu().detach())

test_771 = torch.cat(test_771)


model_dir = "/kaggle/input/zak-scratchfrozen-context200/Zak_Frozen_context"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
test_788 = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_788.append(outputs.logits.cpu().detach())

test_788 = torch.cat(test_788)


model_dir = "/kaggle/input/the-cute-context200/Zak_Frozen_context_Cute"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
test_791 = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_791.append(outputs.logits.cpu().detach())

test_791 = torch.cat(test_791)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [32]:

model_dir = "/kaggle/input/frozen160clip/Zak_Frozen_context"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
test_p784 = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_p784.append(outputs.logits.cpu().detach())

test_p784 = torch.cat(test_p784)





In [33]:
model_dir = "/kaggle/input/frozen160clip/Zak_Frozen_context"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
test_p779 = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_p779.append(outputs.logits.cpu().detach())

test_p779 = torch.cat(test_p779)

In [34]:
model_dir = "/kaggle/input/new-partner/model_v2"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
model.eval()
new_partner = []

for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    new_partner.append(outputs.logits.cpu().detach())

new_partner = torch.cat(new_partner)

In [39]:
test_df = pd.concat([
    pd.read_csv('/kaggle/input/sim-data-2/extra_val.csv'),
    pd.read_csv('/kaggle/input/sim-data-2/extra_eval_mos.csv'),
    pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv'),
])
tokenized_test_dataset = Dataset.from_pandas(test_df[['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer']].drop(columns=['id'])).map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

  0%|          | 0/298 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [40]:
model = AutoModelForMultipleChoice.from_pretrained(f'/kaggle/input/2023kagglellm-deberta-v3-large-model1').cuda()
model.eval()
preds = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    preds.append(outputs.logits.cpu().detach())

test756 = torch.cat(preds)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [35]:
def map3(y_true, y_pred):
    m = (y_true.reshape((-1,1)) == y_pred)
    return np.mean(np.where(m.any(axis=1), m.argmax(axis=1)+1, np.inf)**(-1))

In [41]:
valid_pred_ids = np.argsort(-test_p779, 1)
valid_pred_letters = np.array(list('ABCDE'))[valid_pred_ids][:, :3]
valid_map3 = map3(valid_label, valid_pred_letters)
print(valid_map3)

0.7555928411633109


In [None]:
# test_807    0.860738255033557
# test_791    0.8288590604026845
# test_p784   0.8182326621923937
# test_788    0.8126398210290828
# test_p779   0.8182326621923937
# test_771    0.7908277404921701
# test756     0.8036912751677852
# new_partner 0.7841163310961969

The Loop time ! 


4 models

In [29]:
for s in range (0,100):
  a = 100 - s
  for i in range (a):
    z = a - i
    
#     predictions = ( s * test_807 + i * test_791  +  z * new_partner ) / 100     
#     0.8780760626398211 22 45 33
    predictions = ( s * test_807 + i * test_791  +  z * new_partner ) / 100     

    valid_pred_ids = np.argsort(-predictions, 1)
    valid_pred_letters = np.array(list('ABCDE'))[valid_pred_ids][:, :3]
    valid_map3 = map3(valid_label, valid_pred_letters)

    if valid_map3 > 0.868:
        print(valid_map3 ,s, i , z)

0.8680089485458614 31 24 45
0.8685682326621925 31 25 44
0.8680089485458614 31 32 37
0.8680089485458614 31 33 36
0.8680089485458612 32 20 48
0.8680089485458612 32 21 47
0.8680089485458612 32 22 46
0.8680089485458614 32 25 43
0.8680089485458614 36 26 38
0.8680089485458612 55 27 18
0.8680089485458612 56 25 19
0.8680089485458612 56 26 18
0.8680089485458612 56 27 17
0.8680089485458612 57 25 18
0.8680089485458612 57 26 17


In [None]:
# test_807    0.860738255033557
# test_791    0.8288590604026845
# test_p784   0.8182326621923937
# test_788    0.8126398210290828
# test_p779   0.8182326621923937
# test_771    0.7908277404921701
# test756     0.8036912751677852
# new_partner 0.7841163310961969

In [43]:
for s in range (100):
  a = 100 - s
  for i in range (a):
    z = a - i
    for m in range (z):
      x = z - m
      # print (s+ i + m + x)
# 0.8691275167785235 38 19 40 3
# 0.8825503355704698 15 28 22 35

      # test_pred = ((i * test_pred_161_dash ) + (m * test_smsm_eff) + (x * test_smsm)) / 100
      predictions = ( x * test_807 + i * test756  +  m * new_partner +  s * test_791) / 100     

      valid_pred_ids = np.argsort(-predictions, 1)
      valid_pred_letters = np.array(list('ABCDE'))[valid_pred_ids][:, :3]
      valid_map3 = map3(valid_label, valid_pred_letters)
        
      if valid_map3 >=  0.881431767337807  :
        print(valid_map3 ,x, i , m , s)

0.8814317673378077 14 35 24 27
0.8825503355704698 14 31 23 32
0.8814317673378077 12 31 25 32
0.8814317673378077 13 29 25 33
0.8814317673378077 14 31 22 33
0.8814317673378077 13 29 24 34
0.8819910514541388 11 29 26 34
0.8814317673378077 14 32 20 34
0.8814317673378077 13 33 20 34
0.8825503355704698 15 28 22 35
0.8814317673378077 13 29 23 35
0.8814317673378077 13 32 20 35
0.8814317673378077 13 33 19 35
0.8814317673378077 12 34 19 35
0.8814317673378077 8 48 1 43


5 models

In [44]:
for s in range(0, 100):
  a = 100 - s
  for i in range(a):
    z = a - i
    for m in range(z):
      x = z - m
      for d in range(x):
        b = x - d  
        print (s+ i + m + d + b)

      # test_pred = ((i * test_pred_161_dash ) + (m * test_smsm_eff) + (x * test_smsm)) / 100
        predictions = ( s * test_807 + i * test_791  +  m * new_partner +  d * test756 + b * test_p784) / 100  


        valid_pred_ids = np.argsort(-predictions, 1)
        valid_pred_letters = np.array(list('ABCDE'))[valid_pred_ids][:, :3]
        valid_map3 = map3(valid_label, valid_pred_letters)

        if valid_map3 >= 0.882:
            print(valid_map3 ,s, i , m , d , b)

0.8825503355704698 14 31 23 31 1


In [None]:
test_predictions = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    test_predictions.append(outputs.logits.cpu().detach())

test_predictions = torch.cat(test_predictions)

predictions_as_ids = np.argsort(-test_predictions, 1)

predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
# predictions_as_answer_letters[:3]

predictions_as_string = test_df['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]

In [None]:
submission = test_df[['id', 'prediction']]
submission.to_csv('submission.csv', index=False)