In [None]:
# credits: https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
# credits: https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

## Imports and Config

In [None]:
import gc
import re

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

# NOTE(review): not read by any cell visible in this notebook — confirm whether it can be removed.
PRIVATE = True
# Device string for GPU execution.
device = 'cuda'
# Directory of the Mixtral 8x7B Instruct checkpoint; passed to the from_pretrained calls below.
MODEL_PATH = "/kaggle/input/mixtral/pytorch/8x7b-instruct-v0.1-hf/1"


## Data Load

In [None]:
# Read train.csv from the competition input directory and preview its first rows.
# NOTE(review): `train` is not referenced by any later cell visible here.
train = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
train.head()

In [None]:
# Read test.csv from the competition input directory and preview its first rows.
# Its `problem` column feeds the inference loop and its `id` column goes into the submission.
test = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
test.head()

## Model Load

In [None]:
# Load the weights in 4-bit NF4 with double quantization and bfloat16 compute.
# Quantizing the weights is what keeps Mixtral 8x7B within GPU memory here.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# The checkpoint's configuration is used unchanged. Gradient checkpointing is
# deliberately not enabled: it trades extra compute for lower activation memory
# during the backward pass, and this notebook only generates text with gradients
# disabled, so it would not reduce memory use.
config = AutoConfig.from_pretrained(MODEL_PATH)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# device_map="auto" leaves layer placement across the available devices to the loader.
# trust_remote_code is left at its default so that no Python code shipped alongside
# the checkpoint is executed while loading.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    quantization_config=quantization_config,
    config=config,
)

In [None]:
def gen_prompt(problem):
    """Wrap a problem statement in the instruction/response prompt template.

    Args:
        problem: Text of the problem; inserted verbatim under the
            "### Instruction:" heading.

    Returns:
        The full prompt string. The template combines literal line breaks
        with escaped newlines, so its sections are separated by runs of
        blank lines, and the string starts and ends with a newline.
    """
    prompt = f"""
Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n
### Instruction:\n{problem}\n\n
### Response: Let's think step by step. The final response should be a single number in the last line of your response.
"""
    return prompt

def naive_parse(answer):
    """Return the last run of consecutive ASCII digits in ``answer`` as an int.

    Args:
        answer: Text to scan, e.g. the final line of a model response.

    Returns:
        The integer value of the right-most digit run; "x = 12, y = 034"
        gives 34. Signs, decimal points and thousands separators are not
        interpreted, so "-5" gives 5 and "1,234" gives 234.

    Raises:
        ValueError: If ``answer`` contains no digit.
    """
    # [0-9] rather than \d keeps the match restricted to ASCII digits.
    digit_runs = re.findall(r"[0-9]+", answer)
    if not digit_runs:
        raise ValueError(f"no digits found in answer: {answer!r}")
    return int(digit_runs[-1])

In [None]:
# Upper bound on the number of tokens generated per problem.
MAX_NEW_TOKENS = 1500

answers = []

# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for i, problem in enumerate(tqdm(test['problem'])):
    # Fallback answer recorded when generation or parsing fails for this problem.
    answer = 0
    try:
        query_prompt = gen_prompt(problem)
        messages = [{"role": "user", "content": query_prompt}]

        # Apply the tokenizer's chat template and move the token ids to the GPU.
        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            encoded_output = model.generate(
                inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # The returned sequence starts with the prompt tokens. Dropping them by
        # length is more reliable than removing the prompt text from the decoded
        # string, which silently does nothing if decoding does not reproduce the
        # prompt character for character.
        generated_tokens = encoded_output[0][inputs.shape[1]:]
        decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        print(i)
        print(decoded_output)

        # The prompt asks for the final number on the last line of the response.
        parsed = naive_parse(decoded_output.split('\n')[-1])
        print(parsed)
        # Only the value modulo 1000 is kept as the answer.
        answer = parsed % 1000
        print(answer)
    except Exception as e:
        # Best effort: report the failure and keep the fallback answer so the
        # remaining problems are still processed.
        print(f"problem {i} failed: {e!r}")
    finally:
        # Release cached GPU memory after every problem, including failed ones.
        torch.cuda.empty_cache()
        gc.collect()

    # Exactly one entry per row of `test`, in row order.
    answers.append(answer)

In [None]:
# Attach the predictions collected by the inference loop as a new `answer` column.
test['answer'] = answers

In [None]:
# Write the `id` and `answer` columns to submission.csv with a header row and without the DataFrame index.
test[['id','answer']].to_csv("submission.csv", header=True, index=False)

In [None]:
# Preview the first rows of the two submission columns.
test[['id','answer']].head()