In [1]:
%%capture
# Install xformers (pinned below 0.0.26, from the PyTorch cu121 wheel index)
# and Unsloth with its `kaggle-new` extra straight from GitHub.
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

import os
# Turn off Weights & Biases logging for this session.
# NOTE(review): transformers warns that this variable is deprecated in favour of
# `report_to`; the trainer below already passes `report_to = "none"`.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import pandas as pd
import re
import csv
from tqdm import tqdm
import gc

In [3]:
from unsloth import FastLanguageModel
import torch
# Model-loading settings. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# Load the base model and its tokenizer through Unsloth with the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Math-7B-Instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


==((====))==  Unsloth 2024.10.7: Fast Qwen2 patching. Transformers = 4.45.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/139 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.52k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [4]:
# Rebind `model` to the LoRA-wrapped version returned by Unsloth; the adapters
# target the attention (q/k/v/o) and MLP (gate/up/down) projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # 4x longer contexts auto supported!
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.10.7 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


# Load the dataset



In [5]:
import pandas as pd
# Training split of the competition data; later cells read its "Problem" and
# "Answer" columns.
df=pd.read_csv("/kaggle/input/dlsprint3/train.csv")

In [6]:
# Instruction template with two positional `{}` slots: the problem text and the
# solution (filled with the answer for training, left empty at inference time).
# `{{}}` is an escaped literal `{}` for `str.format`.
# NOTE(review): this is not a raw string, so the `\b` in `\boxed` is parsed as a
# backspace escape -- the rendered prompt contains "\x08oxed{}", not "\boxed{}".
# Before switching to `\\boxed`, confirm that answer extraction ignores the
# echoed prompt; the decoded generation appears to include it, in which case the
# empty box in the instructions would be the first `\boxed{...}` match.
prompt = """Your task is to solve a math problem given in Bengali. Follow these steps:

1.Translate the problem into English.
2.Identify the key information and variables.
3.Outline your approach to solving the problem.
4.Show your work step by step, explaining each step clearly.
5.Write your final answer within \boxed{{}} notation.
6.Provide a brief explanation of the result.

Ensure your solution is clear and easy to follow. If there are multiple possible answers, mention this and explain why.
Problem:
{}
Solution:
{}
"""

# End-of-sequence marker appended to every rendered training example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of rows into full training texts.

    Args:
        examples: Batch mapping that provides parallel "Problem" and "Answer"
            sequences.

    Returns:
        A dict with a single "text" key holding one rendered string per row:
        the prompt template filled with the problem and answer, followed by
        ``EOS_TOKEN``.
    """
    problems = examples["Problem"]
    answers = examples["Answer"]
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        prompt.format(problem, answer) + EOS_TOKEN
        for problem, answer in zip(problems, answers)
    ]
    return {"text": rendered}

from datasets import Dataset, load_dataset

# Wrap the training DataFrame in a Hugging Face Dataset and add the rendered
# "text" column in batched mode.
dataset = Dataset.from_pandas(df).map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

In [7]:
# Pad on the right-hand side; set before the trainer is constructed.
tokenizer.padding_side = 'right'

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We train for one full epoch (`num_train_epochs=1`); for a quick smoke test, set `max_steps` to a small number instead. We also support TRL's `DPOTrainer`!

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments


# Supervised fine-tuning over the rendered "text" column of `dataset`.
# No evaluation set is configured (the eval-related options are commented out).
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
#     eval_dataset = eval_dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 2 examples per device x 4 accumulation steps per optimizer update.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 0,
#         max_steps = 10,
        num_train_epochs = 1,
        learning_rate = 5e-5,
        # Use bf16 when the GPU reports support for it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  # no external logging integrations
#         load_best_model_at_end = True,
#         save_strategy = 'steps',
#         eval_strategy = 'steps',
#         save_total_limit = 1,
#         metric_for_best_model = 'loss'
    ),
)

Map (num_proc=2):   0%|          | 0/209 [00:00<?, ? examples/s]

In [9]:
# Run fine-tuning and keep whatever `train()` returns for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 209 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 26
 "-____-"     Number of trainable parameters = 40,370,176


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --no-deps unsloth transformers git+https://github.com/huggingface/trl.git`


Step,Training Loss
10,2.6392
20,2.2163


<a name="Inference"></a>
### Inference
Let's run the model! Each test problem is inserted into the prompt's `Problem:` slot, and the `Solution:` slot is left blank for the model to complete.

In [10]:
import re

In [11]:
def extract_answer(result):
    """Pull an integer answer out of the first usable ``\\boxed{...}`` in a text.

    Boxes are scanned in order of appearance. For each one, every decimal digit
    inside it is concatenated (so ``1,000`` becomes ``1000``) and converted to
    ``int``. Boxes that contain no decimal digit, such as an empty ``\\boxed{}``,
    are skipped instead of ending the search.

    Args:
        result: Generated text to scan; ``None`` or an empty string is allowed.

    Returns:
        The extracted integer, or ``None`` when no box contains a decimal digit.
    """
    if not result:
        return None

    for match in re.finditer(r'\\boxed{(.*?)}', result):
        # str.isdecimal (not str.isdigit): characters such as superscript digits
        # pass isdigit() but make int() raise ValueError.
        digits = ''.join(filter(str.isdecimal, match.group(1)))
        if digits:
            return int(digits)
    return None

In [12]:
def majority_answer(answers):
    """Return the most frequent non-``None`` answer.

    Args:
        answers: Iterable of hashable answers; ``None`` entries are ignored.

    Returns:
        The answer with the highest count. When several answers share the
        highest count, the one that appeared first wins. ``None`` is returned
        when there is no non-``None`` answer at all.
    """
    valid = [candidate for candidate in answers if candidate is not None]
    if not valid:
        return None

    tally = {}
    for candidate in valid:
        tally[candidate] = tally.get(candidate, 0) + 1

    # max() walks the dict in insertion order and keeps the first maximum, which
    # gives the first-seen answer priority on ties.
    return max(tally, key=tally.get)

In [13]:
# Test split of the competition data; the submission loop reads the first two
# columns of each row as the problem id and the problem text.
test_df=pd.read_csv("/kaggle/input/dlsprint3/test.csv")

In [14]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def generate_response(input_text):
    """Generate a solution for one problem and return the extracted answer.

    Args:
        input_text: Problem statement placed in the ``Problem:`` slot of
            ``prompt``; the ``Solution:`` slot is left empty for the model.

    Returns:
        The value chosen by ``majority_answer`` from the answers that
        ``extract_answer`` finds in the generated completions, or ``None``
        when no answer could be extracted.
    """
    prompt_1 = prompt.format(input_text, "")
    inputs = tokenizer([prompt_1], return_tensors="pt").to("cuda")
    # NOTE(review): temperature and min_p are sampling parameters; confirm that
    # sampling is actually enabled for this model (e.g. `do_sample=True`),
    # otherwise they may have no effect.
    outputs = model.generate(**inputs, max_new_tokens=2000, use_cache=True,temperature=1, min_p=0.1)

    # The generated sequences start with the prompt tokens. Decode only the new
    # tokens so answer extraction never scans the instructions or the problem text.
    prompt_length = inputs["input_ids"].shape[1]
    completions = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    answers = [extract_answer(completion) for completion in completions]
    return majority_answer(answers)

In [15]:
# Write the model to the local "lora_model" directory.
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [16]:
# Value written when no answer can be extracted for a problem.
# NOTE(review): the rationale for choosing 36 is not visible in this notebook.
DEFAULT_ANSWER = 36

# The with-block closes the file even if generation fails part-way through;
# newline='' is what the csv module expects from files it writes to.
with open('submission.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ID', 'Answer'])

    for row in tqdm(test_df.values):
        problem_id = row[0]
        problem = row[1]
        answer = generate_response(problem)
        print(answer)  # raw extraction result (None when nothing was found)

        if answer is None:
            answer = DEFAULT_ANSWER
        print(answer)  # value actually written to the submission

        writer.writerow([problem_id, answer])
        # Inference takes a long time per row; flush so finished rows survive
        # an interrupted run.
        file.flush()

  1%|          | 1/100 [00:31<52:17, 31.69s/it]

4
4


  2%|▏         | 2/100 [01:11<59:31, 36.45s/it]

10
10


  3%|▎         | 3/100 [03:22<2:08:53, 79.72s/it]

None
36


  4%|▍         | 4/100 [04:50<2:12:29, 82.81s/it]

4048
4048


  5%|▌         | 5/100 [05:29<1:46:03, 66.98s/it]

91
91


  6%|▌         | 6/100 [06:09<1:30:54, 58.02s/it]

0
0


  7%|▋         | 7/100 [08:22<2:07:55, 82.53s/it]

None
36


  8%|▊         | 8/100 [09:56<2:11:59, 86.09s/it]

225
225


  9%|▉         | 9/100 [10:21<1:41:24, 66.87s/it]

14
14


 10%|█         | 10/100 [12:35<2:11:26, 87.63s/it]

None
36


 11%|█         | 11/100 [13:33<1:56:41, 78.67s/it]

2
2


 12%|█▏        | 12/100 [14:40<1:50:04, 75.05s/it]

8
8


 13%|█▎        | 13/100 [15:39<1:41:48, 70.21s/it]

2001
2001


 14%|█▍        | 14/100 [16:02<1:20:18, 56.03s/it]

1000
1000


 15%|█▌        | 15/100 [17:59<1:45:09, 74.23s/it]

None
36


 16%|█▌        | 16/100 [19:08<1:41:51, 72.75s/it]

240
240


 17%|█▋        | 17/100 [20:32<1:45:33, 76.31s/it]

11
11


 18%|█▊        | 18/100 [22:19<1:56:43, 85.40s/it]

None
36


 19%|█▉        | 19/100 [23:17<1:44:04, 77.09s/it]

17
17


 20%|██        | 20/100 [23:39<1:20:56, 60.70s/it]

24
24


 21%|██        | 21/100 [24:30<1:15:57, 57.69s/it]

97
97


 22%|██▏       | 22/100 [24:58<1:03:27, 48.82s/it]

96
96


 23%|██▎       | 23/100 [26:19<1:14:52, 58.34s/it]

15
15


 24%|██▍       | 24/100 [28:22<1:38:25, 77.71s/it]

39
39


 25%|██▌       | 25/100 [28:50<1:18:50, 63.07s/it]

145
145


 26%|██▌       | 26/100 [30:10<1:23:54, 68.03s/it]

4096
4096


 27%|██▋       | 27/100 [31:43<1:31:56, 75.57s/it]

6
6


 28%|██▊       | 28/100 [33:55<1:50:55, 92.44s/it]

None
36


 29%|██▉       | 29/100 [34:21<1:25:49, 72.53s/it]

29
29


 30%|███       | 30/100 [36:33<1:45:34, 90.50s/it]

None
36


 31%|███       | 31/100 [38:47<1:59:00, 103.48s/it]

100
100


 32%|███▏      | 32/100 [39:03<1:27:32, 77.25s/it] 

20
20


 33%|███▎      | 33/100 [40:16<1:24:46, 75.92s/it]

None
36


 34%|███▍      | 34/100 [41:39<1:25:47, 77.99s/it]

1224
1224


 35%|███▌      | 35/100 [42:48<1:21:34, 75.30s/it]

1
1


 36%|███▌      | 36/100 [43:25<1:08:13, 63.96s/it]

5
5


 37%|███▋      | 37/100 [44:27<1:06:25, 63.26s/it]

144789
144789


 38%|███▊      | 38/100 [45:00<55:59, 54.18s/it]  

50
50


 39%|███▉      | 39/100 [45:47<52:50, 51.98s/it]

252
252


 40%|████      | 40/100 [48:01<1:16:33, 76.56s/it]

9
9


 41%|████      | 41/100 [48:43<1:05:15, 66.36s/it]

102
102


 42%|████▏     | 42/100 [49:53<1:05:03, 67.30s/it]

60
60


 43%|████▎     | 43/100 [50:33<56:10, 59.13s/it]  

18
18


 44%|████▍     | 44/100 [51:42<58:04, 62.21s/it]

4
4


 45%|████▌     | 45/100 [52:23<50:59, 55.62s/it]

None
36


 46%|████▌     | 46/100 [53:14<48:57, 54.40s/it]

65
65


 47%|████▋     | 47/100 [54:02<46:13, 52.34s/it]

32342123101201
32342123101201


 48%|████▊     | 48/100 [54:56<45:50, 52.90s/it]

3125
3125


 49%|████▉     | 49/100 [57:09<1:05:28, 77.02s/it]

None
36


 50%|█████     | 50/100 [57:41<52:48, 63.37s/it]  

525
525


 51%|█████     | 51/100 [58:12<43:46, 53.60s/it]

44
44


 52%|█████▏    | 52/100 [58:59<41:21, 51.70s/it]

None
36


 53%|█████▎    | 53/100 [59:38<37:38, 48.05s/it]

15
15


 54%|█████▍    | 54/100 [1:00:12<33:37, 43.85s/it]

1
1


 55%|█████▌    | 55/100 [1:01:10<35:57, 47.95s/it]

462
462


 56%|█████▌    | 56/100 [1:02:31<42:23, 57.80s/it]

5
5


 57%|█████▋    | 57/100 [1:03:11<37:39, 52.55s/it]

21
21


 58%|█████▊    | 58/100 [1:03:41<32:08, 45.92s/it]

0
0


 59%|█████▉    | 59/100 [1:04:23<30:34, 44.74s/it]

None
36


 60%|██████    | 60/100 [1:05:30<34:16, 51.42s/it]

12
12


 61%|██████    | 61/100 [1:05:53<27:52, 42.88s/it]

1746
1746


 62%|██████▏   | 62/100 [1:06:17<23:27, 37.04s/it]

10
10


 63%|██████▎   | 63/100 [1:06:57<23:25, 37.98s/it]

22335577
22335577


 64%|██████▍   | 64/100 [1:07:47<25:01, 41.72s/it]

4
4


 65%|██████▌   | 65/100 [1:09:25<34:01, 58.34s/it]

1349
1349


 66%|██████▌   | 66/100 [1:09:58<28:51, 50.93s/it]

44100
44100


 67%|██████▋   | 67/100 [1:10:52<28:32, 51.89s/it]

1099
1099


 68%|██████▊   | 68/100 [1:11:47<28:02, 52.59s/it]

465
465


 69%|██████▉   | 69/100 [1:12:14<23:16, 45.06s/it]

11
11


 70%|███████   | 70/100 [1:12:44<20:17, 40.59s/it]

2
2


 71%|███████   | 71/100 [1:13:12<17:45, 36.74s/it]

16
16


 72%|███████▏  | 72/100 [1:14:15<20:53, 44.76s/it]

1250
1250


 73%|███████▎  | 73/100 [1:15:03<20:33, 45.68s/it]

256
256


 74%|███████▍  | 74/100 [1:17:14<30:54, 71.34s/it]

None
36


 75%|███████▌  | 75/100 [1:17:40<24:02, 57.68s/it]

3
3


 76%|███████▌  | 76/100 [1:18:37<22:57, 57.38s/it]

None
36


 77%|███████▋  | 77/100 [1:19:25<20:54, 54.53s/it]

168
168


 78%|███████▊  | 78/100 [1:19:51<16:55, 46.14s/it]

6
6


 79%|███████▉  | 79/100 [1:20:16<13:54, 39.75s/it]

57
57


 80%|████████  | 80/100 [1:20:43<11:58, 35.94s/it]

1
1


 81%|████████  | 81/100 [1:21:48<14:04, 44.43s/it]

136
136


 82%|████████▏ | 82/100 [1:22:55<15:23, 51.28s/it]

613
613


 83%|████████▎ | 83/100 [1:24:45<19:32, 69.00s/it]

6
6


 84%|████████▍ | 84/100 [1:25:22<15:48, 59.30s/it]

16
16


 85%|████████▌ | 85/100 [1:25:57<12:59, 52.00s/it]

102
102


 86%|████████▌ | 86/100 [1:26:51<12:18, 52.78s/it]

1
1


 87%|████████▋ | 87/100 [1:27:35<10:49, 49.96s/it]

50
50


 88%|████████▊ | 88/100 [1:28:59<12:02, 60.19s/it]

25
25


 89%|████████▉ | 89/100 [1:29:28<09:19, 50.82s/it]

6
6


 90%|█████████ | 90/100 [1:30:23<08:40, 52.08s/it]

105
105


 91%|█████████ | 91/100 [1:31:07<07:27, 49.77s/it]

28
28


 92%|█████████▏| 92/100 [1:32:10<07:09, 53.66s/it]

169
169


 93%|█████████▎| 93/100 [1:32:50<05:48, 49.73s/it]

9
9


 94%|█████████▍| 94/100 [1:33:21<04:24, 44.05s/it]

24
24


 95%|█████████▌| 95/100 [1:34:10<03:46, 45.38s/it]

1012
1012


 96%|█████████▌| 96/100 [1:34:38<02:40, 40.16s/it]

9
9


 97%|█████████▋| 97/100 [1:36:49<03:22, 67.55s/it]

44484
44484


 98%|█████████▊| 98/100 [1:38:00<02:16, 68.46s/it]

521
521


 99%|█████████▉| 99/100 [1:38:25<00:55, 55.36s/it]

71
71


100%|██████████| 100/100 [1:38:56<00:00, 59.37s/it]

4
4



