In [1]:
!pip install unsloth
import unsloth
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.18-py3-none-any.whl.metadata (9.2 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth)
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0

In [2]:
from unsloth import is_bfloat16_supported  #checking if it works on kaggle t4
import torch 
import re

max_seq_length=512  
lora_rank=32  

#model and tokenizer
model, tokenizer=FastLanguageModel.from_pretrained(
    model_name="meta-llama/meta-Llama-3.1-8B-Instruct",  
    max_seq_length=768, 
    load_in_4bit=True,  #i used it so as to reduce memory usage
    fast_inference=False,  #u may use if you have linux
    max_lora_rank=lora_rank,  
    gpu_memory_utilization =0.85,
)



==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [3]:
#peft with lora
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  #this is rank parameter
    target_modules=[
        "v_proj", "o_proj","q_proj", "k_proj",  #attention
        "gate_proj", "down_proj", "up_proj",    # mlp
    ],  #layers to be finetuned by lora
    lora_alpha = 64,  
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=max_seq_length,
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
import pandas as pd
df = pd.read_csv('/kaggle/input/dataset/grpo_csv.csv', encoding='ISO-8859-1')

df['Question'] = df['Question'].fillna('[EMPTY QUESTION]').astype(str)
df['Answer'] = df['Answer'].fillna('<reasoning>[EMPTY ANSWER]</reasoning><answer>[NO ANSWER]</answer>').astype(str)

In [5]:
df.to_csv('/kaggle/working/cleaned_grpo_csv.csv',index=False)
print(df.head())

                                            Question  \
0  Explain the matching principle and its importa...   
1  What is the difference between FIFO and LIFO i...   
2  How does the Sarbanes-Oxley Act impact corpora...   
3  Explain the concept of present value and its a...   
4  What are the key differences between financial...   

                                              Answer  
0  <reasoning>The matching principle in accountin...  
1  <reasoning>FIFO (First-In-First-Out) assumes t...  
2  <reasoning>[EMPTY ANSWER]</reasoning><answer>[...  
3  <reasoning>Present value is the current worth ...  
4  <reasoning>Financial accounting focuses on rep...  


In [6]:
SYSTEM_PROMPT="""
You MUST format responses EXACTLY like this:

<reasoning>
[Your detailed analysis here. Minimum 3 sentences]
</reasoning>

<answer>
[Your concise final answer here. Exactly 1 sentence]
</answer>

FAILURE EXAMPLE (DO NOT DO THIS):
The revenue recognition principle states...

SUCCESS EXAMPLE:
<reasoning>
Revenue recognition principles dictate that income should be recorded when...
</reasoning>
<answer>
Revenue is recognized when earned and realizable.</answer>
"""


In [8]:
from datasets import Dataset
def get_questions_and_answers_dataset(csv_file_path='/kaggle/input/dataset/grpo_csv.csv',max_samples=None)->Dataset:
    df = pd.read_csv(csv_file_path,encoding='ISO-8859-1')

    df['Question']=df['Question'].fillna('[EMPTY QUESTION]').astype(str)
    df['Answer'] = df['Answer'].fillna('<reasoning>[EMPTY ANSWER]</reasoning><answer>[NO ANSWER]</answer>').astype(str)
    
    #limiting samples
    if max_samples is not None and max_samples<len(df):
        df=df.head(max_samples)

    data_list=[]
    for _, row in df.iterrows():
        prompt = [
            {'role':'system','content':SYSTEM_PROMPT},
            {'role': 'user','content':row['Question']}
        ]
        #reason.  and answer tag format is used for formatted answer variable
        formatted_answer= row['Answer']

        data_list.append({
            'prompt':prompt,
            'answer':formatted_answer
        })

    return Dataset.from_list(data_list)

In [9]:
dataset = get_questions_and_answers_dataset('/kaggle/input/dataset/grpo_csv.csv', max_samples=100)  
print(dataset[0])

{'prompt': [{'content': '\nYou MUST format responses EXACTLY like this:\n\n<reasoning>\n[Your detailed analysis here. Minimum 3 sentences]\n</reasoning>\n\n<answer>\n[Your concise final answer here. Exactly 1 sentence]\n</answer>\n\nFAILURE EXAMPLE (DO NOT DO THIS):\nThe revenue recognition principle states...\n\nSUCCESS EXAMPLE:\n<reasoning>\nRevenue recognition principles dictate that income should be recorded when...\n</reasoning>\n<answer>\nRevenue is recognized when earned and realizable.</answer>\n', 'role': 'system'}, {'content': 'Explain the matching principle and its importance in financial reporting.', 'role': 'user'}], 'answer': "<reasoning>The matching principle in accounting requires recording expenses in the same period as the revenues they help generate. For example, depreciation for a machine should be recorded in the accounting period it was used to produce goods. This principle ensures that financial statements accurately reflect the company's performance and prevents

In [10]:
def extract_xml_answer_from_output(text: str) -> str:
    """
    Extracts the content between <answer> tags from the model's output.
    """
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()
def accounting_terminology_reward(completions, **kwargs) -> list[float]:
    """Rewards responses that use appropriate accounting terminology."""
    key_terms=['liability', 'asset', 'equity', 'revenue', 'expense', 'balance sheet', 
                 'income statement', 'cash flow', 'gaap', 'ifrs', 'depreciation', 
                 'amortization', 'accrual', 'audit', 'financial statement', 'inventory']
    
    rewards=[]
    for completion in completions:
        content=completion[0]['content'].lower()
        reward=sum(term in content for term in key_terms) * 0.1
        rewards.append(min(reward, 0.5))
    
    return rewards


def semantic_correctness_reward(prompts,completions,answer, **kwargs) -> list[float]:
    """Quick semantic correctness approximation using word overlap."""
    rewards=[]
    for completion,expected_ans in zip(completions,answer):
        generated=set(extract_xml_answer_from_output(completion[0]['content']).lower().split())
        expected=set(extract_xml_answer_from_output(expected_ans).lower().split())

        if not generated or not expected:
            rewards.append(0.0)
            continue

        overlap=generated.intersection(expected)
        score=len(overlap)/len(expected)
        
        rewards.append(min(score, 1.0) * 2.0)
    
    return rewards

# def accounting_principle_reward(prompts, completions, **kwargs) -> list[float]:
#     """Efficient principle check with simplified logic."""
#     principles = {
#         'matching': ['matching principle', 'expense recognition', 'accrual'],
#         'revenue recognition': ['revenue recognition', 'realized', 'earned'],
#         'conservatism': ['conservatism', 'prudence', 'lower of cost'],
#         'materiality': ['materiality', 'significant', 'threshold'],
#         'going concern': ['going concern', 'operational continuity'],
#         'consistency': ['consistency', 'comparable', 'uniform application']
#     }
    
#     rewards = []
#     for prompt, completion in zip(prompts, completions):
#         question = prompt[-1]['content'].lower()
#         response = completion[0]['content'].lower()

#         #Default reward if no direct match
#         reward = 0.3

#         for principle, terms in principles.items():
#             if any(term in question for term in terms):
#                 if any(term in response for term in terms):
#                     reward = 0.5  #immediate full reward on matching principles
#                     break
#                 else: 
#                     reward = 0.0  #clear negative reward on missing principle
#                     break

#         rewards.append(reward)

#     return rewards

In [11]:
def accounting_principle_reward(prompts, completions, **kwargs) -> list[float]:
    principles = {
        'matching': ['matching principle', 'expense recognition', 'accrual', 'period costs', 'cost matching'],
        'revenue recognition': ['revenue recognition', 'realized', 'earned', 'performance obligation', 'ASC 606', 'IFRS 15'],
        'conservatism': ['conservatism', 'prudence', 'lower of cost', 'asset impairment', 'loss contingency'],
        'materiality': ['materiality', 'significant', 'threshold', 'omission impact', 'professional judgment'],
        'going concern': ['going concern', 'operational continuity', 'liquidity risk', 'bankruptcy risk'],
        'consistency': ['consistency', 'comparable', 'uniform application', 'accounting policy'],
        'historical cost': ['historical cost', 'original cost', 'acquisition cost', 'historical basis'],
        'fair value': ['fair value', 'market value', 'mark-to-market', 'ASC 820', 'IFRS 13'],
        'full disclosure': ['full disclosure', 'footnote disclosure', 'transparency', 'contingent liability'],
        'economic entity': ['economic entity', 'business entity', 'separate entity', 'consolidation'],
        'inventory valuation': ['FIFO', 'LIFO', 'weighted average', 'net realizable value', 'lower cost'],
        'lease accounting': ['ASC 842', 'IFRS 16', 'right-of-use', 'lease liability', 'operating lease'],
        'impairment': ['impairment loss', 'recoverable amount', 'CGU', 'indefinite-lived assets'],
        'foreign currency': ['functional currency', 'translation adjustment', 'spot rate', 'hedging instrument'],
        'deferred taxes': ['temporary difference', 'deferred tax asset', 'valuation allowance', 'DTL'],
        'segment reporting': ['operating segment', 'ASC 280', 'IFRS 8', 'geographic segment'],
        'earnings management': ['earnings per share', 'EBITDA', 'non-GAAP measures', 'pro forma'],
        'financial instruments': ['FVTPL', 'FVOCI', 'amortized cost', 'credit risk', 'IFRS 9'],
        'business combinations': ['goodwill', 'purchase price allocation', 'contingent consideration', 'IFRS 3'],
        'related parties': ['related party transaction', 'arm\'s length', 'control relationship'],
        'subsequent events': ['subsequent events', 'post-balance sheet events', 'adjusting events']
    }
    
    rewards = []
    for prompt, completion in zip(prompts, completions):
        question = prompt[-1]['content'].lower()
        response = completion[0]['content'].lower()
        reward = 0.3  #base reward
        
        #question-principle alignment first
        for principle, terms in principles.items():
            if any(term in question for term in terms):
                # If question mentions principle, require explicit response
                if any(term in response for term in terms):
                    reward = 0.7  
                    break
                else:
                    reward = 0.1  
                    break
        else:
            #if no specific principle,check implicit coverage
            coverage = sum(
                0.1 for principle, terms in principles.items()
                if any(term in response for term in terms)
            )
            reward = min(reward + coverage, 0.5)  #0.5max for unsolicited stuff
            
        rewards.append(reward)
    
    return rewards


In [12]:
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    full_pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    partial_patterns = [
        (r"<reasoning>.*?</reasoning>", 0.5),  #reasoning blockcheck
        (r"<answer>.*?</answer>", 0.5),#answer block check
    ]
    
    rewards = []
    for completion in completions:
        try:
            if isinstance(completion, list):
                #if list,usefirst element
                if len(completion) > 0 and isinstance(completion[0], dict):
                    text = completion[0].get("content", "")
                else:
                    text = ""
            elif isinstance(completion, dict):
                #if dict,get content
                text = completion.get("content", "")
            else:
                text = ""

            
            #if full xml
            if re.search(full_pattern, text, re.DOTALL):
                rewards.append(1.0)
                continue

            partial_reward = 0.0
            for pattern, reward_value in partial_patterns:
                if re.search(pattern, text, re.DOTALL):
                    partial_reward += reward_value

            rewards.append(partial_reward)
        except (IndexError, KeyError, TypeError) as e:
            print(f"Error processing completion {completion}: {e}")
            rewards.append(0.0)
    
    return rewards


In [13]:
from trl import GRPOConfig, GRPOTrainer
import pandas as pd
training_args = GRPOConfig(
    learning_rate = 2e-4,               
    adam_beta1 = 0.9,                   
    adam_beta2 = 0.99,               
    weight_decay = 0.1,
#gradual learning rate warmup over 10% of training
    lr_scheduler_type = "cosine",      
    optim = "paged_adamw_8bit",   
    logging_steps = 1,             #every step
    bf16 = False,      
    fp16 = True, 
    per_device_train_batch_size =4 ,#due to memory constraints
    gradient_accumulation_steps = 2,    
    num_generations = 6,        
    max_prompt_length = 128,     
    max_completion_length = 96,

    # num_train_epochs = 1,              #commented out in favor of max_steps
    max_steps = 100,                 
    save_steps = 20,                              
    report_to = "none",          
    output_dir = "outputs", 
    ddp_find_unused_parameters = False,
    tf32 = False,  #T4 doesn't support TF32
    dataloader_num_workers = 2,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 4 to the `num_generations` of 6


In [None]:
# class DebugGRPOTrainer(GRPOTrainer):
#     def training_step(self, model, inputs, return_outputs=False):
#         # Call the original training step
#         loss = super().training_step(model, inputs, return_outputs)

#         if self.state.global_step % 5 == 0:  # Every 5 steps, print debug info
#             print(f"\n--- Training Step {self.state.global_step} ---")
            
#             # Check that inputs is a list and attempt to access the completions.
#             if isinstance(inputs, list) and len(inputs) > 0 and 'completions' in inputs[0]:
#                 completions = inputs[0]['completions']
                
#                 # Print each completion’s content to examine its structure
#                 for i, completion in enumerate(completions[:5]):  # Print up to first 5
#                     if isinstance(completion, dict):
#                         print(f"Completion {i}: {repr(completion.get('content', ''))}")
#                     else:
#                         print(f"Completion {i} (unexpected type): {repr(completion)}")
#                     print("-" * 50)
                
#                 # Now, apply the reward function; it will print each processed text.
#                 strict_rewards = strict_format_reward_func(completions)
#                 print(f"Strict Format Rewards: {strict_rewards}")
#             else:
#                 print("WARNING: 'completions' key not found in inputs or inputs is empty.")

#         return loss


In [14]:
print(dataset[0])


{'prompt': [{'content': '\nYou MUST format responses EXACTLY like this:\n\n<reasoning>\n[Your detailed analysis here. Minimum 3 sentences]\n</reasoning>\n\n<answer>\n[Your concise final answer here. Exactly 1 sentence]\n</answer>\n\nFAILURE EXAMPLE (DO NOT DO THIS):\nThe revenue recognition principle states...\n\nSUCCESS EXAMPLE:\n<reasoning>\nRevenue recognition principles dictate that income should be recorded when...\n</reasoning>\n<answer>\nRevenue is recognized when earned and realizable.</answer>\n', 'role': 'system'}, {'content': 'Explain the matching principle and its importance in financial reporting.', 'role': 'user'}], 'answer': "<reasoning>The matching principle in accounting requires recording expenses in the same period as the revenues they help generate. For example, depreciation for a machine should be recorded in the accounting period it was used to produce goods. This principle ensures that financial statements accurately reflect the company's performance and prevents

In [15]:
import os
from trl import GRPOTrainer

trainer = GRPOTrainer(
    model = model,          #lora model
    processing_class = tokenizer, 
    reward_funcs = [             
        strict_format_reward_func,       #weight: 0.5
        #domain-specific rewards
        accounting_terminology_reward,   #0.5 
        accounting_principle_reward,     
        semantic_correctness_reward,  
    ],
    args = training_args,    #config
    train_dataset = dataset,       
)
trainer.model.config.use_cache = False


In [16]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 90 | Num Epochs = 5 | Total steps = 100
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 2 x 1) = 24
 "-____-"     Trainable parameters = 167,772,160/8,000,000,000 (2.10% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / strict_format_reward_func,rewards / accounting_terminology_reward,rewards / accounting_principle_reward,rewards / semantic_correctness_reward
1,0.0,1.594266,0.106997,96.0,0.0,0.0,0.141667,0.508333,0.944266
2,0.0,1.46713,0.231284,96.0,0.0,0.020833,0.158333,0.408333,0.87963
3,0.0,1.676187,0.118806,96.0,0.000113,0.0,0.266667,0.5,0.909521
4,0.0001,1.497996,0.182587,96.0,0.002443,0.0,0.245833,0.3375,0.914663
5,0.0003,1.430808,0.204487,96.0,0.006771,0.0625,0.158333,0.358333,0.851641
6,0.001,1.168616,0.197825,96.0,0.023991,0.125,0.091667,0.375,0.57695
7,0.0022,1.955703,0.171064,96.0,0.05556,0.041667,0.179167,0.516667,1.218203
8,0.0033,1.425316,0.274012,95.125,0.0831,0.229167,0.158333,0.366667,0.671149
9,0.0079,2.033202,0.145196,83.708336,0.198155,0.979167,0.1,0.433333,0.520702
10,0.0159,1.967533,0.224711,61.25,0.398738,0.979167,0.0625,0.333333,0.592533


TrainOutput(global_step=100, training_loss=0.019174884571045256, metrics={'train_runtime': 10166.0067, 'train_samples_per_second': 0.236, 'train_steps_per_second': 0.01, 'total_flos': 0.0, 'train_loss': 0.019174884571045256})

In [23]:
trainer.save_model("grpo_finetuned_lora")  


In [None]:
trainer.save_model("outputs/fine_tuned_model")

In [30]:
!zip -r /kaggle/working/final_model.zip /kaggle/working/final_model


  adding: kaggle/working/final_model/ (stored 0%)
  adding: kaggle/working/final_model/training_args.json (deflated 63%)
  adding: kaggle/working/final_model/README.md (deflated 66%)
  adding: kaggle/working/final_model/adapter_config.json (deflated 56%)
  adding: kaggle/working/final_model/tokenizer_config.json (deflated 94%)
  adding: kaggle/working/final_model/special_tokens_map.json (deflated 71%)
  adding: kaggle/working/final_model/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/final_model/tokenizer.json (deflated 85%)
