In [2]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import pandas as pd
from peft import LoraConfig, get_peft_model
import numpy as np

# 1. Load and preprocess the dataset
def load_dataset(file_path):
    """Load conversation records from a JSON file and flatten them to text.

    The JSON document is iterated as a sequence of records. Each record must
    provide ``item['conversation']`` (an iterable of turns carrying ``role``
    and ``content``) and ``item['risk_calculation']['risk_category']``.

    Every turn becomes one ``<s>Human: ...</s>`` line when its role is
    ``'human'`` and one ``<s>Assistant: ...</s>`` line for any other role.
    A trailing ``Risk Category: ...`` line is appended, and the result is
    stripped of surrounding whitespace.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The ``Dataset`` built from a DataFrame holding one ``{"text": ...}``
        entry per record.

    Raises:
        KeyError: If a record or turn lacks one of the keys listed above.
    """
    # JSON is UTF-8 on the wire; decoding explicitly avoids depending on the
    # platform's locale encoding (e.g. cp1252 on Windows), which would garble
    # or reject non-ASCII conversation text.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Convert conversations to training format
    formatted_data = []
    for item in data:
        lines = []
        for turn in item['conversation']:
            # Anything that is not the human speaker is treated as the assistant.
            speaker = "Human" if turn['role'] == 'human' else "Assistant"
            lines.append(f"<s>{speaker}: {turn['content']}</s>\n")

        # Add the risk category as trailing context (only the category is
        # appended; no other fields of the record are used).
        risk_category = item['risk_calculation']['risk_category']
        lines.append(f"Risk Category: {risk_category}\n")

        formatted_data.append({"text": "".join(lines).strip()})

    return Dataset.from_pandas(pd.DataFrame(formatted_data))

# 2. Initialize tokenizer and model
def initialize_model_and_tokenizer():
    """Load the Mistral instruct checkpoint together with its tokenizer.

    The tokenizer's pad token is set to its end-of-sequence token, and the
    model is requested in float16 with ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"

    tok = AutoTokenizer.from_pretrained(checkpoint)
    # Reuse EOS as the padding token so padded batches can be built.
    tok.pad_token = tok.eos_token

    load_options = {
        "torch_dtype": torch.float16,
        "device_map": "auto",
    }
    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

    return causal_lm, tok

# 3. Configure LoRA parameters
def configure_lora(model):
    """Wrap ``model`` with a LoRA configuration via ``get_peft_model``.

    Args:
        model: The base model to pass to ``get_peft_model``.

    Returns:
        The object returned by ``get_peft_model`` for the given model and
        the LoRA settings defined below.
    """
    adapter_settings = dict(
        r=16,  # rank
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        # Layers to fine-tune
        target_modules=["q_proj", "v_proj"],
    )
    return get_peft_model(model, LoraConfig(**adapter_settings))

# 4. Tokenize dataset
def tokenize_dataset(dataset, tokenizer, max_length=512):
    """Tokenize the ``text`` column of ``dataset`` in batches.

    Args:
        dataset: Object exposing ``map(fn, batched=..., remove_columns=...)``
            and ``column_names``; its batches must contain a ``"text"`` key.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, max_length=..., padding="max_length")``.
        max_length: Length passed to the tokenizer for both truncation and
            ``"max_length"`` padding. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        Whatever ``dataset.map`` returns, with the dataset's original columns
        requested for removal via ``remove_columns``.
    """
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
    )

# 5. Training configuration
def setup_training_args():
    """Build the ``TrainingArguments`` used for the fine-tuning run.

    Returns:
        A ``TrainingArguments`` instance constructed from the fixed
        hyperparameters listed below.
    """
    hyperparams = {
        "output_dir": "./mistral-finetuned",
        "num_train_epochs": 3,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 4,
        "learning_rate": 2e-4,
        "fp16": True,
        "logging_steps": 10,
        "save_strategy": "epoch",
        "warmup_steps": 100,
        "weight_decay": 0.01,
    }
    return TrainingArguments(**hyperparams)

# 6. Main training function
def main():
    """Run the pipeline end to end: load data, build the model, train, save.

    Reads ``investment_profiles.json``, wraps the loaded model with LoRA,
    tokenizes the dataset, trains with ``Trainer`` and writes the model and
    tokenizer to ``./final_model``.
    """
    final_dir = "./final_model"

    # Data first, then the model/tokenizer pair, then the LoRA wrapper.
    raw_dataset = load_dataset("investment_profiles.json")
    base_model, tokenizer = initialize_model_and_tokenizer()
    peft_model = configure_lora(base_model)

    train_split = tokenize_dataset(raw_dataset, tokenizer)
    args = setup_training_args()

    # mlm=False selects the non-masked language-modeling collator mode.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=peft_model,
        args=args,
        train_dataset=train_split,
        data_collator=collator,
    )
    trainer.train()

    # Persist the trained model and the tokenizer side by side.
    trainer.save_model(final_dir)
    tokenizer.save_pretrained(final_dir)

# Entry point: run the fine-tuning pipeline only when this module is executed
# directly (not when it is imported).
if __name__ == "__main__":
    main()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\ProgramData\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\ProgramData\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "c:\ProgramData\anaconda3\Lib\site-pack

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



RuntimeError: Failed to import transformers.models.auto.modeling_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
numpy.core.multiarray failed to import