In [None]:
!pip install unsloth
# Then replace the PyPI release with the latest nightly build of Unsloth from GitHub.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

from unsloth import FastLanguageModel
import torch
from datasets import load_dataset

# Settings shared by model loading below and by the trainer later on.
max_seq_length = 10000  # also handed to SFTTrainer further down
dtype = None  # no explicit dtype requested; presumably Unsloth picks one - confirm
load_in_4bit = True  # ask from_pretrained for 4-bit loading

# Load the Phi-3.5-mini instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3.5-mini-instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters.
# NOTE(review): Unsloth's example notebooks describe lora_dropout = 0 and
# bias = "none" as the optimized settings; confirm that 0.1 / "all" (and
# lora_alpha = 8 with r = 16 under rsLoRA) are intentional choices.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    modules_to_save=["lm_head"],
    lora_alpha=8,
    lora_dropout=0.1,
    bias="all",
    use_gradient_checkpointing="unsloth",
    use_rslora=True,
    loftq_config=None,
)

# Dump the wrapped module tree so the result can be inspected.
print(model)


In [None]:

# Attach the "phi-3" chat template to the tokenizer through Unsloth's helper.
from unsloth.chat_templates import get_chat_template

# Field and role names handed to get_chat_template; every entry maps a name
# onto itself. Presumably this tells Unsloth which names the dataset uses for
# message fields and speakers - verify against the JSONL files.
chat_field_mapping = {
    "role": "role",
    "content": "content",
    "user": "user",
    "assistant": "assistant",
}

tokenizer = get_chat_template(
    tokenizer,
    chat_template="phi-3",
    mapping=chat_field_mapping,
)

# Data preparation function
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one prompt string.

    Args:
        examples: Batch mapping with a "messages" entry that holds one
            message list per example.

    Returns:
        A dict with a single "text" key. Its list contains the
        chat-template rendering of each conversation, in input order.
        The text is not tokenized and no generation prompt is appended.
    """
    rendered = []
    for conversation in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Paths of the JSONL files used for training and for evaluation.
train_file = "data/input/training_data.jsonl"
eval_file = "data/input/validation_data.jsonl"

# Read each file with the "json" loader; both are requested through the
# "train" split. The row count is reported right after each load.
train_dataset = load_dataset("json", data_files=train_file, split="train")
print(f"Training dataset size: {train_dataset.num_rows}")
eval_dataset = load_dataset("json", data_files=eval_file, split="train")
print(f"Eval dataset size: {eval_dataset.num_rows}")

# Optional: uncomment to work on a small subset while debugging.
# train_dataset = train_dataset.select(range(200))
# eval_dataset = eval_dataset.select(range(20))

# Add the rendered "text" column to both datasets, one batch at a time.
train_dataset = train_dataset.map(function=formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(function=formatting_prompts_func, batched=True)

# Training configuration
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer.
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy` and deprecated
# the old spelling. Use whichever keyword the installed TrainingArguments
# accepts, so the cell runs on both older and current releases.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Query bf16 support once; exactly one of fp16 / bf16 is enabled below.
_use_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        # 2 sequences x 8 accumulation steps = 16 sequences per optimizer
        # step on each device.
        per_device_train_batch_size = 2,
        per_device_eval_batch_size = 2,
        gradient_accumulation_steps = 8,
        warmup_steps = 50,
        max_steps = 250,
        learning_rate = 1e-5,
        fp16 = not _use_bf16,
        bf16 = _use_bf16,
        logging_steps = 10,
        # Evaluate on eval_dataset every 25 steps.
        eval_steps = 25,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "linear",
        max_grad_norm = .3,
        output_dir = "model",
        **{_eval_strategy_key: "steps"},
    ),
)

# Training
trainer_stats = trainer.train()

# Save the model
# Saving happens right after training and before the long sample generation
# below: if that generation fails (for example by exhausting GPU memory), the
# training result is already on disk instead of being lost with the session.
model.save_pretrained("testy_model")
tokenizer.save_pretrained("testy_model")

# Enable inference mode
FastLanguageModel.for_inference(model)

# Example inference
# A single user turn: instructions followed by the raw text of one document.
test_messages = [
    {"role": "user", "content": """

Read the following document.
After reading the document, describe what it tells us.  
Focus on who did what, where, and when. 
Include specific details about locations, dates, times, events, and names of individuals. 

H\nTermination recommended; Resigned\nTerminated prior to IA findings for failure.\nACTION\nOfficer Ornelas resigned on 12/27/2016\ncollateral duty position.\nAppeal in process.\nTermination recommended; Resigned\nOn 3/10/15, demotion to police officer;\nprobation.\nMosqueda resigned on 1/31/19 prior to\nsuspension from hostage negotiator SWAT\nNotice to Terminate served. Resigned\n160-hour suspension, PDSA served\nTerminated 3/18/15 for failure to pass\nremoval from training officer position;\nNotice to Terminate served 7/19/18;\nResigned 10/14/16 prior to the findings.\n9/15/14\n2/24/16:\n5/1/16.\n10/10/2016.\nto pass probation.\nTermination recommended; Officer\nprior to the completion of this case.\n\u20b2\nMar 20 2015\nMar 20 2015\nFinding Dt\nApr 04.2018\nFeb 5 2019\nSep 10 2014\nDec 02.2015\nOct 05 2016\nF\nFinding\nSustained\nSustained\nSustained\nSustained\nSustained Jun 15 2016\nSustained\nSustained |Oct 20 2016\nSustained.\nSustained |Jan 25 2017\nSustained Feb:02-2015\nSustained\nFalsification of Work-Related\nAllegation\nDishonesty\nDishonesty; False Statements\nDocuments; False Statements\nDishonesty; Falsification of Work-\nOn-Duty Sexual Relations\nDocuments\nRelated Documents\nDestruction of Evidence\nFalse Statements\nFalsification of Work-Related\"\nDishonesty\n|On-Duty Sexual Relations\nD\nOfficer Marc Aguilar [1145]\nOfficer Kevin Schindler [1260]\nOfficer Hillary Bjorneboe [1226]\nOfficer Travis Brewer [1132]\nOfficer Jeremy Salcido [1273]\nOfficer Doug Mansker [843]\nDetective Damacio Diaz [854]\nOfficer Manuel Ornelas [989]\nDetective Justin Lewis [1015]\nOfficer Enrique Mosqueda (1242) |Sexual Solicitation\nSr. 
Officer Kyle Ursery [969):\nC\nOct 06 2017\nOct 16 2014\nOct 16 2014\nFeb 25 2015\nJun 06 2016\nAug 02 2016\nJan 09 2015\nOct 13 2018\nMay 31 2016\nOct 05 2016\nInc Received Dt Involved Officer\nJun 24 2014\nB\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nInternal\nType\nInternal\nA\nIA2015-006
 """}
]

# Render the conversation with the chat template, ending with the generation
# prompt, and move the resulting token ids to the GPU.
inputs = tokenizer.apply_chat_template(
    test_messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(
    input_ids = inputs, 
    max_new_tokens = 4096,
    use_cache = True
)

# Print the decoded text of the first (and only) returned sequence.
print(tokenizer.batch_decode(outputs)[0])

In [None]:
# from unsloth import FastLanguageModel
# import torch

# # Initialize device
# device = "cuda" if torch.cuda.is_available() else "cpu"

# # Load base model
# base_model, base_tokenizer = FastLanguageModel.from_pretrained(
#     model_name="unsloth/Phi-3.5-mini-instruct",
#     max_seq_length=10000,
#     load_in_4bit=True
# )

# # Enable inference mode for base model
# FastLanguageModel.for_inference(base_model)

# # Configure base tokenizer
# from unsloth.chat_templates import get_chat_template
# base_tokenizer = get_chat_template(
#     base_tokenizer,
#     chat_template="phi-3",
#     mapping={
#         "role": "role",
#         "content": "content",
#         "user": "user",
#         "assistant": "assistant"
#     }
# )

# # Load fine-tuned model - simplified based on documentation
# ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
#     model_name="2k_10k_model",  # Path to your saved model
#     max_seq_length=10000,
#     load_in_4bit=True
# )

# # Enable inference mode for fine-tuned model
# FastLanguageModel.for_inference(ft_model)

# # Configure fine-tuned tokenizer
# ft_tokenizer = get_chat_template(
#     ft_tokenizer,
#     chat_template="phi-3",
#     mapping={
#         "role": "role",
#         "content": "content",
#         "user": "user",
#         "assistant": "assistant"
#     }
# )

# def generate_summary(model, tokenizer, text, max_length=4096):
#     """Generate summary using specified model and tokenizer."""
#     messages = [{"role": "user", "content": f"""
# Summarize each event.

# Below is the document you will review:  {text}"""}]
    
#     inputs = tokenizer.apply_chat_template(
#         messages,
#         tokenize=True,
#         add_generation_prompt=True,
#         return_tensors="pt"
#     ).to(device)
    
#     outputs = model.generate(
#         input_ids=inputs,
#         max_new_tokens=max_length,
#         use_cache=True,
#         temperature=0.7,
#         top_p=0.95,
#         top_k=10,
#         do_sample=True,        
#     )
    
#     return tokenizer.batch_decode(outputs)[0]

# # Example text to summarize

# # Example inference
# input = """

# | IA No | Type | Inc Received Dt | Involved Officer | Allegation | Finding | Finding Dt | ACTION |
# |-------|------|----------------|------------------|------------|---------|------------|---------|
# | IA2014-014 | Internal | Jun 24 2014 | Officer Marc Aguilar [1145] | Dishonesty; False Statements | Sustained | Sep 10 2014 | Termination recommended; Resigned 9/15/14 |
# | IA2014-022 | Internal | Oct 16 2014 | Officer Hillary Bjorneboe [1226] | Falsification of Work-Related Documents | Sustained | Mar 20 2015 | Terminated 3/18/15 for failure to pass probation. |
# | IA2014-022 | Internal | Oct 16 2014 | Officer Travis Brewer [1132] | Dishonesty; Falsification of Work-Related Documents | Sustained | Mar 20 2015 | Termination recommended; Resigned 5/1/16. |
# | IA2015-002 | Internal | Jan 09 2015 | Sr. Officer Kyle Ursery [969] | On-Duty Sexual Relations | Sustained | Feb 02 2015 | On 3/10/15, demotion to police officer; removal from training officer position; suspension from hostage negotiator SWAT collateral duty position. |
# | IA2015-006 | Internal | Feb 25 2015 | Detective Damacio Diaz [854] | Falsification of Work-Related Documents; False Statements | Sustained | Dec 02 2015 | Notice to Terminate served. Resigned 2/24/16. |
# | IA2016-008 | Internal | May 31 2016 | Officer Doug Mansker [843] | Destruction of Evidence | Sustained | Oct 05 2016 | 160-hour suspension, PDSA served 10/10/2016. |
# | IA2016-011 | Internal | Jun 06 2016 | Officer Jeremy Salcido [1273] | Dishonesty | Sustained | Jun 15 2016 | Terminated prior to IA findings for failure to pass probation. |
# | IA2016-016 | Internal | Aug 02 2016 | Detective Justin Lewis [1015] | False Statements | Sustained | Oct 20 2016 | Resigned 10/14/16 prior to the findings. |
# | IA2016-022 | Internal | Oct 05 2016 | Officer Manuel Ornelas [989] | On-Duty Sexual Relations | Sustained | Jan 25 2017 | Officer Ornelas resigned on 12/27/2016 prior to the completion of this case. |
# | IA2017-011 | Internal | Oct 06 2017 | Officer Kevin Schindler [1260] | Dishonesty | Sustained | Apr 04 2018 | Notice to Terminate served 7/19/18; Appeal in process. |
# | IA2018-017 | Internal | Oct 13 2018 | Officer Enrique Mosqueda [1242] | Sexual Solicitation | Sustained | Feb 5 2019 | Termination recommended; Officer Mosqueda resigned on 1/31/19 prior to |

# """


# # Generate summaries from both models
# print("Base Model Summary:")
# base_summary = generate_summary(base_model, base_tokenizer, input)
# print(base_summary)
# print("\n" + "="*50 + "\n")
# print("Fine-tuned Model Summary:")
# ft_summary = generate_summary(ft_model, ft_tokenizer, input)
# print(ft_summary)