# Kaggle setup

In [20]:
%%capture
# Environment setup for Kaggle (per the section heading); %%capture hides the pip output.
# Installs the helper that (presumably) provides the `pip-autoremove` command used next.
!pip install pip3-autoremove
# Remove any existing torch packages; -y answers the confirmation prompt automatically.
!pip-autoremove torch torchvision torchaudio -y
# Reinstall the torch stack plus xformers from the PyTorch CUDA 12.1 wheel index.
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
# Upgrade transformers last, bypassing pip's cache.
# NOTE(review): none of these installs are version-pinned, so reruns may resolve different versions.
!pip install --upgrade --no-cache-dir transformers

# Colab setup

In [None]:
%%capture
# Environment setup for Colab (per the section heading); %%capture hides the pip output.
# --no-deps installs only the named packages and skips their dependency resolution.
# xformers is the only package pinned to an exact version here.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
# These packages are installed with their dependencies.
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

# Load model

In [9]:
from unsloth import FastLanguageModel
import torch
import random
import numpy as np

# based on: https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(1B_and_3B)-Conversational.ipynb

# Reproducibility: feed the same seed to every RNG seeded in this notebook.
seed = 42069
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

# Loader settings; max_seq_length is reused later when the trainer is built.
max_seq_length = 2048
dtype = None          # forwarded unchanged to from_pretrained
load_in_4bit = True   # forwarded unchanged to from_pretrained

# Load the base model together with its tokenizer.
model_og, tokenizer_og = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.1.8: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [10]:
# Modules targeted by the adapter config: the output head plus the
# attention and MLP projection layers.
lora_target_modules = ["lm_head"]
lora_target_modules += ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_target_modules += ["gate_proj", "up_proj", "down_proj"]

# Wrap the base model for parameter-efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model_og,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=seed,  # same seed as the global RNGs
    use_rslora=False,
    loftq_config=None,
)

Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


# Process dataset

In [11]:
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
import pandas as pd

# Attach the "llama-3.1" chat template to the base tokenizer.
tokenizer = get_chat_template(tokenizer_og, chat_template="llama-3.1")

# Read the Titanic training CSV and convert it to a Hugging Face Dataset,
# without carrying the pandas index over as a column.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
dataset = Dataset.from_pandas(
    train_data,
    preserve_index=False,
)

In [24]:
# Pclass code -> ordinal label used in the prompt text.
class_map = {1: "1st", 2: "2nd", 3: "3rd"}

# Embarked code -> port name. "Unknown" maps to itself so that a missing
# value (rendered as "Unknown" by remove_none) still resolves to a label.
embarkation_map = dict(
    C="Cherbourg",
    Q="Queenstown",
    S="Southampton",
    Unknown="Unknown",
)

def remove_none(d):
    """Return ``d`` as a string, or ``"Unknown"`` when the value is missing.

    A value is treated as missing when it is ``None`` or a float NaN (the
    marker pandas uses for missing cells), so both forms produce the same
    placeholder instead of the literal text ``"nan"``.

    Args:
        d: Any value taken from a dataset row.

    Returns:
        ``"Unknown"`` for missing values, otherwise ``str(d)``.
    """
    import math  # local import: the notebook's import cells do not bring in math

    if d is None:
        return "Unknown"
    # Only floats can be NaN; checking the type first keeps math.isnan away
    # from strings and other non-numeric values.
    if isinstance(d, float) and math.isnan(d):
        return "Unknown"
    return str(d)

def data_formatter(data):
    """Turn one Titanic passenger row into a chat-formatted example.

    Args:
        data: Mapping for a single passenger with the keys ``Name``, ``Sex``,
            ``Age``, ``SibSp``, ``Parch``, ``Pclass``, ``Ticket``, ``Fare``,
            ``Cabin`` and ``Embarked``. ``Survived`` is optional.

    Returns:
        dict with the three chat messages (``system_prompt``, ``user_prompt``,
        ``assistant_prompt``) and ``text``, the conversation rendered through
        the tokenizer's chat template without a generation prompt.

    Raises:
        KeyError: If ``Pclass`` is not a key of ``class_map`` or the embarkation
            code is not a key of ``embarkation_map``.
    """
    # (label, value) pairs in the order they appear in the user message.
    # Fields that can be missing go through remove_none so they all read
    # "Unknown"; Fare previously bypassed it and rendered as "None".
    fields = [
        ("Name", data["Name"]),
        ("Sex", data["Sex"]),
        ("Age", remove_none(data["Age"])),
        ("Number of siblings and spouses on board", str(data["SibSp"])),
        ("Number of parents and children on board", str(data["Parch"])),
        ("Ticket class", class_map[data["Pclass"]]),
        ("Ticket number", data["Ticket"]),
        ("Passenger fare", remove_none(data["Fare"])),
        ("Cabin number", remove_none(data["Cabin"])),
        ("Port of embarkation", embarkation_map[remove_none(data["Embarked"])]),
    ]
    text = "".join(f"{label}: {value}\n" for label, value in fields)

    # Rows without a "Survived" key (e.g. the test set) fall back to "No";
    # for those rows the assistant message is only a placeholder.
    survived = "Yes" if "Survived" in data and data["Survived"] == 1 else "No"

    system_prompt = { "role": "system", "content": "You are going to predict if people on the Titanic survived or not. Use the information about the person given below to make the prediction. Answer with a \"Yes\" or a \"No\"." }
    user_prompt = { "role": "user", "content": text }
    assistant_prompt = { "role": "assistant", "content": survived }

    prompts = [system_prompt, user_prompt, assistant_prompt]

    texts = tokenizer.apply_chat_template(prompts, tokenize=False, add_generation_prompt=False)
    return { "system_prompt": system_prompt, "user_prompt": user_prompt, "assistant_prompt": assistant_prompt, "text": texts }

In [13]:
# Add the prompt columns to every row, then hold out 20% of the rows
# for validation (shuffled with the notebook-wide seed).
dataset = dataset.map(data_formatter)
dataset_split = dataset.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=seed,
)
dataset_train, dataset_val = dataset_split["train"], dataset_split["test"]

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

# Training

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Optimisation hyper-parameters; 8 per device x 4 accumulation steps gives
# the total batch size of 32 reported in the training banner.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    num_train_epochs=5,
    learning_rate=5e-5,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=seed,
    output_dir="outputs",
    report_to="none",
    group_by_length=True,
)

# Supervised fine-tuning over the pre-rendered "text" column of the train split.
trainer_og = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=seq2seq_collator,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/712 [00:00<?, ? examples/s]

In [21]:
from unsloth.chat_templates import train_on_responses_only
# Header strings that open the user turn and the assistant turn in the
# rendered chat text.
user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Wrap the trainer with unsloth's train_on_responses_only helper, telling it
# where the instruction part and the response part of each example begin.
trainer = train_on_responses_only(
    trainer_og,
    instruction_part=user_header,
    response_part=assistant_header,
)

Map:   0%|          | 0/712 [00:00<?, ? examples/s]

In [16]:
# Run the fine-tuning loop. As the cell's last expression, the returned
# TrainOutput (global step, training loss, metrics) is displayed below.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 712 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 110
 "-____-"     Number of trainable parameters = 418,316,288


Step,Training Loss
1,0.7583
2,0.6421
3,0.5743
4,0.6303
5,0.533
6,0.5751
7,0.349
8,0.3067
9,0.2283
10,0.2324


TrainOutput(global_step=110, training_loss=0.1843393361703916, metrics={'train_runtime': 725.6281, 'train_samples_per_second': 4.906, 'train_steps_per_second': 0.152, 'total_flos': 1.1710101065613312e+16, 'train_loss': 0.1843393361703916, 'epoch': 4.808988764044944})

# Validate

In [19]:
from unsloth.chat_templates import get_chat_template
from transformers import TextIteratorStreamer

# Put the model into inference mode via unsloth's helper before generating.
FastLanguageModel.for_inference(model)

# generate() is called synchronously below, so the streamer only acts as a
# buffer that is drained after each call returns.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

num_correct = 0

# Iterate over rows directly so each example is fetched once per iteration.
for i, example in enumerate(dataset_val):
    # Prompt without the assistant turn; the model has to produce it.
    messages = [
        example["system_prompt"],
        example["user_prompt"],
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")

    # Greedy decoding (do_sample=False) keeps predictions deterministic.
    model.generate(input_ids=inputs, streamer=streamer, max_new_tokens=128,
                   use_cache=True, do_sample=False)

    generated_text = "".join(streamer)

    # Keep everything before the end-of-turn marker. partition() returns the
    # whole text when the marker is missing, whereas slicing with find() == -1
    # silently dropped the last character of the answer.
    model_answer = generated_text.partition("<|eot_id|>")[0].strip()
    true_answer = example["assistant_prompt"]["content"]
    print("Iteration:", i, "Model:", model_answer, "True:", true_answer)
    if model_answer == true_answer:
        num_correct += 1

# Fraction of exact-match answers; an empty split yields 0.0 instead of a
# ZeroDivisionError.
val_accuracy = num_correct / len(dataset_val) if len(dataset_val) else 0.0
print("Validation accuracy:", val_accuracy)

Iteration: 0 Model: No True: No
Iteration: 1 Model: No True: No
Iteration: 2 Model: No True: No
Iteration: 3 Model: Yes True: Yes
Iteration: 4 Model: No True: No
Iteration: 5 Model: No True: Yes
Iteration: 6 Model: Yes True: Yes
Iteration: 7 Model: Yes True: Yes
Iteration: 8 Model: No True: No
Iteration: 9 Model: No True: Yes
Iteration: 10 Model: No True: No
Iteration: 11 Model: No True: No
Iteration: 12 Model: No True: No
Iteration: 13 Model: No True: No
Iteration: 14 Model: No True: Yes
Iteration: 15 Model: No True: No
Iteration: 16 Model: No True: No
Iteration: 17 Model: No True: No
Iteration: 18 Model: No True: No
Iteration: 19 Model: No True: No
Iteration: 20 Model: No True: No
Iteration: 21 Model: Yes True: Yes
Iteration: 22 Model: No True: No
Iteration: 23 Model: No True: No
Iteration: 24 Model: Yes True: Yes
Iteration: 25 Model: No True: No
Iteration: 26 Model: Yes True: No
Iteration: 27 Model: No True: No
Iteration: 28 Model: No True: Yes
Iteration: 29 Model: Yes True: Yes
Ite

# Submission

In [27]:
# Load the Kaggle test CSV and run it through the same prompt formatter
# that was applied to the training data.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
dataset_test = Dataset.from_pandas(test_data, preserve_index=False).map(data_formatter)

# Last expression of the cell: display the Dataset summary (columns, row count).
dataset_test

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Dataset({
    features: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'system_prompt', 'user_prompt', 'assistant_prompt', 'text'],
    num_rows: 418
})

In [30]:
# Predict survival for every test passenger and collect the rows of the
# submission table. Relies on `model`, `tokenizer` and `streamer` defined in
# earlier cells (the streamer is created in the validation cell).
passenger_ids = []
survived = []

# Iterate over rows directly so each example is fetched once per iteration.
for example in dataset_test:
    # Prompt without the assistant turn; the model has to produce it.
    messages = [
        example["system_prompt"],
        example["user_prompt"],
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")

    # Greedy decoding (do_sample=False) keeps predictions deterministic.
    model.generate(input_ids=inputs, streamer=streamer, max_new_tokens=128,
                   use_cache=True, do_sample=False)

    generated_text = "".join(streamer)

    # Keep everything before the end-of-turn marker. partition() returns the
    # whole text when the marker is missing; slicing with find() == -1 would
    # turn "Yes" into "Ye" and record the passenger as not surviving.
    model_answer = generated_text.partition("<|eot_id|>")[0].strip()
    print(example["PassengerId"], model_answer)
    passenger_ids.append(example["PassengerId"])
    # Anything other than an exact "Yes" is recorded as 0.
    survived.append(1 if model_answer == "Yes" else 0)

submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived})

892 No
893 No
894 No
895 No
896 No
897 No
898 Yes
899 No
900 Yes
901 No
902 No
903 No
904 Yes
905 No
906 Yes
907 Yes
908 No
909 No
910 Yes
911 Yes
912 No
913 Yes
914 Yes
915 No
916 Yes
917 No
918 Yes
919 No
920 No
921 No
922 No
923 No
924 No
925 Yes
926 Yes
927 No
928 Yes
929 Yes
930 No
931 No
932 No
933 No
934 No
935 Yes
936 Yes
937 No
938 No
939 No
940 Yes
941 No
942 Yes
943 No
944 Yes
945 Yes
946 No
947 No
948 No
949 No
950 No
951 Yes
952 No
953 No
954 No
955 Yes
956 Yes
957 Yes
958 Yes
959 No
960 Yes
961 Yes
962 Yes
963 No
964 Yes
965 Yes
966 Yes
967 Yes
968 No
969 Yes
970 No
971 Yes
972 No
973 No
974 No
975 No
976 No
977 No
978 Yes
979 Yes
980 Yes
981 Yes
982 No
983 No
984 Yes
985 No
986 No
987 No
988 Yes
989 No
990 Yes
991 No
992 Yes
993 No
994 No
995 No
996 Yes
997 No
998 No
999 No
1000 No
1001 No
1002 No
1003 Yes
1004 Yes
1005 Yes
1006 Yes
1007 No
1008 No
1009 Yes
1010 No
1011 Yes
1012 Yes
1013 No
1014 Yes
1015 No
1016 No
1017 Yes
1018 No
1019 Yes
1020 No
1021 No
1022 No
1023 N

In [31]:
# Write the predictions to submission.csv in the working directory;
# index=False keeps the DataFrame index out of the file, leaving only the
# PassengerId and Survived columns.
submission.to_csv("submission.csv", index=False)