In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/current-dataset/Karengata SDA Bot Data/Testimonies for the Church Vol 7.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Early Writings.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Testimonies for the Church Vol 6.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/The Spirit of Prophecy Vol 4.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Spiritual Gifts vol 1.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Seventh-day Adventists Believe. . . Marriage and the Family_ 27-22.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/The Voice in Speech and Song.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Christian Leadership.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/SDA Bible Commentary Vol 5.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Testimonies for the Church Vol 1.pdf
/kaggle/input/current-dataset/Karengata SDA Bot Data/Seventh-day Adventists Believe... Creation_ 27-06.pdf
/kaggle/input/

**Setting up**

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("DeepSeek")

In [3]:
%%capture
!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret('DeepSeek')
login(hf_token)

**Loading the model and tokenizer**

In [6]:
from unsloth import FastLanguageModel

max_seq_length = 2048 
dtype = None 
load_in_4bit = True


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token, 
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.9: Fast Llama patching. Transformers: 4.53.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


**Model inference before fine-tuning**

In [7]:
prompt_style = """"Below is an instruction that describes a task, paired with an input that provides further context.  
Write a response that appropriately completes the request.  
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.  

### Instruction:  

### Question:  
{}  

### Response:  
<think>{}"""

In [8]:
question = "Our church is experiencing increased tension and misalignment under the weight of ministry demands, growth, or conflicts. What are the root causes, and how can we prayerfully diagnose and address these issues to restore harmony and mission focus?"

FastLanguageModel.for_inference(model)  
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")  

outputs = model.generate(  
    input_ids=inputs.input_ids,  
    attention_mask=inputs.attention_mask,  
    max_new_tokens=1200,  
    use_cache=True,  
)  
response = tokenizer.batch_decode(outputs)  
print(response[0].split("### Response:")[1])

  
<think>
Okay, so I'm trying to help our church figure out why we're having so much tension and misalignment. The user mentioned that the issues could be due to ministry demands, growth, or conflicts. I need to think through the possible root causes and then come up with a prayerful approach to diagnose and address these problems.

First, I should consider what could be causing the tension. Maybe the church is growing quickly, which can bring both excitement and challenges. Growth often means more people, more programs, and more demands on leadership. If the church isn't keeping up with the growth, that could lead to misalignment. People might feel overwhelmed or like they're not being heard, leading to conflicts.

Another possible cause is ministry demands. The church might be taking on too many projects or programs without enough planning or resources. This could lead to burnout among volunteers and staff, causing frustration and misalignment with the church's mission.

Conflicts t

**Loading and processing the dataset**

In [9]:
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.  
Write a response that appropriately completes the request.  
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.  

### Instruction:  
You are a wise and spiritually grounded advisor with deep knowledge of Seventh-day Adventist beliefs, biblical principles, and church life. Your responses reflect the teachings of Scripture, the writings of Ellen G. White, and the mission of the Adventist Church—emphasizing faith, discipleship, community, and practical Christian living.
Please provide thoughtful, Scripture-based answers to the following question, offering compassionate guidance that strengthens faith, fosters unity, and aligns with Adventist values.  

### Question:  
{}  

### Response:  
<think>  
{}  
</think>  
{}"""

In [12]:
!pip install --quiet pdfplumber

In [13]:
from datasets import load_dataset

# Load dataset from uploaded zip (Word/pdf/etc) — adjust path if needed
dataset = load_dataset("/kaggle/input/current-dataset", split="train[:500]", trust_remote_code=True)

# Check column names
print(dataset.column_names)

Resolving data files:   0%|          | 0/164 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/164 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

['pdf']


In [23]:
import io
import pdfplumber

def extract_pdf_text(batch):
    texts = []
    for file_obj in batch["pdf"]:
        try:
            # If it's a PDF object with a path
            with open(file_obj.path, "rb") as f:
                with pdfplumber.open(f) as pdf:
                    full_text = "\n".join(page.extract_text() or "" for page in pdf.pages)
                    texts.append(full_text.strip())
        except Exception as e:
            print("PDF read error:", e)
            texts.append("")
    return {"text": texts}

# Apply the fixed extraction function
dataset = dataset.map(extract_pdf_text, batched=True, batch_size=1)


Map:   0%|          | 0/164 [00:00<?, ? examples/s]

PDF read error: 'float' object is not iterable


In [29]:
# Print the first example's text
print(dataset[0]["text"][:10])  # Just a preview

BOOKS / CM


**setting up the model & Model Training**

In [32]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import logging

# Optional: track where we’re at
logging.basicConfig(level=logging.INFO)

# ✅ Try smaller sequence length for faster training test
max_seq_length = 512  # If it works, increase to 1024 or 2048

# ✅ Reduced steps for sanity test
# Try max_steps = 5 to see if it even progresses
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=5,  # <----- only train for 5 steps for debugging
    warmup_steps=2,
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    logging_steps=1,  # <--- show progress every step
    logging_dir="./logs",
    optim="adamw_8bit",
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    seed=3407,
    report_to="none",  # Avoid wandb interference
)

# ✅ Create the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

# ✅ Sanity check: Log dataset size
print(f"Loaded {len(dataset)} training samples.")
print("Starting training...")

# ✅ Train and time it
import time
start = time.time()
trainer.train()
end = time.time()

print(f"\n✅ Training completed in {(end - start):.2f} seconds.")


Unsloth: Tokenizing ["text"]:   0%|          | 0/164 [00:00<?, ? examples/s]

Loaded 164 training samples.
Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 164 | Num Epochs = 1 | Total steps = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.4816
2,2.5124
3,2.3625
4,2.1967
5,2.2792



✅ Training completed in 260.84 seconds.


**Model Inference after Fine-tuning**

In [33]:
question = "Our church is experiencing increased tension and misalignment under the weight of ministry demands, growth, or conflicts. What are the root causes, and how can we prayerfully diagnose and address these issues to restore harmony and mission focus?"


FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


  
<think>
Okay, so I'm trying to help someone who's dealing with tension and misalignment in their church. Let me break this down. The church is feeling the strain because of ministry demands, growth, or conflicts. I need to figure out what's causing these issues and how to address them prayerfully to bring back harmony and focus on the mission.

First, I should think about what could be causing the tension. Maybe it's growth bringing in new people with different ideas or values. That can create friction. Or maybe the existing members are stressed by increased responsibilities. Another possibility is conflicts among leadership or congregation members, which can really divide a church.

Once I identify the root causes, I need to figure out how to address them prayerfully. That means seeking God's guidance through prayer, maybe also involving the leadership team. Perhaps there are underlying issues that require spiritual discernment or professional help, like counseling or mediation.

R

**Saving the model locally**

In [34]:
new_model_local = "Karengata SDA Faithpod RAG"
model.save_pretrained(new_model_local) 
tokenizer.save_pretrained(new_model_local)

model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/deepseek-r1-distill-llama-8b...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [01:00<03:02, 60.88s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [01:45<01:42, 51.07s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [02:27<00:46, 46.92s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [02:46<00:00, 41.65s/it]


**Pushing the model to Hugging Face Hub**

In [35]:
new_model_online = "sammyjoseph/Karengata_SDA_Faithpod_RAG"
model.push_to_hub(new_model_online)
tokenizer.push_to_hub(new_model_online)

model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

README.md:   0%|          | 0.00/630 [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/sammyjoseph/Karengata_SDA_Faithpod_RAG


Uploading...:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth: Saving to sammyjoseph/Karengata_SDA_Faithpod_RAG will fail, but using a temp folder works! Switching to a temp folder then uploading!


Uploading...:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/deepseek-r1-distill-llama-8b...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [02:09<06:29, 129.99s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [04:18<04:18, 129.21s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [06:21<02:06, 126.20s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [07:13<00:00, 108.34s/it]
