In [None]:
# Cell 1: Install Dependencies
!pip install -q transformers datasets peft accelerate bitsandbytes
!pip install nltk
!pip install fsspec==2025.3.0  # Match gcsfs requirement

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Cell 2: Import Libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    get_cosine_schedule_with_warmup
)

from torch.optim import AdamW
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
# Cell 3: Hugging Face Login
# Shows the interactive Hugging Face login widget inside the notebook.
# NOTE(review): presumably required before the model download and the
# push_to_hub calls further down - confirm the account has access to the model.
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Cell 4: Load Model & Tokenizer
model_name = "meta-llama/Llama-3.2-3B"  # Replace with your model

# Tokenizer: the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Base model: float16 weights, device placement delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def split_text_into_complete_thoughts(text, max_chars=512, sentence_splitter=None):
    """Group whole sentences of ``text`` into chunks of at most ``max_chars`` characters.

    Sentences are never cut: a single sentence longer than ``max_chars`` is kept
    as its own (oversized) chunk. Consecutive sentences are joined by one space.

    Args:
        text: The raw text to split.
        max_chars: Soft upper bound on the length of each chunk, in characters.
        sentence_splitter: Optional callable ``str -> iterable of str`` used to
            split ``text`` into sentences. Defaults to NLTK's ``sent_tokenize``.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    chunks = []
    current_chunk = ""
    for sentence in sentence_splitter(text):
        if not sentence:
            continue
        if not current_chunk:
            # Start a new chunk unconditionally. Checking the size limit here
            # would flush an empty chunk whenever the first sentence is
            # max_chars long or longer.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chars:
            # +1 accounts for the joining space.
            current_chunk = f"{current_chunk} {sentence}".strip()
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    # Append any leftover text as the final chunk
    if current_chunk:
        chunks.append(current_chunk)
    return chunks



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Cell 5: Load Dataset
from datasets import Dataset
import os

# Get all .txt files in the content folder. Sorted because os.listdir() returns
# entries in arbitrary order, and the chunk order feeds the train/test split.
text_files = sorted(f for f in os.listdir("/content") if f.endswith(".txt"))
if not text_files:
    raise FileNotFoundError("No .txt files found in /content")

all_text_chunks = []

# Iterate through each text file
for file_name in text_files:
    file_path = os.path.join("/content", file_name)
    # Explicit encoding: the default depends on the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Split the text into chunks made of complete sentences
    text_chunks = split_text_into_complete_thoughts(text, max_chars=512)
    # Skip empty/whitespace-only chunks so they never become training examples.
    all_text_chunks.extend(chunk for chunk in text_chunks if chunk.strip())

# Create dataset; a fixed seed keeps the train/test split identical across runs.
dataset = Dataset.from_dict({"text": all_text_chunks})
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Cell 6: Tokenization
def tokenize_text(example):
    """Tokenize a batch of examples for causal language modeling.

    Sequences are truncated to 512 tokens but deliberately NOT padded here:
    the DataCollatorForLanguageModeling used by the Trainer pads each batch to
    its longest member, so padding every example to 512 tokens up front only
    adds wasted compute on pad positions.

    Args:
        example: A batch from ``Dataset.map(batched=True)`` with a ``"text"`` column.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) for the batch.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
        add_special_tokens=True
    )

# Tokenize both splits in batches and drop the raw "text" column afterwards.
tokenized_dataset = split_dataset.map(tokenize_text, remove_columns=["text"], batched=True)

Map:   0%|          | 0/10743 [00:00<?, ? examples/s]

Map:   0%|          | 0/1194 [00:00<?, ? examples/s]

In [None]:
# Cell 7: LoRA Configuration
# Rank-8 adapters on the attention query/value projections, no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapter configuration defined above.
lora_model = get_peft_model(model, lora_config)

In [None]:
# Cell 8: Data Collator
# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Cell 9: Training Arguments
import math

# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy`, and with no `eval_steps` the evaluation interval falls back
# to `logging_steps` - confirm both against the installed version.
training_args = TrainingArguments(
    output_dir="buffett_unsup_lora",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=100,
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="no",
    fp16=True,
    learning_rate=2e-4,
)

# Only parameters that require gradients are handed to the optimizer; the
# frozen base weights would never be updated anyway.
trainable_params = [p for p in lora_model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=training_args.learning_rate)

# Total number of optimizer updates over the whole run. The cosine schedule must
# span the real training length: with a total smaller than the warmup (it was
# hard-coded to 10 against 100 warmup steps) the decay phase collapses to a
# single step and the learning rate flips between its peak and zero afterwards.
# Assumes a single training process/device - TODO confirm for multi-GPU runs.
batches_per_epoch = math.ceil(
    len(tokenized_dataset["train"]) / training_args.per_device_train_batch_size
)
updates_per_epoch = max(
    math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps), 1
)
num_training_steps = math.ceil(training_args.num_train_epochs * updates_per_epoch)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps
)



In [None]:
# Cell 10: Initialize Trainer
# The custom optimizer/scheduler pair replaces the ones Trainer would create.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    optimizers=(optimizer, scheduler),
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Cell 11: Start Training

# Run the training loop; the returned TrainOutput is shown as the cell result.
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maalpuertoiii[0m ([33maalpuertoiii-rensselaer-polytechnic-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.8631,2.868359


TrainOutput(global_step=670, training_loss=2.889163057127995, metrics={'train_runtime': 1404.3354, 'train_samples_per_second': 15.3, 'train_steps_per_second': 0.477, 'total_flos': 1.8572615555638886e+17, 'train_loss': 2.889163057127995, 'epoch': 1.9947877885331349})

In [None]:
# Cell 12: Save Model
import os

# Save the LoRA adapter and the tokenizer locally.
trainer.model.save_pretrained("llama_buffett_unsup_lora")
tokenizer.save_pretrained("llama_buffett_unsup_lora")
# Define the model repository id on Hugging Face Hub
# model_repo_id = "davidgpt1/buffet-finetuned-model"
model_repo_id = "Aabe03/llama_buffett_unsup_lora"
# Read the write token from the environment instead of relying on a variable
# that is never defined in this notebook. When the variable is unset the value
# is None and the Hub client falls back to the credentials stored at login.
HF_WRITE_TOKEN = os.environ.get("HF_WRITE_TOKEN")
# Push the model and tokenizer to the Hugging Face Hub
# (`token` is the current name of the deprecated `use_auth_token` argument).
trainer.model.push_to_hub(model_repo_id, token=HF_WRITE_TOKEN)
tokenizer.push_to_hub(model_repo_id, token=HF_WRITE_TOKEN)



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/9.19M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Aabe03/llama_buffett_unsup_lora/commit/37f1f7bc9960178d9793501a81fd8eceba365262', commit_message='Upload tokenizer', commit_description='', oid='37f1f7bc9960178d9793501a81fd8eceba365262', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Aabe03/llama_buffett_unsup_lora', endpoint='https://huggingface.co', repo_type='model', repo_id='Aabe03/llama_buffett_unsup_lora'), pr_revision=None, pr_num=None)

In [None]:
#Qualititative Testing - Text Generation
from transformers import pipeline
from peft import PeftModel

torch.cuda.empty_cache()         # Releases cached memory back to the GPU pool
torch.cuda.ipc_collect()         # Collects inter-process communication memory


def _load_base_model():
    """Load a fresh float16 copy of the base checkpoint and move it to the GPU."""
    fresh_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-3B",
        torch_dtype=torch.float16
    )
    return fresh_model.to("cuda")


# Untouched base model, kept as the reference for the comparisons below.
base_model = _load_base_model()

# Load the LoRA adapter onto a SEPARATE copy of the base weights.
# PeftModel.from_pretrained injects the adapter into the model it is given and
# merge_and_unload() folds the adapter into those same weights, so reusing
# `base_model` here would leave `base_model` and `model` pointing at identical
# merged weights and make any base-vs-tuned comparison meaningless.
# This costs one extra copy of the model in GPU memory.
model = PeftModel.from_pretrained(_load_base_model(), "llama_buffett_unsup_lora")
model = model.merge_and_unload()  # Merge LoRA weights


# Create text generation pipeline
# No 'device' argument: the model is already on the GPU.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
prompts = [
    "Write in 3 sentances. Warren Buffett's investment strategy emphasizes",
    "Write in 3 sentances. The key to value investing is",
    "Write in 3 sentances. Warren Buffett thinks of crypto as"
]

from transformers import StoppingCriteria, StoppingCriteriaList

class ThreeSentenceStoppingCriteria(StoppingCriteria):
    """Stop generation once the continuation contains ``max_sentences`` periods.

    Only the newly generated text is inspected. Counting periods in the whole
    sequence would also count the ones in the prompt, so the number of generated
    sentences would depend on how the prompt happens to be punctuated.

    A period count is only a rough proxy for sentence ends (decimals and
    abbreviations are counted too). The instance remembers the prompt length of
    the generation it is attached to, so use a fresh instance per prompt.
    """

    def __init__(self, max_sentences=3):
        super().__init__()
        self.max_sentences = max_sentences
        self._prompt_token_count = None  # set on the first call

    def __call__(self, input_ids, scores, **kwargs):
        if self._prompt_token_count is None:
            # Stopping criteria are evaluated after each appended token, so on
            # the first call everything except the last token is the prompt.
            self._prompt_token_count = input_ids.shape[-1] - 1
        # Decode only the tokens generated so far (first sequence in the batch)
        generated_ids = input_ids[0][self._prompt_token_count:]
        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
        # Count periods as a simple proxy for sentence ends
        return generated_text.count('.') >= self.max_sentences

# Use the stopping criteria in the text generation call
for prompt in prompts:
    print(f"Prompt: {prompt}")
    # Fresh criteria per prompt so the recorded prompt length is never stale.
    stopping_criteria = StoppingCriteriaList([ThreeSentenceStoppingCriteria()])
    output = generator(
        prompt,
        max_length=500,
        do_sample=True,  # temperature/top_p only take effect when sampling
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        num_return_sequences=1,
        stopping_criteria=stopping_criteria
    )
    print(f"Response: {output[0]['generated_text']}\n{'-'*50}")


Prompt: Write in 3 sentances. Warren Buffett's investment strategy emphasizes
Response: Write in 3 sentances. Warren Buffett's investment strategy emphasizes three key points: (1) Don't look for a get-rich-quick scheme. (2) Be wary of the financial wizard, especially one who sells you an idea that costs money to buy and that gives you no control over its execution. (3) Beware of the business with the good prospects. The worst businesses are sometimes run by competent people and can become excellent investments if they're bought cheap enough.
--------------------------------------------------
Prompt: Write in 3 sentances. The key to value investing is
Response: Write in 3 sentances. The key to value investing is that you buy stocks with high intrinsic values at prices low compared to those values. When a stock is priced well below intrinsic value, the buyer can make a substantial profit by holding the stock until its price increases toward its intrinsic value. Value investors seek out b

In [None]:
#Quantitative Testing - Perplexity (Language Modeling Quality)
import numpy as np
from tqdm import tqdm # Import tqdm

def calculate_perplexity(model, dataset):
    """Compute token-level perplexity of ``model`` on ``dataset["test"]["text"]``.

    Perplexity is ``exp(total negative log-likelihood / total predicted tokens)``.
    Each example's mean loss is therefore weighted by the number of tokens it
    predicts; a plain average of per-example losses would give short texts the
    same weight as long ones.

    Args:
        model: A causal language model that returns ``.loss`` when given ``labels``.
        dataset: A mapping with a ``"test"`` split that has a ``"text"`` column.

    Returns:
        The perplexity as a float, or NaN if no example had a token to predict.
    """
    model.eval()
    total_nll = 0.0
    total_predicted_tokens = 0

    for text in tqdm(dataset["test"]["text"]):
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        # With labels == input_ids the first token has no prediction target,
        # so a sequence of length L contributes L - 1 predicted tokens.
        predicted_tokens = inputs["input_ids"].shape[1] - 1
        if predicted_tokens < 1:
            # Nothing to predict: the loss would be NaN and poison the total.
            continue
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        total_nll += outputs.loss.item() * predicted_tokens
        total_predicted_tokens += predicted_tokens

    if total_predicted_tokens == 0:
        return np.float64(np.nan)
    return np.exp(total_nll / total_predicted_tokens)

# Compare with base model
# NOTE(review): this comparison is only meaningful if `base_model` and `model`
# hold different weights. If the adapter was loaded and merged into the very
# object bound to `base_model`, both calls evaluate the same merged weights and
# the two numbers will be identical - verify that the two are distinct objects.
base_perplexity = calculate_perplexity(base_model, split_dataset)
lora_perplexity = calculate_perplexity(model, split_dataset)

# Report both values with two decimals.
print(f"Base Model Perplexity: {base_perplexity:.2f}")
print(f"LoRA-Tuned Perplexity: {lora_perplexity:.2f}")

100%|██████████| 1191/1191 [00:44<00:00, 26.74it/s]
100%|██████████| 1191/1191 [00:44<00:00, 26.82it/s]

Base Model Perplexity: 21.25
LoRA-Tuned Perplexity: 21.25





In [None]:
# Print the first layer's q_proj weight tensor of `model`.
# NOTE(review): a single printed tensor cannot show whether LoRA changed
# anything. To verify, compare it against the same tensor taken from an
# untouched base checkpoint (e.g. with torch.equal).
print(model.state_dict()['model.layers.0.self_attn.q_proj.weight'])

tensor([[-0.0338,  0.0285,  0.0613,  ...,  0.0858, -0.0340, -0.0188],
        [-0.0116, -0.0241,  0.0433,  ...,  0.0543, -0.0004, -0.0339],
        [ 0.0207,  0.0169,  0.0324,  ...,  0.0516, -0.0416, -0.0121],
        ...,
        [ 0.0002, -0.0060,  0.0710,  ..., -0.0122,  0.0071,  0.0098],
        [ 0.0046, -0.0110,  0.0212,  ..., -0.0129, -0.0183,  0.0080],
        [-0.0062,  0.0232, -0.0076,  ...,  0.0089,  0.0035, -0.0103]],
       device='cuda:0', dtype=torch.float16)


In [None]:
# # Install requirements
# !pip install -q transformers datasets peft accelerate bitsandbytes

# import torch
# from transformers import (
#     AutoTokenizer,
#     AutoModelForCausalLM,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForLanguageModeling
# )
# from datasets import load_dataset
# from peft import LoraConfig, get_peft_model

# # Load tokenizer and base model
# model_name = "meta-llama/Llama-3.2-3B"  # Replace with your model
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     load_in_4bit=True,  # For 4-bit quantization
#     device_map="auto",
#     torch_dtype=torch.float16
# )

# # Load and prepare dataset
# dataset = load_dataset("text", data_files="the-warren-buffet-way.txt")["train"]
# split_dataset = dataset.train_test_split(test_size=0.1)

# # Tokenization function for unsupervised learning
# def tokenize_text(example):
#     return tokenizer(
#         example["text"],
#         truncation=True,
#         max_length=512,
#         padding="max_length",
#         add_special_tokens=True
#     )

# tokenized_dataset = split_dataset.map(
#     tokenize_text,
#     batched=True,
#     remove_columns=["text"]
# )

# # Configure LoRA
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, lora_config)

# # Data collator for language modeling
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False  # Causal language modeling
# )

# # Training arguments
# training_args = TrainingArguments(
#     output_dir="llm-unsupervised-lora",
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     evaluation_strategy="steps",
#     eval_steps=500,
#     logging_steps=100,
#     save_strategy="steps",
#     save_steps=1000,
#     learning_rate=3e-4,
#     fp16=True,
#     warmup_ratio=0.1,
#     report_to="none",
#     optim="paged_adamw_8bit"
# )

# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     data_collator=data_collator
# )

# # Start training
# trainer.train()

# # Save model
# model.save_pretrained("unsupervised-llm")
# tokenizer.save_pretrained("unsupervised-llm")