# Creating Dataset

In [None]:
!pip install google-generativeai



In [None]:
import os
import time
import json
import pandas as pd
import google.generativeai as genai

# Configure Gemini API.
# The API key is read from the environment so that no credential is stored in
# the notebook or its history. A key that was ever committed in plain text
# must be treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

# Sampling settings passed to every generate_content call made through gemini_model.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
gemini_model = genai.GenerativeModel(model_name="gemini-2.0-flash", generation_config=generation_config)

# Input CSV and the JSON file this run appends its results to.
csv_path = "pmh_part_1.csv"
json_path = "mental_health_dataset17.json"

# Resume from an earlier run when its JSON output is present; otherwise start empty.
dataset = []
if os.path.exists(json_path):
    with open(json_path, "r") as f:
        dataset = json.load(f)

# Load the full CSV.
df = pd.read_csv(csv_path)

# Rows handled by this run (the 1601:1700 slice of the frame).
# For a quick test use `df.head(10)` instead; to process everything use `df`.
df_sample = df[1601:1700]

def generate_insight(text, issue):
    """Ask the Gemini model for Ryff-scale wellbeing advice about one record.

    Args:
        text: The text of the record, interpolated into the prompt.
        issue: The mental-health label of the record, interpolated into the prompt.

    Returns:
        The model's response text with surrounding whitespace stripped.
    """
    # The prompt is assembled from adjacent literals; the resulting string is
    # exactly the single-line prompt used before.
    prompt = (
        f"Analyze the following mental health issue: {issue}\n"
        f"Text: {text}\n"
        "Provide wellbeing insights based on the Ryff Scale of Psychological Wellbeing "
        "(Autonomy, Environmental Mastery, Personal Growth, Positive Relations, "
        "Purpose in Life, Self-Acceptance). "
        "The response should be of 1024 characters or less covering practical advice "
        "for all the 6 paramters"
    )

    return gemini_model.generate_content([prompt]).text.strip()

def create_json_dataset():
    """Generate a wellbeing insight for each row of ``df_sample`` and checkpoint the results.

    Works on the module-level ``df_sample``, ``dataset`` and ``json_path``.
    Rows whose (text, issue) pair is already present in ``dataset`` are
    skipped. Every newly generated record is appended to ``dataset`` and the
    whole list is rewritten to ``json_path`` right away, so an interrupted run
    can be resumed. Exceptions raised while generating or saving a record are
    printed and the loop continues with the next row.
    """
    # (text, issue) pairs that already exist. Stored records use the key
    # "mental_issue" (see the append below); the earlier check read
    # entry['mental_health_issue'], which raised KeyError as soon as a stored
    # text matched the current row. A set also avoids rescanning the whole
    # list for every row.
    seen = {(entry.get("text"), entry.get("mental_issue")) for entry in dataset}

    for index, row in df_sample.iterrows():
        text = row.get('text', '')
        issue = row.get('mental_health_issue', '')

        # Skip if entry already exists in JSON
        if (text, issue) in seen:
            print(f"Skipping record {index + 1}: Already present in JSON")
            continue

        try:
            # Generate wellbeing insight
            insight = generate_insight(text, issue)

            print("----------------------------------\n")
            print(f"Processed record {index + 1}:")
            print(f"Text: {text}")
            print(f"Mental Issue: {issue}")
            print(f"Wellbeing Insight: {insight}")
            print("----------------------------------\n")

            # Append to dataset
            dataset.append({
                "text": text,
                "mental_issue": issue,
                "wellbeing_insight": insight
            })
            seen.add((text, issue))

            # Save to JSON after every record so progress survives an interruption
            with open(json_path, "w") as f:
                json.dump(dataset, f, indent=4)

            print(f"✅ Record {index + 1} appended to JSON")

        except Exception as e:
            # Best effort: report the failure and move on to the next row.
            print(f"❌ Error on record {index + 1}: {e}")

        # Wait for 10 seconds to handle rate limits
        print(f"⏳ Waiting for 10 seconds before processing the next record...")
        time.sleep(10)

create_json_dataset()


# Combining Dataset

In [None]:
import os
import json
from glob import glob

# Paths
json_folder = "./"  # Folder containing JSON files
output_path = "combined_mental_health_dataset.json"

# Every usable record must carry these keys: the de-duplication below uses the
# first two, and the instruction-conversion step reads all three.
required_keys = ("text", "mental_issue", "wellbeing_insight")

# Collect all JSON file paths, leaving out the output file so that a re-run
# does not read its own previous result. glob returns paths in no particular
# order, so they are sorted to make the combined order reproducible.
output_abspath = os.path.abspath(output_path)
json_files = sorted(
    path
    for path in glob(os.path.join(json_folder, "*.json"))
    if os.path.abspath(path) != output_abspath
)

# Final dataset list
combined_dataset = []

# Read and combine JSON files
for file in json_files:
    with open(file, "r") as f:
        data = json.load(f)
    if isinstance(data, list):  # Ensure data is a list
        combined_dataset.extend(data)
    else:
        print(f"⚠️ Skipping {file} as it does not contain a valid JSON list.")

# Remove duplicate entries based on text and mental_issue
unique_dataset = []
seen_entries = set()
skipped_entries = 0

for entry in combined_dataset:
    # Lists from other JSON files in the folder (for example an
    # instruction_data.json written by a previous run) hold records without
    # these keys; keeping them would break the later record["text"] lookup.
    if not isinstance(entry, dict) or any(key not in entry for key in required_keys):
        skipped_entries += 1
        continue

    identifier = (entry["text"], entry["mental_issue"])
    if identifier not in seen_entries:
        seen_entries.add(identifier)
        unique_dataset.append(entry)

if skipped_entries:
    print(f"⚠️ Skipped {skipped_entries} entries without the keys {required_keys}.")

# Save combined JSON
with open(output_path, "w") as f:
    json.dump(unique_dataset, f, indent=4)

print(f"✅ Combined JSON file created: {output_path}")

# Converting the records into instruction / input / output format

In [None]:
import json
import re

def clean_text(text):
    """Strip unsupported characters from ``text`` and collapse its whitespace.

    Only ASCII letters, digits, whitespace and the punctuation ``. , ? !``
    survive; every other character (hyphens and apostrophes included) is
    removed without a replacement. Runs of whitespace then become a single
    space and the ends are trimmed.
    """
    allowed_only = re.sub(r"[^a-zA-Z0-9\s.,?!]", "", text)
    return re.sub(r"\s+", " ", allowed_only).strip()

# Read the combined, de-duplicated records.
with open('combined_mental_health_dataset.json', 'r') as f:
    records = json.load(f)

# Turn each record into an instruction / input / output triple. The text is
# cleaned; the model-written insight is kept as is.
instruction_data = []
for idx, record in enumerate(records, start=1):
    cleaned_text = clean_text(record["text"])
    instruction_data.append({
        "instruction": f"Provide wellbeing insight for the below text with {record['mental_issue']}.",
        "input": cleaned_text,
        "output": record["wellbeing_insight"]
    })
    print(f"Converted record #{idx}")

# Write the instruction-format records to disk.
with open('instruction_data.json', 'w') as f:
    json.dump(instruction_data, f, indent=4)


Converted record #1
Converted record #2
Converted record #3
Converted record #4
Converted record #5
Converted record #6
Converted record #7
Converted record #8
Converted record #9
Converted record #10
Converted record #11
Converted record #12
Converted record #13
Converted record #14
Converted record #15
Converted record #16
Converted record #17
Converted record #18
Converted record #19
Converted record #20
Converted record #21
Converted record #22
Converted record #23
Converted record #24
Converted record #25
Converted record #26
Converted record #27
Converted record #28
Converted record #29
Converted record #30
Converted record #31
Converted record #32
Converted record #33
Converted record #34
Converted record #35
Converted record #36
Converted record #37
Converted record #38
Converted record #39
Converted record #40
Converted record #41
Converted record #42
Converted record #43
Converted record #44
Converted record #45
Converted record #46
Converted record #47
Converted record #48
C

In [2]:
import json

# Load the instruction-format records written by the conversion step.
with open('instruction_data.json', 'r') as f:
    data = json.load(f)

# Record count, followed by one record (index 50) as a spot check.
print(len(data))
print("Example entry : ", data[50])


1843
Example entry :  {'instruction': 'Provide wellbeing insight for the below text with normal.', 'input': 'In a first since 1938, Des Moines, Iowa, kids will trickortreat on Halloween', 'output': "The text describes a return to normalcy, indicating potentially positive impacts on psychological wellbeing.\n\n**Ryff Scale Insights:**\n\n*   **Autonomy:** Resuming activities fosters independence. Encourage decision-making in Halloween plans.\n*   **Environmental Mastery:** Reclaiming traditions builds competence. Help kids navigate trick-or-treating logistics.\n*   **Personal Growth:** New experiences aid development. Encourage exploring different costumes/neighborhoods.\n*   **Positive Relations:** Social events strengthen bonds. Facilitate interaction with friends/neighbors.\n*   **Purpose in Life:** Participating in community rituals provides meaning. Discuss Halloween's cultural significance.\n*   **Self-Acceptance:** Normalcy can boost confidence. Celebrate enjoyment of the traditi

# Converting Instructions into Alpaca format

In [3]:
def format_input(entry):
    """Build the Alpaca-style prompt (without the response section) for one record.

    Args:
        entry: Mapping with the keys ``"instruction"`` and ``"input"``.

    Returns:
        The fixed preamble followed by an ``### Instruction:`` section and,
        only when ``entry["input"]`` is truthy, an ``### Input:`` section.
    """
    sections = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    ]

    if entry["input"]:
        sections.append(f"\n\n### Input:\n{entry['input']}")

    return "".join(sections)

In [4]:
# Show one complete training example: the prompt followed by its response section.
model_input = format_input(data[50])
desired_response = "\n\n### Response:\n{}".format(data[50]['output'])

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Provide wellbeing insight for the below text with normal.

### Input:
In a first since 1938, Des Moines, Iowa, kids will trickortreat on Halloween

### Response:
The text describes a return to normalcy, indicating potentially positive impacts on psychological wellbeing.

**Ryff Scale Insights:**

*   **Autonomy:** Resuming activities fosters independence. Encourage decision-making in Halloween plans.
*   **Environmental Mastery:** Reclaiming traditions builds competence. Help kids navigate trick-or-treating logistics.
*   **Personal Growth:** New experiences aid development. Encourage exploring different costumes/neighborhoods.
*   **Positive Relations:** Social events strengthen bonds. Facilitate interaction with friends/neighbors.
*   **Purpose in Life:** Participating in community rituals provides meaning. Discuss Halloween's cultural significance.
*   **Self-A

In [5]:
# Same check for a second record (index 999).
model_input = format_input(data[999])
desired_response = "\n\n### Response:\n{}".format(data[999]['output'])

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Provide wellbeing insight for the below text with normal.

### Input:
Harvey Weinstein sentenced to 23 years in prison for sex assault in case that sparked MeToo movement

### Response:
This news may evoke strong emotions. Here's how to maintain wellbeing:

*   **Autonomy:** Acknowledge your feelings without letting them dictate your actions. Focus on what you *can* control.
*   **Env. Mastery:** Channel frustration into constructive action, like supporting related causes.
*   **Personal Growth:** Reflect on your values and how this news impacts them. Use it as a catalyst for growth.
*   **Positive Relations:** Connect with others, share your feelings, and offer support.
*   **Purpose in Life:** Reaffirm your values & find meaning in fighting injustice.
*   **Self-Acceptance:** Validate your emotional response; accept it's okay to feel upset.


# Splitting the dataset into train, test, and validation sets

In [6]:
# Split sizes: 85% training, 10% test, and whatever remains (about 5%) for validation.
n_records = len(data)
train_portion = int(n_records * 0.85)
test_portion = int(n_records * 0.1)
val_portion = n_records - train_portion - test_portion

# The split is positional: train first, then test, then validation.
# NOTE(review): the records are not shuffled before slicing; if the source
# order groups records by label, the three subsets may be unbalanced. Confirm.
test_end = train_portion + test_portion
train_data = data[:train_portion]
test_data = data[train_portion:test_end]
val_data = data[test_end:]

In [7]:
# Report the size of each subset.
for label, subset in (
    ("Training", train_data),
    ("Validation", val_data),
    ("Test", test_data),
):
    print(f"{label} set length:", len(subset))

Training set length: 1566
Validation set length: 93
Test set length: 184


In [8]:
# First record of each subset, in the order train, test, validation.
for subset in (train_data, test_data, val_data):
    print(subset[0])

{'instruction': 'Provide wellbeing insight for the below text with normal.', 'input': 'Boeing, union reach sweetened contract offer in bid to end strike, vote scheduled for Monday', 'output': "The text suggests a return to normalcy at Boeing, potentially reducing employee stress. Here's wellbeing advice:\n\n*   **Autonomy:** Exercise choice in daily tasks.\n*   **Environmental Mastery:** Tackle a small, achievable home project.\n*   **Personal Growth:** Learn a new skill, even a simple one.\n*   **Positive Relations:** Connect with a coworker or friend.\n*   **Purpose in Life:** Reflect on your work's contribution.\n*   **Self-Acceptance:** Acknowledge your strengths in resolving the situation."}
{'instruction': 'Provide wellbeing insight for the below text with normal.', 'input': 'The US debt is now projected to be larger than the US economy', 'output': "The text evokes anxiety about economic stability. Here's wellbeing advice:\n\n*   **Autonomy:** Research diverse economic viewpoints

# Organising data into training batches

In [9]:
import torch
from torch.utils.data import Dataset


class InstructionDataset(Dataset):
    """Dataset of pre-tokenized instruction examples.

    Each record is rendered as ``format_input(entry)`` followed by a
    ``### Response:`` section holding ``entry['output']``, and the result is
    encoded once at construction time.

    Args:
        data: Sequence of records with at least the keys read by
            ``format_input`` plus ``"output"``.
        tokenizer: Object exposing ``encode(text)``.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        # Encode every example up front so __getitem__ is a plain lookup.
        self.encoded_texts = [
            tokenizer.encode(
                format_input(entry) + f"\n\n### Response:\n{entry['output']}"
            )
            for entry in data
        ]

    def __getitem__(self, index):
        """Return the encoded example stored at ``index``."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of records the dataset was built from."""
        return len(self.data)

In [10]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m46.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [11]:
import tiktoken
# GPT-2 encoding from tiktoken; used to tokenize every instruction example.
tokenizer = tiktoken.get_encoding("gpt2")

# Print the id(s) the tokenizer assigns to the end-of-text special token.
# allowed_special is passed so the marker is encoded as a special token.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [12]:
def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad every sequence in ``batch`` to a common length and stack them.

    Args:
        batch: Iterable of token-id lists.
        pad_token_id: Id used both as the appended end token and as padding.
        device: Device the stacked tensor is moved to.

    Returns:
        A tensor with one row per sequence, whose width is the length of the
        longest sequence in the batch.
    """
    # One more than the longest sequence: room for the appended end token.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst = []
    for item in batch:
        # Copy, then add the end token plus padding in a single step so that
        # the row has exactly batch_max_length entries.
        padded = item.copy()
        padded.extend([pad_token_id] * (batch_max_length - len(item)))

        # Drop the final entry again; the extra slot only matters once
        # shifted targets are derived from the same padded row.
        inputs_lst.append(torch.tensor(padded[:-1]))

    return torch.stack(inputs_lst).to(device)

In [13]:
# Three toy sequences of different lengths to exercise the padding logic.
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


# Creating target token ids for training

In [14]:
def custom_collate_draft_2(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad a batch and return inputs together with next-token targets.

    Args:
        batch: Iterable of token-id lists.
        pad_token_id: Id used both as the appended end token and as padding.
        device: Device both tensors are moved to.

    Returns:
        ``(inputs, targets)``: two tensors of identical shape, where each
        targets row is the matching inputs row shifted left by one position.
    """
    # One more than the longest sequence: room for the appended end token.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst, targets_lst = [], []
    for item in batch:
        # End token plus padding, giving exactly batch_max_length entries.
        padded = item.copy()
        padded.extend([pad_token_id] * (batch_max_length - len(item)))

        inputs_lst.append(torch.tensor(padded[:-1]))   # everything but the last entry
        targets_lst.append(torch.tensor(padded[1:]))   # everything but the first entry

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

In [15]:
# Same toy batch, now producing inputs and shifted targets.
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print(targets)


tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [16]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-id lists into padded inputs and loss-masked targets.

    Args:
        batch: Iterable of token-id lists.
        pad_token_id: Id used both as the appended end token and as padding.
        ignore_index: Value written over every target position equal to
            ``pad_token_id`` except the first one in each row.
        allowed_max_length: When not None, both tensors are cut to at most
            this many columns.
        device: Device both tensors are moved to.

    Returns:
        ``(inputs, targets)``: two tensors of identical shape.
    """
    # One more than the longest sequence: room for the appended end token.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst, targets_lst = [], []
    for item in batch:
        # End token plus padding, giving exactly batch_max_length entries.
        padded = item.copy()
        padded.extend([pad_token_id] * (batch_max_length - len(item)))

        inputs = torch.tensor(padded[:-1])   # everything but the last entry
        targets = torch.tensor(padded[1:])   # everything but the first entry

        # Keep the first pad-valued target (the end token the model should
        # learn to emit) and blank out all later ones.
        pad_positions = torch.nonzero(targets == pad_token_id, as_tuple=True)[0]
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        # Optional cap on the sequence length.
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [17]:
# Same toy batch through the final collate function: padding positions after
# the first end token show up as the ignore value in the targets.
inputs_1, inputs_2, inputs_3 = [0, 1, 2, 3, 4], [5, 6], [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [18]:
# Cross-entropy over two examples (one row of logits per example).
logits_1 = torch.tensor([
    [-1.0, 1.0],   # 1st training example
    [-0.5, 1.5],   # 2nd training example
])
targets_1 = torch.tensor([0, 1])

loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [19]:
# Adding a third example changes the averaged loss.
logits_2 = torch.tensor([
    [-1.0, 1.0],
    [-0.5, 1.5],
    [-0.5, 1.5],   # New 3rd training example
])
targets_2 = torch.tensor([0, 1, 1])

loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
print(loss_2)

tensor(0.7936)


In [20]:
# Same three rows of logits, but the third target is -100. The printed
# comparison shows whether that row is left out of the loss, i.e. whether the
# result matches the two-example loss computed earlier.
logits_2 = torch.tensor([
    [-1.0, 1.0],
    [-0.5, 1.5],
    [-0.5, 1.5],   # New 3rd training example
])
targets_3 = torch.tensor([0, 1, -100])

loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3)
print(loss_3)
print("loss_1 == loss_3:", loss_1 == loss_3)

tensor(1.1269)
loss_1 == loss_3: tensor(True)


# Creating dataloaders for instruction dataset

In [21]:
# Use a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Note:
# On Apple Silicon, an extra branch that selects torch.device("mps") when
# torch.backends.mps.is_available() is true (checked after CUDA, before the
# CPU fallback) runs much faster than the CPU (as measured on an M3 MacBook Air).
# However, the resulting loss values may be slightly different.

print("Device:", device)

Device: cuda


In [22]:
from functools import partial
# Bind the target device and a 1024-token length cap into the collate function
# so that DataLoader can call it with the batch as its only argument.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [23]:
from torch.utils.data import DataLoader


num_workers = 0
batch_size = 8

torch.manual_seed(123)

# Settings shared by all three loaders; only shuffling and the handling of the
# last (possibly incomplete) batch differ between them.
_loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

# Training: shuffled, incomplete final batch dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **_loader_kwargs)

# Validation: fixed order, every example kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(val_dataset, shuffle=False, drop_last=False, **_loader_kwargs)

# Test: fixed order, every example kept.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False, **_loader_kwargs)

In [24]:
# Walk through the training loader once and print the shape of every batch.
# The width varies per batch because padding goes only up to the longest
# sequence in that batch (capped by the collate function's length limit).
# The loop variables stay bound to the last batch after the loop ends.
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 584]) torch.Size([8, 584])
torch.Size([8, 560]) torch.Size([8, 560])
torch.Size([8, 539]) torch.Size([8, 539])
torch.Size([8, 1024]) torch.Size([8, 1024])
torch.Size([8, 820]) torch.Size([8, 820])
torch.Size([8, 424]) torch.Size([8, 424])
torch.Size([8, 713]) torch.Size([8, 713])
torch.Size([8, 293]) torch.Size([8, 293])
torch.Size([8, 553]) torch.Size([8, 553])
torch.Size([8, 448]) torch.Size([8, 448])
torch.Size([8, 530]) torch.Size([8, 530])
torch.Size([8, 530]) torch.Size([8, 530])
torch.Size([8, 875]) torch.Size([8, 875])
torch.Size([8, 309]) torch.Size([8, 309])
torch.Size([8, 448]) torch.Size([8, 448])
torch.Size([8, 1024]) torch.Size([8, 1024])
torch.Size([8, 447]) torch.Size([8, 447])
torch.Size([8, 764]) torch.Size([8, 764])
torch.Size([8, 352]) torch.Size([8, 352])
torch.Size([8, 605]) torch.Size([8, 605])
torch.Size([8, 473]) torch.Size([8, 473])
torch.Size([8, 417]) torch.Size([8, 417])
torch.Size([8, 803]) torch.Size([8, 803])
torch.Size([8, 6

# Loading a pretrained SLM

In [42]:

# Import Libraries
import tiktoken
import torch
import torch.nn as nn
import os
from torch.utils.data import Dataset, DataLoader

# Hyperparameters of the model built in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=1024,   # Context length
    emb_dim=768,           # Embedding dimension
    n_heads=12,            # Number of attention heads
    n_layers=12,           # Number of layers
    drop_rate=0.1,         # Dropout rate
    qkv_bias=True,         # Query-Key-Value bias
)

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the tensors to normalize.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Population variance (no Bessel correction).
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return self.scale * normalized + self.shift

class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),   # expansion
            GELU(),                           # activation
            nn.Linear(hidden_dim, emb_dim),   # contraction
        )

    def forward(self, x):
        """Apply the three layers to ``x``; the last dimension keeps its size."""
        return self.layers(x)

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible
            by ``num_heads``.
        context_length: Side length of the stored causal mask, i.e. the
            largest number of tokens the mask covers.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of one head

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions a token must not attend to.
        causal_mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        """Return attention output of shape ``(b, num_tokens, d_out)`` for ``x`` of shape ``(b, num_tokens, d_in)``."""
        b, num_tokens, _ = x.shape

        def split_heads(projected):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            return projected.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products between every query and every key.
        attn_scores = queries @ keys.transpose(2, 3)

        # Cut the stored mask down to the current sequence length and block
        # the masked positions with -inf before the softmax.
        blocked = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(blocked, -torch.inf)

        attn_weights = torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, d_out)
        merged = (attn_weights @ values).transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)

class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual connection.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        # Attention sub-layer: normalize, attend, dropout, add the input back.
        attended = self.att(self.norm1(x))
        x = x + self.drop_shortcut(attended)

        # Feed-forward sub-layer: normalize, transform, dropout, add the input back.
        transformed = self.ff(self.norm2(x))
        return x + self.drop_shortcut(transformed)

class GPTModel(nn.Module):
    """Decoder-only transformer that maps token ids to next-token logits.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"`` and ``"n_layers"``, plus whatever
            ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.tok_emb = nn.Embedding(cfg["vocab_size"], emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return logits of shape ``(batch_size, seq_len, vocab_size)`` for ids of shape ``(batch_size, seq_len)``."""
        _, seq_len = in_idx.shape

        # Learned position embeddings for positions 0 .. seq_len - 1, created
        # on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)

        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

def generate_text_simple(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Extend ``idx`` autoregressively by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a ``(batch, n_tokens)`` id tensor to logits of
            shape ``(batch, n_tokens, vocab)``.
        idx: ``(batch, n_tokens)`` tensor of token ids forming the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        context_size: Only the last ``context_size`` ids are fed to the model.
        temperature: ``0.0`` selects greedy decoding; a positive value divides
            the logits before sampling from the softmax distribution.
        top_k: When not None, only the ``top_k`` largest logits of each row
            stay eligible.
        eos_id: When not None, generation for a row stops once it produces
            this id.

    Returns:
        ``(batch, n_tokens + generated)`` tensor. Generation ends early once
        every row has produced ``eos_id``; the ids of that final step are not
        appended (so a single-row result never contains the end token). Rows
        that finish before the others are filled with ``eos_id`` while the
        remaining rows keep generating.
    """
    # Per-row flag: has this row already produced eos_id?
    finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # logits for the next token only: (batch, vocab)

        # Top-k filtering: everything below the k-th largest logit of its row
        # becomes -inf. The slice keeps a trailing dimension of size 1 so the
        # threshold broadcasts across the vocabulary for any batch size
        # (a (batch,) threshold only worked for a batch of one).
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            # Temperature scaling followed by sampling.
            probs = torch.softmax(logits / temperature, dim=-1)  # (batch, vocab)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
        else:
            # Greedy decoding.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        if eos_id is not None:
            # Comparing per row avoids evaluating a multi-element tensor as a
            # single boolean, which raises for batches larger than one.
            finished = finished | (idx_next.squeeze(1) == eos_id)
            if finished.all():
                break
            # Rows that are already done keep receiving eos_id as filler.
            idx_next = torch.where(
                finished.unsqueeze(1), torch.full_like(idx_next, eos_id), idx_next
            )

        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens + 1)

    return idx

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch dimension of 1.

    The end-of-text marker is allowed as a special token during encoding.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(ids).unsqueeze(0)  # shape (1, number of ids)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a token-id tensor back to text after dropping a leading batch dimension of size 1."""
    return tokenizer.decode(token_ids.squeeze(0).tolist())

# Text corpus read in full into memory.
file_path = "filtered_articles.txt"

with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

# Rebinds the module-level `tokenizer` to the GPT-2 encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Corpus size in characters and in tokens. The token count encodes the whole
# corpus once.
# NOTE(review): the encode call does not pass allowed_special, unlike the
# other encode calls in this cell - confirm the corpus contains no
# special-token markers.
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from one long text.

    Args:
        txt: The text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.
        max_length: Number of tokens in every input (and target) window.
        stride: Distance in tokens between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the whole text once.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window starts; the range stops early enough that every target
        # window (shifted by one token) is still complete.
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=16, max_length=1024,
                         stride=512, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) pairs from ``txt``.

    Args:
        txt: Text that is tokenized with the GPT-2 encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Distance in tokens between the starts of consecutive windows.
        shuffle: Whether the loader shuffles the windows.
        drop_last: Whether an incomplete final batch is dropped.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # The tokenizer is created locally; the module-level one is not used here.
    windows = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# Character-level 90/10 split of the raw text corpus.
# NOTE(review): `train_data`, `val_data`, `train_loader` and `val_loader` are
# also the names of the instruction-dataset lists and loaders built earlier in
# this notebook. Running this cell rebinds all four to the raw-text versions,
# so whatever uses them afterwards depends on cell execution order - confirm
# which set the later cells expect.
# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

# Non-overlapping windows: the stride equals the window length.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

# Freshly initialized model (seeded), switched to evaluation mode.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval();


# Move the model to the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate training and validation loss without tracking gradients.

    Args:
        model: Module to evaluate; switched to eval mode for the measurement
            and put back into train mode afterwards.
        train_loader: Loader supplying the training batches.
        val_loader: Loader supplying the validation batches.
        device: Forwarded to ``calc_loss_loader``.
        eval_iter: Forwarded to ``calc_loss_loader`` as ``num_batches``.

    Returns:
        tuple: ``(train_loss, val_loss)`` as returned by ``calc_loss_loader``.
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, then validation; both evaluated inside no_grad
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    # The model is always left in train mode, whatever mode it was in on entry
    model.train()
    return losses

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate a 50-token continuation of ``start_context`` and print it on one line.

    Args:
        model: Model exposing ``pos_emb.weight``; put into eval mode for
            generation and back into train mode afterwards.
        tokenizer: Forwarded to ``text_to_token_ids`` / ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text to continue.
    """
    model.eval()

    # The context size is read from the first dimension of the positional-embedding weight
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():
        generated_ids = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )

    sample_text = token_ids_to_text(generated_ids, tokenizer)
    # Compact print format: newlines become spaces so the sample fits on one line
    print(" ".join(sample_text.split("\n")))

    model.train()

def save_checkpoint(epoch, model, optimizer, save_path="model_checkpoint1.pth"):
    """Write the epoch index plus model and optimizer state to ``save_path``.

    Args:
        epoch: Zero-based epoch index stored in the checkpoint (the log
            message reports it one-based).
        model: Object whose ``state_dict()`` is saved.
        optimizer: Object whose ``state_dict()`` is saved.
        save_path: Destination file for ``torch.save``.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(state, save_path)
    print(f"Checkpoint saved at epoch {epoch + 1}")

Characters: 121892397
Tokens: 26316250


In [26]:
# Rebuild the architecture, then restore the weights saved in the foundation checkpoint.
model = GPTModel(GPT_CONFIG_124M)
# map_location="cpu" places the loaded tensors on the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load("foundation_model_v2.pth", map_location="cpu"))
# Switch to eval mode (the model contains Dropout layers).
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [45]:
# Split sizes are derived from len(data); int() truncates, so the validation
# share absorbs any rounding remainder.
train_portion = int(len(data) * 0.85)  # 85% for training
test_portion = int(len(data) * 0.1)    # 10% for testing
val_portion = len(data) - train_portion - test_portion  # Remainder (about 5%) for validation; not used by the slices below

# Contiguous, order-preserving slices (no shuffling): train | test | validation.
# NOTE(review): this rebinds train_data / val_data, which the pretraining cell
# assigned from text_data. Cells that use these names depend on execution order.
train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

In [30]:
torch.manual_seed(123)
# Format the first validation example into a prompt and show it.
input_text = format_input(val_data[0]) # first entry of the validation split
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Provide wellbeing insight for the below text with anxiety.

### Input:
Normal? Ive been stuck in a panic attack cycle for about 7 months now. Constant. Once it stops it immediately picks back up again. Today, I prayed to God and asked him to help me, to take this pain away, and since then my internal shaking has stopped and I havent had any health anxiety thoughts. I havent felt like this is months and Im freaked out to have panic attacks and freaked out once I dont have them anymore. Help


In [31]:
# Generate up to 35 new tokens continuing the formatted prompt, then decode the
# full sequence (prompt + continuation) back to text.
# NOTE(review): the encoded prompt is not moved to a device here; this presumably
# relies on the model being on the CPU. Confirm.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer),
    max_new_tokens=35,
    context_size=GPT_CONFIG_124M["context_length"],
    eos_id=50256,  # presumably the GPT-2 <|endoftext|> token id; TODO confirm
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [32]:
# Drop the first len(input_text) characters so only the continuation remains.
# Assumes generated_text starts with input_text verbatim; TODO confirm that the
# encode/decode round-trip preserves the prompt exactly.
response_text = generated_text[len(input_text):].strip()
print(response_text)

me.

### Response:

Ive been stuck in a panic attack cycle for about 7 months now. Constant. Once it stops it immediately picks back up again


# Finetuning the LLM

In [43]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Tensor fed to ``model`` after being moved to ``device``.
        target_batch: Tensor of target class indices, moved to ``device``.
        model: Callable mapping the input batch to logits.
        device: Device both batches are moved to.

    Returns:
        The tensor returned by ``torch.nn.functional.cross_entropy``.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    logits = model(inputs)

    # Merge the first two logit dimensions and flatten the targets so each
    # position is scored as an independent classification.
    flat_logits = logits.flatten(0, 1)
    flat_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first ``num_batches`` batches of a loader.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)`` pairs.
        model: Forwarded to ``calc_loss_batch``.
        device: Forwarded to ``calc_loss_batch``.
        num_batches: Maximum number of batches to evaluate. ``None`` (default)
            evaluates every batch; values above ``len(data_loader)`` are
            clamped to the loader length.

    Returns:
        float: Mean of the evaluated per-batch losses, or ``nan`` when there is
        nothing to average (empty loader, or ``num_batches`` <= 0).
    """
    available = len(data_loader)
    if num_batches is None:
        num_batches = available
    else:
        # Reduce the number of batches to match the total number of batches in the
        # data loader if num_batches exceeds it
        num_batches = min(num_batches, available)

    # Covers the empty loader as well as num_batches <= 0, which would otherwise
    # divide by zero (or by a negative count) below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    # range() is the first zip argument on purpose: zip stops as soon as the
    # range is exhausted, so no extra batch is pulled from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Run a basic training loop with periodic evaluation and per-epoch sampling.

    Args:
        model: Model being trained.
        train_loader: Iterable of ``(input_batch, target_batch)`` training pairs.
        val_loader: Loader forwarded to ``evaluate_model``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Forwarded to the loss, evaluation and sampling helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever the global step is a multiple of this value.
        eval_iter: Forwarded to ``evaluate_model``.
        start_context: Prompt used for the sample printed after each epoch.
        tokenizer: Forwarded to ``generate_and_print_sample``.

    Returns:
        tuple: ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per evaluation step.
    """
    train_history = []
    val_history = []
    tokens_history = []
    running_tokens = 0
    # Starts at -1 and is incremented after each optimizer step, so the first
    # batch is step 0 and always triggers an evaluation.
    global_step = -1

    for epoch in range(num_epochs):
        model.train()

        for inputs, targets in train_loader:
            # Standard update: clear old gradients, backprop the batch loss, step
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(inputs, targets, model, device)
            batch_loss.backward()
            optimizer.step()

            running_tokens += inputs.numel()  # every element of the input batch counts as one token
            global_step += 1

            # Skip the (optional) evaluation on steps that are not multiples of eval_freq
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_history.append(train_loss)
            val_history.append(val_loss)
            tokens_history.append(running_tokens)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Show a generated sample once per epoch
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_history, val_history, tokens_history


In [None]:
model.to(device)

torch.manual_seed(123)

# Loss estimated on at most 5 batches of each loader.
# no_grad: gradients are not needed for this measurement.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

In [None]:
import time

# Wall-clock timer for the whole training run.
start_time = time.time()

torch.manual_seed(123)

# AdamW over all model parameters with a small learning rate and weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 1

# Evaluate every 5 optimizer steps on at most 5 batches per loader. After each
# epoch a sample is generated from the first validation example's formatted prompt.
# NOTE(review): `tokenizer`, `format_input` and an indexable `val_data` must
# already be defined by earlier cells.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

# Report the elapsed time in minutes.
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 10.737, Val loss 10.753
Ep 1 (Step 000005): Train loss 9.778, Val loss 9.767
Ep 1 (Step 000010): Train loss 9.415, Val loss 9.405
Ep 1 (Step 000015): Train loss 9.302, Val loss 9.197
Ep 1 (Step 000020): Train loss 9.010, Val loss 9.015
Ep 1 (Step 000025): Train loss 8.851, Val loss 8.849
Ep 1 (Step 000030): Train loss 8.751, Val loss 8.697
Ep 1 (Step 000035): Train loss 8.588, Val loss 8.556
Ep 1 (Step 000040): Train loss 8.436, Val loss 8.423
Ep 1 (Step 000045): Train loss 8.367, Val loss 8.312
Ep 1 (Step 000050): Train loss 8.273, Val loss 8.216
Ep 1 (Step 000055): Train loss 8.182, Val loss 8.120
Ep 1 (Step 000060): Train loss 8.084, Val loss 8.036
Ep 1 (Step 000065): Train loss 7.997, Val loss 7.969
Ep 1 (Step 000070): Train loss 7.887, Val loss 7.902
Ep 1 (Step 000075): Train loss 7.893, Val loss 7.848
Ep 1 (Step 000080): Train loss 7.817, Val loss 7.801
Ep 1 (Step 000085): Train loss 7.772, Val loss 7.763
Ep 1 (Step 000090): Train loss 7.597, Val lo