# **Check for GPU Presence and Usability**

In [2]:
!nvidia-smi

Wed Aug  6 18:10:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.43                 Driver Version: 566.43         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   49C    P0             15W /   73W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# **Importing required packages**

In [64]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm.auto import tqdm

# **Setting The Configurations (Hyperparameters for this Model)**

In [83]:
# Central configuration: data location, model size, training schedule,
# special token ids and decoding limit. Every later cell reads from this dict.
CONFIG = {
    # Path of the prompt/code CSV. Set the WEBSITE_DATA_CSV environment variable
    # to point the notebook at a different copy of the data without editing this
    # cell; when it is unset the original local path is used.
    'csv_file_path': os.environ.get(
        'WEBSITE_DATA_CSV',
        r"C:\Users\Webbies\Jupyter_Notebooks\LLM_FineTune\Website_Data_Samples_1000.csv",
    ),
    'train_ratio': 0.8,          # fraction of rows used for training
    'd_model': 256,              # embedding / hidden width of the Transformer
    'nhead': 4,                  # attention heads
    'num_encoder_layers': 2,
    'num_decoder_layers': 2,
    'dim_feedforward': 512,
    'dropout': 0.1,
    'num_epochs': 200,           # number of full passes over the training set
    'batch_size': 4,             # sequences per optimisation step
    'learning_rate': 1e-4,
    # Special token ids. build_vocab numbers real characters from 3 upwards,
    # so these three ids must stay within 0..2.
    'pad_token': 2,
    'sos_token': 0,
    'eos_token': 1,
    'max_seq_len': 512,          # maximum number of decoding steps in translate()
    'model_save_path': 'AI_WebsiteDesigner_LLM.pth'
}

<h1 style = "color:green">Future Modifications That are Needed for this Complete Process</h1>

* Set the number of epochs to 300 or 400 and add early stopping to keep the model from overfitting. More epochs are needed because the training loss is still decreasing at the end of the current 200-epoch run.
* Increase the batch size from 4 to 16 or even 32 to speed up the training process.
* Increase the value of the 'max_seq_len' parameter from 512 to 1024 or more so that longer code can be generated.
* **All of these modifications will lengthen the training process and consume more memory and GPU resources.**
* Currently, this model-building process uses the F1 score and the BLEU score, which compare tokens between the generated output and the actual output. These metrics are more informative than a plain exact-match accuracy score, because with exact matching a single missing whitespace character or comma turns an otherwise correct output into a complete mismatch.

<h3 style = "color:green">What are the Key Features to Improve the Performance</h3>

* Increase the number of Epochs (currently 200). Make it 300 to 400
* Increase the volume of the data
* Increase the Batch size to 16 or 32
* Use the F1 score or BLEU as the evaluation metric
* Increase the sequence length to 1024 or more to generate longer code

# **Download necessary NLTK data (this only needs to be run once)**

In [69]:
# nltk.data.find signals a missing resource by raising LookupError, so that is
# the exception that has to be caught for the download fallback to run.
try:
    nltk.data.find('tokenizers/punkt')
    print("Data Already Found")
except LookupError:
    print("Downloading 'punkt' data for NLTK...")
    nltk.download('punkt')

Data Already Found


# **Use GPU if available**

In [8]:
# Pick the compute device once; later cells move the model and batches onto it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


# **Data Loading and Preprocessing**

In [10]:
def load_process_split_data(csv_file_path, train_ratio):
    """
    Load the prompt/code CSV and split it into training and test pairs.

    Args:
        csv_file_path: Path of the CSV file. The columns 'llm_generated_idea'
            and 'text' are renamed to 'prompt' and 'code'.
        train_ratio: Fraction of rows assigned to the training split; the
            remainder becomes the test split.

    Returns:
        A tuple (train_data, test_data, prompts, codes). The first two are
        lists of (prompt, code) tuples; the last two are lists holding every
        prompt and every code of the whole dataset (train and test together).
        All four values are None when the file does not exist.

    Raises:
        KeyError: if the file lacks the prompt or code column.
    """
    try:
        raw_df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print(f"Error: The file '{csv_file_path}' was not found. Please ensure it is in the correct directory.")
        return None, None, None, None

    # Rename columns for clarity and consistency (returns a new frame so the
    # raw data is left untouched).
    df = raw_df.rename(columns={'llm_generated_idea': 'prompt', 'text': 'code'})

    # Fail early with a readable message instead of a KeyError deep in the split.
    missing_columns = [name for name in ('prompt', 'code') if name not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Required column(s) {missing_columns} not found in '{csv_file_path}'. "
            f"Available columns: {list(df.columns)}"
        )

    # Empty CSV cells are read as NaN floats, which cannot be joined or
    # tokenized character by character later on, so drop incomplete rows and
    # make sure every remaining value is a string.
    rows_before = len(df)
    df = df.dropna(subset=['prompt', 'code'])
    if len(df) != rows_before:
        print(f"Dropped {rows_before - len(df)} row(s) with a missing prompt or code.")
    df = df.astype({'prompt': str, 'code': str})

    # Split the data into training and testing sets (fixed seed for repeatability)
    train_df, test_df = train_test_split(df, test_size=1-train_ratio, random_state=42)

    train_data = list(zip(train_df['prompt'], train_df['code']))
    test_data = list(zip(test_df['prompt'], test_df['code']))

    prompts = df['prompt'].tolist()
    codes = df['code'].tolist()

    return train_data, test_data, prompts, codes

# **Building the Complete Vocabulary for this Project**

In [12]:
def build_vocab(prompts, codes):
    """
    Create separate character-level vocabularies for prompts and for codes.

    Each vocabulary maps every distinct character (of the texts joined with
    single spaces) to an id starting at 3, in sorted character order, and
    then adds the '<sos>', '<eos>' and '<pad>' entries with the ids taken
    from CONFIG.

    Returns:
        (prompt_char_to_idx, prompt_idx_to_char,
         code_char_to_idx, code_idx_to_char,
         max_prompt_len, max_code_len), where the two lengths are the longest
        text in characters plus 2 for the <sos> and <eos> tokens.
    """
    reserved = {
        '<sos>': CONFIG['sos_token'],
        '<eos>': CONFIG['eos_token'],
        '<pad>': CONFIG['pad_token'],
    }

    def _make_lookups(texts):
        # Sorted alphabet gives a reproducible char -> id assignment.
        alphabet = sorted(set(" ".join(texts)))
        forward = {}
        for char_id, symbol in enumerate(alphabet, start=3):
            forward[symbol] = char_id
        forward.update(reserved)
        backward = {char_id: symbol for symbol, char_id in forward.items()}
        return forward, backward

    prompt_char_to_idx, prompt_idx_to_char = _make_lookups(prompts)
    code_char_to_idx, code_idx_to_char = _make_lookups(codes)

    # +2 leaves room for <sos> and <eos>
    max_prompt_len = max(map(len, prompts)) + 2
    max_code_len = max(map(len, codes)) + 2

    print(f"Prompt vocab size: {len(prompt_char_to_idx)}")
    print(f"Code vocab size: {len(code_char_to_idx)}")
    print(f"Maximum prompt length: {max_prompt_len}")
    print(f"Maximum code length: {max_code_len}")

    return (
        prompt_char_to_idx,
        prompt_idx_to_char,
        code_char_to_idx,
        code_idx_to_char,
        max_prompt_len,
        max_code_len,
    )

# **The Tokenization Process**

In [14]:
def tokenize(text, char_to_idx, add_sos=True, add_eos=True):
    """
    Encode a string character by character into a list of token ids.

    Every character is looked up in ``char_to_idx`` (a character missing from
    the mapping raises KeyError). The <sos> id from CONFIG is put in front
    when ``add_sos`` is true and the <eos> id is appended when ``add_eos`` is
    true.
    """
    body = [char_to_idx[symbol] for symbol in text]
    head = [CONFIG['sos_token']] if add_sos else []
    tail = [CONFIG['eos_token']] if add_eos else []
    return head + body + tail

# **PyTorch Dataset and DataLoader & The Collate Function**

In [16]:
class NLPtoCodeDataset(Dataset):
    """
    Dataset over (prompt, code) text pairs that tokenizes on access.

    Indexing returns a pair of 1-D ``torch.long`` tensors: the prompt encoded
    with ``prompt_char_to_idx`` and the code encoded with ``code_char_to_idx``,
    each produced by ``tokenize`` with its default <sos>/<eos> handling.
    """

    def __init__(self, data, prompt_char_to_idx, code_char_to_idx):
        # data: sequence of (prompt, code) string pairs
        self.data = data
        self.prompt_char_to_idx = prompt_char_to_idx
        self.code_char_to_idx = code_char_to_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt_text, code_text = self.data[idx]
        # Encode prompt first, then code, each with its own vocabulary.
        encoded = [
            torch.tensor(tokenize(text, lookup), dtype=torch.long)
            for text, lookup in (
                (prompt_text, self.prompt_char_to_idx),
                (code_text, self.code_char_to_idx),
            )
        ]
        return encoded[0], encoded[1]

In [17]:
def collate_fn(batch):
    """
    Turn a list of (prompt_tensor, code_tensor) items into two padded batches.

    Prompts and codes are padded separately, on the right, with the pad id
    from CONFIG so that each group becomes one (batch, longest_length) tensor.
    Both tensors are moved to the global ``device`` before being returned as
    (prompts_padded, codes_padded).
    """
    prompt_seqs, code_seqs = zip(*batch)
    pad_id = CONFIG['pad_token']
    padded = [
        pad_sequence(seqs, batch_first=True, padding_value=pad_id)
        for seqs in (prompt_seqs, code_seqs)
    ]
    return padded[0].to(device), padded[1].to(device)

# **Transformer Model Definition**

In [19]:
class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to embeddings, then dropout.

    A table of shape (1, max_len, d_model) is precomputed: even feature
    columns hold sin(position * frequency) and odd columns hold the matching
    cosine. It is stored as the buffer ``pe`` so it follows the module across
    devices without being a trainable parameter.

    Args:
        d_model: Embedding width. Both even and odd values are supported.
        dropout: Dropout probability applied after the addition.
        max_len: Longest sequence length the table covers; ``forward`` cannot
            handle inputs whose second dimension exceeds it.
    """

    def __init__(self, d_model, dropout, max_len):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # one column per sin/cos frequency
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the last frequency has no cosine slot and must be dropped here.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Leading batch axis of size 1 so the table broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.size(1)`` table rows to ``x`` (indexed as batch, seq, feature) and apply dropout."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [20]:
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead,
                 num_encoder_layers, num_decoder_layers, dim_feedforward,
                 dropout, max_src_len, max_tgt_len):  # Added max_src_len and max_tgt_len
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Instantiate PositionalEncoding with the dynamic max lengths
        self.pos_encoder_src = PositionalEncoding(d_model, dropout, max_len=max_src_len)
        self.pos_encoder_tgt = PositionalEncoding(d_model, dropout, max_len=max_tgt_len)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_padding_mask=None, tgt_padding_mask=None):
        src = self.pos_encoder_src(self.src_embedding(src))
        tgt = self.pos_encoder_tgt(self.tgt_embedding(tgt))
        output = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask,
                                  src_key_padding_mask=src_padding_mask,
                                  tgt_key_padding_mask=tgt_padding_mask)
        return self.fc_out(output)

# **Training and Evaluation Functions**

In [22]:
def create_masks(src, tgt):
    """
    Build the padding masks and the causal mask for one source/target batch.

    Args:
        src: Source token ids, (batch, src_len).
        tgt: Target token ids that will be fed to the decoder, (batch, tgt_len).

    Returns:
        (src_padding_mask, tgt_padding_mask, tgt_mask): two boolean tensors
        that are True wherever the input equals the pad id from CONFIG, and a
        (tgt_len, tgt_len) square subsequent mask that stops a position from
        attending to later positions.
    """
    pad_id = CONFIG['pad_token']
    src_padding_mask = (src == pad_id)
    tgt_padding_mask = (tgt == pad_id)
    tgt_len = tgt.size(1)
    # Place the causal mask on the same device as the tensors it is combined
    # with, rather than on a notebook-global device that may differ.
    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_len).to(tgt.device)
    return src_padding_mask, tgt_padding_mask, tgt_mask

In [23]:
def train_one_epoch(model, dataloader, optimizer, criterion):
    """
    Run one pass over ``dataloader`` with teacher forcing and return the mean batch loss.

    For each (source, target) batch the decoder receives the target without
    its last token and is trained to predict the target without its first
    token. Gradients are clipped to a total norm of 1.0 before every
    optimizer step.
    """
    model.train()
    running_loss = 0
    for source_ids, target_ids in dataloader:
        # Shift by one: the decoder sees tokens [0..n-2] and must predict [1..n-1].
        decoder_in, decoder_expected = target_ids[:, :-1], target_ids[:, 1:]

        src_pad, tgt_pad, causal = create_masks(source_ids, decoder_in)
        logits = model(
            source_ids,
            decoder_in,
            src_mask=None,
            tgt_mask=causal,
            src_padding_mask=src_pad,
            tgt_padding_mask=tgt_pad,
        )

        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the criterion.
        vocab_size = logits.size(-1)
        batch_loss = criterion(logits.view(-1, vocab_size), decoder_expected.reshape(-1))
        running_loss += batch_loss.item()

        optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    batch_count = len(dataloader)
    return running_loss / batch_count

In [24]:
def evaluate_model(model, test_data, prompt_char_to_idx, code_idx_to_char):
    """
    Exact-match accuracy of the model on the test set.

    Args:
        model: The trained model; it is switched to evaluation mode.
        test_data: Sequence of (prompt, ground_truth_code) string pairs.
        prompt_char_to_idx: Mapping used to encode prompts for ``translate``.
        code_idx_to_char: Mapping used by ``translate`` to decode output ids.

    Returns:
        Percentage (0-100) of test items whose generated code equals the
        ground truth after stripping surrounding whitespace. Returns 0.0 for
        an empty test set.
    """
    model.eval()
    total_predictions = len(test_data)
    if total_predictions == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    correct_predictions = 0
    for prompt, ground_truth_code in test_data:
        generated_code = translate(model, prompt, prompt_char_to_idx, code_idx_to_char)
        # Remove special-token markers from the ground truth for a fair comparison
        ground_truth_cleaned = ground_truth_code.replace('<sos>', '').replace('<eos>', '')
        if generated_code.strip() == ground_truth_cleaned.strip():
            correct_predictions += 1

    accuracy = (correct_predictions / total_predictions) * 100
    return accuracy

###  **New Evaluation Function with F1 and BLEU scores**

In [72]:
def evaluate_model_with_metrics(model, test_data, prompt_char_to_idx, code_idx_to_char):
    """
    Evaluates the model on the test set using F1-score, Precision, Recall, and BLEU score.
    This provides a more robust measure of partial correctness than exact matching.

    Precision, recall and F1 are computed position by position over character
    ids: for every test item the generated text and the ground truth are
    encoded, the shorter one is right-padded with the pad id, and the two id
    sequences of all items are concatenated before scoring with 'weighted'
    averaging. BLEU is computed per item on character tokens and averaged.

    Args:
        model: The trained Transformer model.
        test_data: The list of test prompt-code pairs.
        prompt_char_to_idx: Dictionary mapping prompt characters to indices.
        code_idx_to_char: Dictionary mapping code indices to characters.

    Returns:
        A tuple containing the calculated precision, recall, f1_score, and
        bleu_score. All four are 0.0 when ``test_data`` is empty.
    """
    model.eval()

    if len(test_data) == 0:
        # No samples: there is nothing to average or score.
        return 0.0, 0.0, 0.0, 0.0

    # Derive the char -> id table from the argument instead of relying on a
    # notebook-level variable, so the function works with whatever vocabulary
    # the caller passes in.
    code_char_to_idx = {char: idx for idx, char in code_idx_to_char.items()}
    pad_id = CONFIG['pad_token']

    all_ground_truth_tokens = []
    all_generated_tokens = []

    bleu_scores = []
    chencherry = SmoothingFunction()

    # Disable gradient calculations for evaluation
    with torch.no_grad():
        for prompt, ground_truth_code in tqdm(test_data, desc="Evaluating"):
            # Generate code from the prompt
            generated_code = translate(model, prompt, prompt_char_to_idx, code_idx_to_char)

            # --- Prepare data for F1-score ---
            # Characters that are not in the vocabulary fall back to the pad id.
            ground_truth_token_ids = [code_char_to_idx.get(char, pad_id) for char in ground_truth_code]
            generated_token_ids = [code_char_to_idx.get(char, pad_id) for char in generated_code]

            # Pad the shorter sequence to match the length of the longer one
            max_len = max(len(ground_truth_token_ids), len(generated_token_ids))
            ground_truth_token_ids += [pad_id] * (max_len - len(ground_truth_token_ids))
            generated_token_ids += [pad_id] * (max_len - len(generated_token_ids))

            all_ground_truth_tokens.extend(ground_truth_token_ids)
            all_generated_tokens.extend(generated_token_ids)

            # --- Prepare data for BLEU score ---
            # Tokenize using characters; sentence_bleu expects a list of references.
            reference_tokens = [list(ground_truth_code)]
            candidate_tokens = list(generated_code)

            # Smoothing keeps the score defined when some n-gram order has no match
            score = sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=chencherry.method1)
            bleu_scores.append(score)

    # Calculate overall F1, Precision, and Recall scores for all tokens
    # 'weighted' averaging is used to account for class imbalance
    precision = precision_score(all_ground_truth_tokens, all_generated_tokens, average='weighted', zero_division=0)
    recall = recall_score(all_ground_truth_tokens, all_generated_tokens, average='weighted', zero_division=0)
    f1 = f1_score(all_ground_truth_tokens, all_generated_tokens, average='weighted', zero_division=0)

    # Calculate the average BLEU score
    avg_bleu = np.mean(bleu_scores)

    return precision, recall, f1, avg_bleu

In [25]:
def translate(model, prompt, prompt_char_to_idx, code_idx_to_char):
    """
    Greedily decode code text for one prompt.

    The prompt is tokenized and fed to the model together with the tokens
    generated so far (starting from <sos>). At each step the highest-scoring
    next token is taken; decoding stops when that token is <eos> or after
    CONFIG['max_seq_len'] steps. The generated ids (without the leading
    <sos>) are mapped back to characters and joined into a string.
    """
    model.eval()
    encoded_prompt = tokenize(prompt, prompt_char_to_idx)
    src_tensor = torch.tensor(encoded_prompt, dtype=torch.long).unsqueeze(0).to(device)
    generated_ids = [CONFIG['sos_token']]

    with torch.no_grad():
        steps_taken = 0
        while steps_taken < CONFIG['max_seq_len']:
            steps_taken += 1
            # The whole prefix is re-fed each step; only the last position is read.
            tgt_tensor = torch.tensor(generated_ids, dtype=torch.long).unsqueeze(0).to(device)
            causal_mask = create_masks(src_tensor, tgt_tensor)[2]
            logits = model(src_tensor, tgt_tensor, tgt_mask=causal_mask)
            best_id = torch.argmax(logits[:, -1, :], dim=-1).item()
            if best_id == CONFIG['eos_token']:
                break
            generated_ids.append(best_id)

    pieces = []
    for token_id in generated_ids[1:]:
        if token_id != CONFIG['eos_token']:
            pieces.append(code_idx_to_char[token_id])
    return "".join(pieces)

# **The Main Execution Block Starts from Here**

In [27]:
# 1. Data Preparation
# load_process_split_data returns four None values when the CSV cannot be found.
train_data, test_data, prompts, codes = load_process_split_data(CONFIG['csv_file_path'], CONFIG['train_ratio'])
if train_data is None:
    # Raise instead of calling exit(): inside a notebook exit() stops the kernel
    # rather than reporting an error, whereas an exception halts "Run All" at
    # this cell with a visible message and keeps the session alive.
    raise RuntimeError("Exiting due to data loading failure.")

In [28]:
# 2. Vocabulary Building
# `prompts` and `codes` hold every row of the dataset (train and test), so the
# character vocabularies and maximum lengths cover both splits.
# The maximum lengths are used below to size the model's positional encodings.
(prompt_char_to_idx, prompt_idx_to_char,
 code_char_to_idx, code_idx_to_char,
 max_prompt_len, max_code_len) = build_vocab(prompts, codes)

Prompt vocab size: 71
Code vocab size: 98
Maximum prompt length: 476
Maximum code length: 4805


In [29]:
# 3. Wrap the train and test pairs in Dataset objects and build the training DataLoader
# Items are tokenized lazily when indexed; collate_fn pads each batch and moves it to `device`.
train_dataset = NLPtoCodeDataset(train_data, prompt_char_to_idx, code_char_to_idx)
test_dataset = NLPtoCodeDataset(test_data, prompt_char_to_idx, code_char_to_idx)

# Only the training set gets a DataLoader; it is reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, collate_fn=collate_fn)

# Quick sanity check of the split sizes and the number of batches per epoch.
print(f"Training data size: {len(train_dataset)}")
print(f"Test data size: {len(test_dataset)}")
print(f"Number of training batches: {len(train_dataloader)}")

Training data size: 800
Test data size: 200
Number of training batches: 200


In [30]:
# 4. Model Initialization
# Vocabulary sizes come from the lookup tables (special tokens included).
# The target positional table must cover both the longest training code and
# the longest sequence translate() can feed to the decoder
# (CONFIG['max_seq_len'] tokens); otherwise decoding fails with a size
# mismatch whenever the dataset's codes are shorter than the decoding limit.
model = TransformerModel(
    src_vocab_size=len(prompt_char_to_idx),
    tgt_vocab_size=len(code_char_to_idx),
    d_model=CONFIG['d_model'],
    nhead=CONFIG['nhead'],
    num_encoder_layers=CONFIG['num_encoder_layers'],
    num_decoder_layers=CONFIG['num_decoder_layers'],
    dim_feedforward=CONFIG['dim_feedforward'],
    dropout=CONFIG['dropout'],
    max_src_len=max_prompt_len,
    max_tgt_len=max(max_code_len, CONFIG['max_seq_len'])
).to(device)

In [31]:
# 5. Setting the Optimizer and Loss function
# Adam with the learning rate from CONFIG and explicit betas/eps.
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['learning_rate'], betas=(0.9, 0.98), eps=1e-9)
# Positions whose target is the pad token are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=CONFIG['pad_token'])

In [32]:
# 6. The Main Training Loop
# Runs a fixed number of epochs; there is no validation pass or early stopping here.
for epoch in range(CONFIG['num_epochs']):
    avg_loss = train_one_epoch(model, train_dataloader, optimizer, criterion)
    # Report the mean training loss every 10th epoch to keep the output short.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{CONFIG['num_epochs']}], Loss: {avg_loss:.4f}")

print("--- Training complete. ---")

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch [10/200], Loss: 1.7585
Epoch [20/200], Loss: 1.4368
Epoch [30/200], Loss: 1.2272
Epoch [40/200], Loss: 1.0830
Epoch [50/200], Loss: 0.9858
Epoch [60/200], Loss: 0.9022
Epoch [70/200], Loss: 0.8307
Epoch [80/200], Loss: 0.7680
Epoch [90/200], Loss: 0.7181
Epoch [100/200], Loss: 0.6768
Epoch [110/200], Loss: 0.6421
Epoch [120/200], Loss: 0.6107
Epoch [130/200], Loss: 0.5837
Epoch [140/200], Loss: 0.5589
Epoch [150/200], Loss: 0.5371
Epoch [160/200], Loss: 0.5187
Epoch [170/200], Loss: 0.5023
Epoch [180/200], Loss: 0.4848
Epoch [190/200], Loss: 0.4698
Epoch [200/200], Loss: 0.4571
--- Training complete. ---


In [33]:
# 7. Save the trained model
# Only the state_dict is written, to the path given by CONFIG['model_save_path'].
print("\n--- Saving the trained model ---")
torch.save(model.state_dict(), CONFIG['model_save_path'])
print(f"Model saved to {CONFIG['model_save_path']}")


--- Saving the trained model ---
Model saved to transformer_llm.pth


In [74]:
# 8. The Modified Evaluation Process for getting the F1 and BLEU scores
print("\n--- Evaluating model performance with F1-score and BLEU score ---")
# Call the new evaluation function; it generates code for every test prompt.
precision, recall, f1, bleu = evaluate_model_with_metrics(model, test_data, prompt_char_to_idx, code_idx_to_char)

# All four scores are printed with four decimal places.
print(f"Model Precision on Test Set: {precision:.4f}")
print(f"Model Recall on Test Set: {recall:.4f}")
print(f"Model F1-Score on Test Set: {f1:.4f}")
print(f"Model BLEU Score on Test Set: {bleu:.4f}")


--- Evaluating model performance with F1-score and BLEU score ---


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

Model Precision on Test Set: 0.3055
Model Recall on Test Set: 0.0968
Model F1-Score on Test Set: 0.1453
Model BLEU Score on Test Set: 0.1168


In [76]:
# 9. Final Inference Example
# Generate code for the first test pair and show it next to the reference.
# translate() stops after CONFIG['max_seq_len'] decoding steps, so the generated
# text can end mid-page even when the ground truth is longer.
print("\n--- Final Inference Example ---")
test_prompt, ground_truth_code = test_data[0]
generated_code = translate(model, test_prompt, prompt_char_to_idx, code_idx_to_char)

print(f"Test Prompt: '{test_prompt}'")
print("-" * 50)
print(f"Generated Code: \n{generated_code}")
print("-" * 50)
print(f"Ground Truth Code:\n{ground_truth_code}")


--- Final Inference Example ---
Test Prompt: 'Tech Company: A minimalist design with a large, central hero image, navigation menu on the top, and a left sidebar for information about the team, services, and contact details. The footer can have a simple layout with social media icons and copyright information.'
--------------------------------------------------
Generated Code: 
<html>
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
<body class="bg-gray-100 font-sans leading-normal tracking-normal">
    <header class="bg-white text-center">
         <h1 class="text-4xl font-bold mb-4">Welcome to Our Tech Company</h1>
          <p class="text-lg text-gray-600">
                At Tech Company, we are dedicated to providing innovative and services. Our team of experts is committed to delivering the best possible services. We are co
--------------------------------------------------
Ground Truth Code:
<html>
<link href="https://cdn.jsdel