<a href="https://colab.research.google.com/github/angelaxli/DomainRAG/blob/main/FINALRag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas faiss-cpu numpy pytrends transformers datasets torch scikit-learn evaluate

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux

In [None]:
from scipy.stats import mstats  # Import for winsorizing
from transformers import Trainer  # Import the base Trainer class
import torch.nn as nn

In [None]:
import torch
import evaluate
import faiss
import gc
import pandas as pd
import numpy as np
from transformers import (
    DPRQuestionEncoder,
    DPRContextEncoder,
    DPRQuestionEncoderTokenizer,
    DPRContextEncoderTokenizer,
    get_linear_schedule_with_warmup,
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
import logging
import os

In [None]:
# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
class Config:
    csv_path = "/content/Knowledge Base Real (9).csv"  # Replace with your actual path
    question_encoder_model = "facebook/dpr-question_encoder-single-nq-base"
    context_encoder_model = "facebook/dpr-ctx_encoder-single-nq-base"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bf16 = torch.float32  # Always use float32
    num_negatives = 10
    max_length = 128  # Keep consistent max length, 500 is likely too long and may cause OOM
    epochs = 10
    patience = 2
    accumulation_steps = 4
    batch_size = 2
    learning_rate = 1e-5
    warmup_steps = 0  # Could consider a small warmup, e.g., 10% of total steps
    max_grad_norm = 1.0 # For gradient clipping
    log_every = 10
    output_model_path = "best_dpr_model.pth"
    tensorboard_log_dir = "runs/dpr_training"

config = Config()

try:
    df = pd.read_csv(config.csv_path)
except FileNotFoundError:
    logger.error(f"Error: CSV file not found at {config.csv_path}")
    exit(1)

# --- Inspect Raw Price Values ---
print("Original Price Statistics:")
print(df['Price'].describe())
print("\nOriginal Price - Smallest 5 Values:\n", df['Price'].nsmallest(5))
print("\nOriginal Price - Largest 5 Values:\n", df['Price'].nlargest(5))


# --- Data Preprocessing ---

# 1. Winsorize the 'Price' column (handle outliers)
df['Price'] = mstats.winsorize(df['Price'], limits=[0.01, 0.01])  # Winsorize at 1st and 99th percentiles

# 2. Log Transformation
df['Price'] = np.log1p(df['Price'])

# --- Data Normalization ---
price_scaler = MinMaxScaler()  # Scaler for Price (log-transformed)
other_scaler = MinMaxScaler() # Scaler for the other numeric columns.
for col in ['Price', 'Monthly Searches', 'CPC (Exact)', 'CPC (Phrase)']:
    if col in df.columns:
        if col == 'Price':
            df[col] = price_scaler.fit_transform(df[[col]]) # Then scale
        else:
            # Use the other_scaler for other columns
            df[col] = other_scaler.fit_transform(df[[col]])
    else:
        logger.warning(f"Column {col} not found in dataframe. Skipping normalization.")

Original Price Statistics:
count       149.000000
mean       7924.308725
std       21193.447977
min         160.000000
25%        3020.000000
50%        3827.000000
75%        6000.000000
max      200000.000000
Name: Price, dtype: float64

Original Price - Smallest 5 Values:
 52     160
17     206
71     266
30     310
143    570
Name: Price, dtype: int64

Original Price - Largest 5 Values:
 53     200000
5      130000
6      115000
124     22150
99      21100
Name: Price, dtype: int64


In [None]:
# --- Model and Tokenizer Initialization ---
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(config.question_encoder_model)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(config.context_encoder_model)

question_encoder = DPRQuestionEncoder.from_pretrained(config.question_encoder_model).to(config.device)
context_encoder = DPRContextEncoder.from_pretrained(config.context_encoder_model).to(config.device)


# --- Keyword Preprocessing ---
def preprocess_keywords(keywords):
    if isinstance(keywords, str):
        keywords = keywords.lower()
        tokens = context_tokenizer.tokenize(keywords)
        tokens = [token for token in tokens if token.isalnum() and token not in context_tokenizer.all_special_tokens]
        return context_tokenizer.convert_tokens_to_string(tokens)
    else:
        return ""

df['Keywords'] = df['Keywords'].apply(preprocess_keywords)

# --- Context Creation Function ---
def create_context_string(row):
    price_bins = np.digitize(row['Price'], bins=[0, 0.25, 0.5, 0.75, 1]) if 'Price' in row else "N/A"
    monthly_searches_log = np.log1p(row['Monthly Searches']) if 'Monthly Searches' in row else "N/A"

    context_parts = [
        f"Domain: {row.get('Domain Name', 'N/A')}",
        f"Price: {price_bins}",
        f"Date: {row.get('Date', 'N/A')}",
        f"Keywords: {row.get('Keywords', 'N/A')}",
        f"Monthly Searches: {monthly_searches_log:.4f}" if isinstance(monthly_searches_log, float) else "N/A",
        f"CPC (Exact): {row.get('CPC (Exact)', 'N/A'):.4f}" if 'CPC (Exact)' in row else "N/A", # Use get and check
        f"CPC (Phrase): {row.get('CPC (Phrase)', 'N/A'):.4f}" if 'CPC (Phrase)' in row else "N/A", # Use get and check
        f"Length: {row.get('Length', 'N/A')}",
        f"Hyphens: {not row.get('Excludes Hyphens', True)}",
        f"Category: {row.get('Category', 'N/A')}"
    ]
    return ", ".join(context_parts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def generate_similarity_based_hard_negatives(df, context_encoder, context_tokenizer, num_negatives=3):
    """
    Generates hard negatives using FAISS, processing contexts INDIVIDUALLY.
    """
    context_encoder.eval()  # Ensure eval mode
    hard_negatives_by_index = {}
    context_embeddings = []

    with torch.no_grad():
        # 1. Generate embeddings *individually* for each context
        for _, row in df.iterrows():
            context = create_context_string(row)
            context_input = context_tokenizer(context, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
            #*** START of change.  Force to float32 type.
            #with torch.autocast(device_type=config.device.type, dtype=config.bf16, enabled=(config.device.type == 'cuda')):
            with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
                embedding = context_encoder(**context_input).pooler_output.detach().cpu().float().numpy()
            #*** END of change.
            # Check if embedding has any values before flattening and appending
            if embedding.size > 0:  # Or any other check that makes sense for your data/model
                context_embeddings.append(embedding.flatten())  # Flatten to 1D array
            else:
                print(f"WARNING: Empty embedding for row index {_}. Skipping this row for FAISS index.")

    # 2. Build FAISS index *after* getting all embeddings
    context_embeddings = np.array(context_embeddings)  # Convert list to NumPy array

    # Check if any context embeddings were generated
    if context_embeddings.size == 0:
        print("WARNING: No valid context embeddings were generated. Returning empty hard_negatives_by_index.")
        return hard_negatives_by_index  # Return empty dictionary if no embeddings

    index = faiss.IndexFlatL2(context_embeddings.shape[1])
    index.add(context_embeddings)

    # 3. Find hard negatives
    for i, row in df.iterrows():
        current_embedding = context_embeddings[i:i + 1] # No change here
        #Check if current_embedding is not empty before proceeding
        if current_embedding.size > 0:
          D, I = index.search(current_embedding, k=num_negatives + 1)
          similar_indices = [idx for idx in I.flatten() if idx != i][:num_negatives]
          hard_negatives_by_index[i] = similar_indices

          # --- DIAGNOSTIC PRINTS (Keep these!) ---
          print(f"--- Row {i}: Domain = {row['Domain Name']} ---")
          print(f"  Embedding (first 10): {current_embedding[0, :10]}")
          print(f"  Distances: {D}")
          print(f"  Indices (Original DF): {similar_indices}")  # Print original indices
          print("-" * 30)
        else:
            print(f"WARNING: Empty embedding for row index {i} during hard negatives search. Skipping this row.")

    context_encoder.train() # Switch back to training mode
    return hard_negatives_by_index


def prepare_triplets_with_negatives(df, hard_negatives_by_index=None, epoch=None):
    """
    Prepares triplets with a focus on *much* harder negatives and a warm-up strategy.
    Includes extensive diagnostic printing to understand the generated triplets.
    """
    triplets = []
    for idx, row in df.iterrows():
        question = f"What is the estimated value of {row['Domain Name']}?"
        positive_context = create_context_string(row)

        # --- 1. Hard Negatives from FAISS (if available) ---
        hard_negatives = []
        if hard_negatives_by_index is not None and idx in hard_negatives_by_index:
            for hard_negative_idx in hard_negatives_by_index[idx]:
                hard_negative = df.iloc[hard_negative_idx]
                hard_negatives.append(create_context_string(hard_negative))


        # --- 2. Stricter Fallback (if FAISS doesn't provide enough) ---
        if  epoch is not None and epoch < 2: # Warm up.
            random_negative = df.sample(1).iloc[0]
            random_negative_context = create_context_string(random_negative)
            # During warm-up epochs, use *only* a random negative.
            negatives = [random_negative_context]
            print(f"--- Triplet {idx} (WARM-UP EPOCH) ---")
            print(f"  Question: {question}")
            print(f"  Positive Context: {positive_context}")
            print(f"  Random Negative Context: {random_negative_context}")
            print("-" * 30)

        else: #After warming up.
            if not hard_negatives: # Only if we don't have FAISS negatives
                # 1. MUST be the same Category
                candidates = df[df['Category'] == row['Category']].copy()  # USE .copy()!

                # 2. MUCH Tighter Price Range (on the TRANSFORMED scale)
                candidates = candidates[
                    (candidates['Price'] >= row['Price'] - 0.02) & (candidates['Price'] <= row['Price'] + 0.02)
                ]

                # 3. Keyword Overlap (Stricter - at least 2 keywords, case-insensitive)
                if 'Keywords' in row and isinstance(row['Keywords'], str):
                    def keyword_overlap_count(x):
                        if isinstance(x, str):
                            return sum(1 for k in str(x).lower().split() if k in row['Keywords'].lower())
                        else:
                            return 0
                    candidates['overlap_count'] = candidates['Keywords'].apply(keyword_overlap_count)
                    candidates = candidates[candidates['overlap_count'] >= 2] # Require >= 2 keywords
                    candidates = candidates.drop(columns=['overlap_count'], errors='ignore')


                # 4.  Similar Monthly Searches (if available, and after other filters)
                if 'Monthly Searches' in df.columns and not candidates.empty:
                    candidates = candidates[
                        (candidates['Monthly Searches'] >= row['Monthly Searches'] - 0.05) &
                        (candidates['Monthly Searches'] <= row['Monthly Searches'] + 0.05)
                    ]
                # 5. Similar CPC (if available)
                if 'CPC (Exact)' in df.columns and not candidates.empty:
                  candidates = candidates[
                      (candidates['CPC (Exact)'] >= row['CPC (Exact)'] - 0.05) &
                      (candidates['CPC (Exact)'] <= row['CPC (Exact)'] + 0.05)
                  ]
                if 'CPC (Phrase)' in df.columns and not candidates.empty:
                    candidates = candidates[
                        (candidates['CPC (Phrase)'] >= row['CPC (Phrase)'] - 0.05) &
                        (candidates['CPC (Phrase)'] <= row['CPC (Phrase)'] + 0.05)
                    ]

                # NO LENGTH CHECK - it's a weak signal

                if len(candidates) > 1:  # We still have some candidates
                    hard_negative = candidates.sample(1).iloc[0]
                    hard_negatives.append(create_context_string(hard_negative))

                # We are not falling back to a random choice anymore

            # --- Diagnostic Prints (Keep these!) ---
            print(f"--- Triplet {idx} ---")
            print(f"  Question: {question}")
            print(f"  Positive Context: {positive_context}")

            random_negative = df.sample(1).iloc[0] #Still need random negative
            random_negative_context = create_context_string(random_negative)
            print(f"  Random Negative Context: {random_negative_context}")

            for i, hn in enumerate(hard_negatives):
                print(f"  Hard Negative {i + 1}: {hn}")
            print("-" * 30)

            negatives = [random_negative_context] + hard_negatives  # Combine for the dataset

        triplets.append({
            "question": question,
            "positive": positive_context,
            "random_negative": random_negative_context if 'random_negative_context' in locals() else "",
            "hard_negatives": hard_negatives
        })
    return triplets

In [None]:
# --- Dataset and DataLoader ---
class DomainDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        query = question_tokenizer(sample['question'], return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length)
        positive = context_tokenizer(sample['positive'], return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length)
        random_negative = context_tokenizer(sample['random_negative'], return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length)
        hard_negatives = [context_tokenizer(neg, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length) for neg in sample['hard_negatives']]

        # Move tensors to device and remove extra dimension
        return {
            'query': {k: v.squeeze(0).to(config.device) for k, v in query.items()},
            'positive': {k: v.squeeze(0).to(config.device) for k, v in positive.items()},
            'random_negative': {k: v.squeeze(0).to(config.device) for k, v in random_negative.items()},
            'hard_negatives': [{k: v.squeeze(0).to(config.device) for k, v in hard_negative.items()} for hard_negative in hard_negatives]
        }

def collate_fn(batch):
    queries = {key: torch.stack([d['query'][key] for d in batch]) for key in batch[0]['query'].keys()}
    positives = {key: torch.stack([d['positive'][key] for d in batch]) for key in batch[0]['positive'].keys()}
    random_negatives = {key: torch.stack([d['random_negative'][key] for d in batch]) for key in batch[0]['random_negative'].keys()}
    hard_negatives_list = []
    num_hard_negatives = min(len(d['hard_negatives']) for d in batch)
    for i in range(num_hard_negatives):
        hard_negatives_list.append({key: torch.stack([d['hard_negatives'][i][key] for d in batch]) for key in batch[0]['hard_negatives'][i].keys()})
    return queries, positives, random_negatives, hard_negatives_list

In [None]:
def train_step(query, positive, random_negative, hard_negatives, optimizer, criterion, scaler):
    optimizer.zero_grad()  # Clear gradients here
    # Force float32 for both forward and backward passes
    with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
        query_embedding = question_encoder(**query).pooler_output
        positive_embedding = context_encoder(**positive).pooler_output
        random_negative_embedding = context_encoder(**random_negative).pooler_output

        scores_list = [torch.matmul(query_embedding, positive_embedding.T).diag(),
                       torch.matmul(query_embedding, random_negative_embedding.T).diag()]

        for hard_negative in hard_negatives:
            hard_negative_embedding = context_encoder(**hard_negative).pooler_output
            scores_list.append(torch.matmul(query_embedding, hard_negative_embedding.T).diag())

        scores = torch.stack(scores_list, dim=1)

        batch_size = query_embedding.size(0)
        targets = torch.zeros(batch_size, dtype=torch.long, device=config.device)

        loss = criterion(scores, targets)

    # Continue with gradient scaling and optimization
    scaler.scale(loss).backward()  # Scale the loss
    scaler.unscale_(optimizer) # Unscale before clipping
    torch.nn.utils.clip_grad_norm_(question_encoder.parameters(), max_norm=config.max_grad_norm)
    torch.nn.utils.clip_grad_norm_(context_encoder.parameters(), max_norm=config.max_grad_norm)
    scaler.step(optimizer)  # Step with scaler
    scaler.update()  # Update scaler

    with torch.no_grad(): # Calculate MRR (no gradient needed)
        ranks = torch.argsort(scores, dim=1, descending=True)
        positive_indices = (ranks == 0).nonzero(as_tuple=True)
        if len(positive_indices[0]) > 0:
            positive_ranks = positive_indices[1] + 1
            mrr = torch.mean(1.0 / positive_ranks.float())
        else:
            mrr = torch.tensor(0.0, device=config.device)

    return loss.detach(), mrr.detach() # Return detached losses

In [None]:
def evaluate_model(dataloader, question_encoder, context_encoder):
    question_encoder.eval()
    context_encoder.eval()

    all_scores = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            query, positive, random_negative, hard_negatives = batch

            query_embedding = question_encoder(**query).pooler_output
            positive_embedding = context_encoder(**positive).pooler_output

            scores_list = [torch.matmul(query_embedding, positive_embedding.T).diag()]
            for negative in [random_negative] + hard_negatives:
                negative_embedding = context_encoder(**negative).pooler_output
                scores_list.append(torch.matmul(query_embedding, negative_embedding.T).diag())

            scores = torch.stack(scores_list, dim=1)

            # Corrected indentation:
            all_scores.extend(scores.cpu().float().numpy())  # Convert to float32 before numpy()
            all_labels.extend(torch.zeros(scores.shape[0], dtype=torch.int64).cpu().numpy())

    all_scores = np.array(all_scores)
    all_labels = np.array(all_labels)


    if np.any(all_labels):  # In DPR, we expect all labels to be 0 (positive is always first)
        ndcg = ndcg_score([all_labels], all_scores) # Wrap all_labels in a list
        map_score = average_precision_score(all_labels, all_scores)
    else:
        ndcg = 0.0
        map_score = 0.0

    ranks = np.argsort(all_scores, axis=1)[:, ::-1]
    positive_ranks = (ranks == 0).nonzero()[1] + 1 if np.any(ranks == 0) else np.array([]) # Correct way to get positive ranks
    mrr = np.mean(1.0 / positive_ranks) if len(positive_ranks) > 0 else 0.0

    question_encoder.train()  # Make sure to switch back to train mode
    context_encoder.train()
    return ndcg, map_score, mrr

In [None]:
# --- Training Loop ---
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

num_training_steps = (len(train_df) // config.batch_size // config.accumulation_steps) * config.epochs
optimizer = optim.AdamW(list(question_encoder.parameters()) + list(context_encoder.parameters()), lr=config.learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=num_training_steps)
criterion = nn.CrossEntropyLoss()
scaler = torch.amp.GradScaler(enabled=(config.device.type == 'cuda'))

writer = SummaryWriter(log_dir=config.tensorboard_log_dir)

best_val_loss = float('inf')
best_model_state_dict = None

for epoch in range(config.epochs):
    if epoch > 0:
        hard_negatives_by_index = generate_similarity_based_hard_negatives(train_df, context_encoder, context_tokenizer, num_negatives=config.num_negatives)
    else:
        hard_negatives_by_index = None

    train_triplets = prepare_triplets_with_negatives(train_df, hard_negatives_by_index)
    train_dataset = DomainDataset(train_triplets)
    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

    val_triplets = prepare_triplets_with_negatives(val_df)
    val_dataset = DomainDataset(val_triplets)
    val_dataloader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)


    for step, batch in enumerate(train_dataloader):
        loss, mrr = train_step(batch[0], batch[1], batch[2], batch[3], optimizer, criterion, scaler)

        if (step + 1) % config.accumulation_steps == 0:
            scheduler.step()
            optimizer.zero_grad()

        if step % config.log_every == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}, MRR: {mrr.item()}")
            writer.add_scalar("Loss/train", loss.item(), epoch * len(train_dataloader) + step)
            writer.add_scalar("MRR/train", mrr.item(), epoch * len(train_dataloader) + step)

    ndcg, map_score, val_mrr = evaluate_model(val_dataloader, question_encoder, context_encoder)
    val_loss = 1 - ndcg  # Using 1-NDCG as a proxy for validation loss

    print(f"Epoch {epoch+1}, Validation NDCG: {ndcg:.4f}, MAP: {map_score:.4f}, MRR: {val_mrr:.4f}")
    writer.add_scalar("NDCG/val", ndcg, epoch)
    writer.add_scalar("MAP/val", map_score, epoch)
    writer.add_scalar("MRR/val", val_mrr, epoch)


    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_state_dict = {
            'question_encoder': question_encoder.state_dict(),
            'context_encoder': context_encoder.state_dict()
        }
        torch.save(best_model_state_dict, config.output_model_path)
        logger.info(f"Best model saved at epoch {epoch+1}")

    elif epoch >= config.patience and (val_loss >= best_val_loss):  # Corrected early stopping
        print(f"Early stopping triggered. No improvement in validation loss for {config.patience} epochs.")
        break


# Load the best model after training
if best_model_state_dict is not None:
    best_model_state_dict = torch.load(config.output_model_path)
    question_encoder.load_state_dict(best_model_state_dict['question_encoder'])
    context_encoder.load_state_dict(best_model_state_dict['context_encoder'])
    logger.info("Best model loaded.")
else:
    print("Warning: No best model found. This can happen if training was interrupted.")

torch.cuda.empty_cache()
gc.collect()
writer.close()

print("Training complete.")

--- Triplet 22 ---
  Question: What is the estimated value of petsurance.com?
  Positive Context: Domain: petsurance.com, Price: 3, Date: 12/14/2024, Keywords: pet, Monthly Searches: 0.0000, CPC (Exact): 0.0000, CPC (Phrase): 0.0000, Length: 14, Hyphens: False, Category: Pets
  Random Negative Context: Domain: fiberx.com, Price: 3, Date: 12/29/2024, Keywords: fiber, Monthly Searches: 0.0000, CPC (Exact): 0.0000, CPC (Phrase): 0.0000, Length: 10, Hyphens: False, Category: Unknown
------------------------------
--- Triplet 15 ---
  Question: What is the estimated value of nutripure.org?
  Positive Context: Domain: nutripure.org, Price: 3, Date: 12/15/2024, Keywords: nut, Monthly Searches: 0.0000, CPC (Exact): 0.0000, CPC (Phrase): 0.0000, Length: 13, Hyphens: False, Category: Health
  Random Negative Context: Domain: xinchaotx.com, Price: 3, Date: 12/14/2024, Keywords: xi tx, Monthly Searches: 0.0000, CPC (Exact): 0.0000, CPC (Phrase): 0.0000, Length: 13, Hyphens: False, Category: Food
-

  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):


Epoch 0, Step 0, Loss: 0.0, MRR: 1.0
Epoch 0, Step 10, Loss: 0.0, MRR: 1.0
Epoch 0, Step 20, Loss: 0.0, MRR: 1.0
Epoch 0, Step 30, Loss: 0.0, MRR: 1.0
Epoch 0, Step 40, Loss: 0.0, MRR: 1.0
Epoch 0, Step 50, Loss: 0.0, MRR: 1.0
Epoch 1, Validation NDCG: 0.0000, MAP: 0.0000, MRR: 1.0000


  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):


--- Row 22: Domain = petsurance.com ---
  Embedding (first 10): [ 0.18439789  0.17120269  0.61836445 -0.00791608  0.15962535  0.03851032
  0.112573    0.3901983   0.24903856 -0.36181265]
  Distances: [[  0.        39.60899   54.505905 122.13373  122.64144  125.30623
  128.1232   129.75479  130.10667  130.52063  130.57005 ]]
  Indices (Original DF): [33, 48, 0, 74, 103, 89, 49, 84, 1, 21]
------------------------------
--- Row 15: Domain = nutripure.org ---
  Embedding (first 10): [ 0.18424769  0.5910749   0.77932364 -0.42743227 -0.00214254 -0.14079487
 -0.07187035 -0.25364745 -0.03223198  0.17256236]
  Distances: [[  0.        87.326065 102.94916  106.321266 111.84925  111.849365
  112.65808  114.251595 115.71075  116.37645  117.39933 ]]
  Indices (Original DF): [54, 76, 89, 70, 37, 28, 81, 6, 24, 111]
------------------------------
--- Row 65: Domain = koboldquarterly.com ---
  Embedding (first 10): [ 0.12213489 -0.26707542 -0.02789363 -0.18675141  0.40168795  0.16101234
  0.31493995 

  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):


Epoch 1, Step 0, Loss: 2.9802313861182483e-07, MRR: 1.0
Epoch 1, Step 10, Loss: 0.0, MRR: 1.0
Epoch 1, Step 20, Loss: 1.2278405847609974e-05, MRR: 1.0
Epoch 1, Step 30, Loss: 0.0, MRR: 1.0
Epoch 1, Step 40, Loss: 0.0, MRR: 1.0
Epoch 1, Step 50, Loss: 0.0007349221268668771, MRR: 1.0
Epoch 2, Validation NDCG: 0.0000, MAP: 0.0000, MRR: 1.0000


  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):


--- Row 22: Domain = petsurance.com ---
  Embedding (first 10): [ 0.02676875  0.4854876   0.47925332 -0.04571469 -0.02516865  0.18450153
 -0.06809441  0.1410061   0.41328648 -0.33242813]
  Distances: [[ 0.       26.681458 26.779377 90.4574   93.09235  93.76509  95.86573
  96.08736  97.52202  97.92368  98.58912 ]]
  Indices (Original DF): [48, 33, 1, 21, 81, 56, 49, 93, 0, 104]
------------------------------
--- Row 15: Domain = nutripure.org ---
  Embedding (first 10): [-0.17631288  0.73733866  0.65564424 -0.27892688 -0.17077155  0.06511084
  0.03456419 -0.20876072  0.06429633  0.05886963]
  Distances: [[ 0.       70.99334  79.143036 79.9768   80.08061  82.51124  85.29201
  85.31348  86.38594  87.21451  87.29019 ]]
  Indices (Original DF): [54, 108, 70, 7, 118, 81, 76, 28, 89, 77]
------------------------------
--- Row 65: Domain = koboldquarterly.com ---
  Embedding (first 10): [ 0.10818629  0.00227354  0.02475549 -0.12210399  0.2025458   0.43078262
  0.07587289  0.86684155 -0.2923453

  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):


Epoch 2, Step 0, Loss: 0.0, MRR: 1.0
Epoch 2, Step 10, Loss: 2.270885306643322e-05, MRR: 1.0
Epoch 2, Step 20, Loss: 5.9604641222676946e-08, MRR: 1.0
Epoch 2, Step 30, Loss: 0.0, MRR: 1.0
Epoch 2, Step 40, Loss: 0.00012479665747378021, MRR: 1.0
Epoch 2, Step 50, Loss: 4.09251594543457, MRR: 0.75
Epoch 3, Validation NDCG: 0.0000, MAP: 0.0000, MRR: 0.9833
Early stopping triggered. No improvement in validation loss for 2 epochs.


  best_model_state_dict = torch.load(config.output_model_path)


Training complete.


In [None]:
# --- Data Normalization ---
price_scaler = MinMaxScaler()  # Scaler for Price (log-transformed)
other_scaler = MinMaxScaler() # Scaler for the other numeric columns.
for col in ['Price', 'Monthly Searches', 'CPC (Exact)', 'CPC (Phrase)']:
    if col in df.columns:
        if col == 'Price':
            df['Price'] = np.log1p(df['Price'])  # Log Transform Price
            df[col] = price_scaler.fit_transform(df[[col]]) # Then scale
        else:
            # Use the other_scaler for other columns
            df[col] = other_scaler.fit_transform(df[[col]])
    else:
        logger.warning(f"Column {col} not found in dataframe. Skipping normalization.")


In [None]:
from transformers import LogitsProcessor, LogitsProcessorList
import torch

In [None]:
# --- FLAN-T5 Initialization ---
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(config.device)

# --- Build Domain Appraisal Dataset ---
def build_domain_appraisal_dataset(df, question_encoder, context_encoder, question_tokenizer, context_tokenizer, num_contexts=3):
    """Builds a dataset for domain appraisal using the DPR model."""
    data = []
    question_encoder.eval()
    context_encoder.eval()
    for _, row in df.iterrows():
        domain_name = row['Domain Name']
        question = f"What is the estimated value of {domain_name}?"

        # DPR Retrieval
        query_input = question_tokenizer(question, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
        with torch.no_grad(), torch.autocast(device_type=config.device.type, dtype=config.bf16, enabled=(config.device.type == 'cuda')):
            query_embedding = question_encoder(**query_input).pooler_output

        # --- Corrected Context Embedding Retrieval ---
        contexts = [create_context_string(r) for _, r in df.iterrows()]
        similarities = []
        for context in contexts:  # Iterate through contexts *individually*
            context_input = context_tokenizer(context, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
            with torch.no_grad(), torch.autocast(device_type=config.device.type, dtype=config.bf16, enabled=(config.device.type == 'cuda')):
                context_embedding = context_encoder(**context_input).pooler_output
            similarity = torch.matmul(query_embedding, context_embedding.T).squeeze()
            similarities.append(similarity.item())  # Store as a Python float

        # Get top indices (no longer a tensor operation)
        similarities = np.array(similarities) #Convert to numpy
        top_indices = np.argsort(similarities)[::-1][:num_contexts] #Numpy argsort
        retrieved_contexts = [contexts[i] for i in top_indices]

        appraisal = row['Price']

        data.append({
            "domain_name": domain_name,
            "context": " ".join(retrieved_contexts),
            "appraisal": appraisal
        })
    question_encoder.train()
    context_encoder.train()
    return data

# --- Create Domain Appraisal Dataset ---
appraisal_data = build_domain_appraisal_dataset(df, question_encoder, context_encoder, question_tokenizer, context_tokenizer)

# --- Split data into training and validation sets ---
train_data, eval_data = train_test_split(appraisal_data, test_size=0.2, random_state=42)

# --- Helper Function for Training/Eval Prompt ---
def prepare_train_eval_prompt(domain_name, context):
    return f"Domain: {domain_name}\nContext: {context}\nAppraisal (output a single number only):"

# --- Prepare Data for FLAN-T5 (Training) ---
train_texts = [prepare_train_eval_prompt(d['domain_name'], d['context']) for d in train_data]
train_labels = [str(d['appraisal']) for d in train_data]

# --- Tokenize data (Training) ---
train_encodings = t5_tokenizer(train_texts, truncation=True, padding=True, max_length=512)
train_labels_encodings = t5_tokenizer(train_labels, truncation=True, padding=True, max_length=64)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import re

In [None]:
# --- Create PyTorch dataset ---
class AppraisalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Directly use the tokenized labels
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

# Use the tokenized labels directly
train_dataset = AppraisalDataset(train_encodings, train_labels_encodings)


# --- Prepare Data for FLAN-T5 (Evaluation) ---
eval_texts = [prepare_train_eval_prompt(d['domain_name'], d['context']) for d in eval_data]
eval_labels = [str(d['appraisal']) for d in eval_data]

# --- Tokenize data (Evaluation) ---
eval_encodings = t5_tokenizer(eval_texts, truncation=True, padding=True, max_length=512)
eval_labels_encodings = t5_tokenizer(eval_labels, truncation=True, padding=True, max_length=64)

# --- Create PyTorch dataset (Evaluation) ---
eval_dataset = AppraisalDataset(eval_encodings, eval_labels_encodings)

# --- Training Arguments ---
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,  # Or more!
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    report_to="tensorboard",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_rmse",  # Use RMSE for model selection
    greater_is_better=False,          # Lower RMSE is better
    fp16=False,
    bf16= config.device.type == 'cuda' and torch.cuda.is_bf16_supported(),
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    logging_dir='./logs',
    logging_steps=10,
    seed=42,
    skip_memory_metrics=True,
)

# --- Compute Metrics ---
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_metrics(pred):
    """Computes evaluation metrics (including RMSE)."""
    labels = pred.label_ids
    preds = pred.predictions[0]

    predicted_token_ids = preds.argmax(-1)

    # Filter out token IDs outside of the tokenizer's vocabulary
    predicted_token_ids = [
        [token_id for token_id in seq if token_id < t5_tokenizer.vocab_size]
        for seq in predicted_token_ids
    ]

    decoded_preds = t5_tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
    decoded_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)

    float_preds = []
    float_labels = []
    for p, l in zip(decoded_preds, decoded_labels):
        try:
            # Stricter Regex: Only match valid floats/ints, including scientific notation
            p_match = re.search(r"^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$", p)  # ^ and $ for whole string
            l_match = re.search(r"^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$", l)  # Stricter regex on labels too.

            if p_match and l_match:  # Check BOTH predictions and labels
                float_preds.append(float(p_match.group(0)))
                float_labels.append(float(l_match.group(0)))
            else:
                print(f"Warning: Could not extract float from '{p}' or '{l}'. Skipping.")
                continue  # Skip this example

        except (ValueError, TypeError) as e:
            print(f"Warning: Could not convert '{p}' or '{l}' to float. Error: {e}. Skipping.")
            continue

    if not float_preds:
        print("Warning: No valid predictions after filtering. Returning default metrics.")
        return {"eval_mse": 0.0, "eval_mae": 0.0, "eval_rmse": 0.0, "eval_r2": -1e5}  # Return R^2

    float_preds = np.array(float_preds).reshape(-1, 1)
    float_labels = np.array(float_labels).reshape(-1, 1)

    # Clip *before* inverse transform (on the normalized scale)
    float_preds = np.clip(float_preds, 0.0, 1.0)
    float_labels = np.clip(float_labels, 0.0, 1.0)

    # Inverse transform (log and min-max)
    original_scale_preds = price_scaler.inverse_transform(float_preds)
    original_scale_labels = price_scaler.inverse_transform(float_labels)
    original_scale_preds = np.expm1(original_scale_preds)
    original_scale_labels = np.expm1(original_scale_labels)

    # Ensure no negative values after inverse transform AND clip large values
    original_scale_preds = np.maximum(0, original_scale_preds)
    original_scale_labels = np.maximum(0, original_scale_labels)
    original_scale_preds = np.clip(original_scale_preds, a_min=0, a_max=1e9)  # Clip large values
    original_scale_labels = np.clip(original_scale_labels, a_min=0, a_max=1e9)

    # Use scikit-learn for metrics - CORRECTED, FINAL TIME!
    mse = mean_squared_error(original_scale_labels, original_scale_preds)  # No evaluate.load
    mae = mean_absolute_error(original_scale_labels, original_scale_preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(original_scale_labels, original_scale_preds)

    return {"eval_mse": mse, "eval_mae": mae, "eval_rmse": rmse, "eval_r2":r2} # Corrected Key



In [None]:
# --- Constrained Decoding (Logits Processor) ---
class NumberLogitsProcessor(LogitsProcessor):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Get the token IDs for digits, decimal, +, -, e, and EOS
        self.allowed_token_ids = []
        for i in range(10):
            self.allowed_token_ids.extend(self.tokenizer(str(i), add_special_tokens=False).input_ids)

        self.allowed_token_ids.extend(self.tokenizer(".", add_special_tokens=False).input_ids)
        self.allowed_token_ids.extend(self.tokenizer("-", add_special_tokens=False).input_ids)
        self.allowed_token_ids.extend(self.tokenizer("+", add_special_tokens=False).input_ids)
        self.allowed_token_ids.extend(self.tokenizer("e", add_special_tokens=False).input_ids)
        self.allowed_token_ids.extend(self.tokenizer("E", add_special_tokens=False).input_ids)
        self.eos_token_id = self.tokenizer.eos_token_id
        if self.eos_token_id is not None:
            self.allowed_token_ids.append(self.eos_token_id)
        self.allowed_token_ids = list(set(self.allowed_token_ids)) # Remove duplicates.
        # Create a tensor of allowed token IDs on the correct device
        self.allowed_tokens_tensor = torch.tensor(self.allowed_token_ids, device=config.device)


    def __call__(self, input_ids, scores):
      # Create a mask where True indicates a disallowed token
      bad_tokens_mask = torch.ones(scores.shape, dtype=torch.bool, device=scores.device)

      # Set allowed tokens to False in the mask
      for token_id in self.allowed_token_ids:
          bad_tokens_mask[:, token_id] = False  # Correct masking

      # Set scores for disallowed tokens to -inf
      scores = scores.masked_fill(bad_tokens_mask, -float("inf"))
      return scores

# Create the LogitsProcessor
logits_processor = LogitsProcessorList([NumberLogitsProcessor(t5_tokenizer)])

# --- Custom Trainer (to use logits_processor during evaluation) ---
from transformers import Trainer
import torch.nn as nn

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        self.logits_processor = kwargs.pop("logits_processor", None)  # Get logits_processor
        super().__init__(*args, **kwargs)

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """
        Override the evaluate method to use our custom compute_metrics.
        """
        eval_dataloader = self.get_eval_dataloader(eval_dataset)

        # Use prediction_loop, but DO NOT pass logits_processor here
        output = self.prediction_loop(
            eval_dataloader,
            description="Evaluation",
            prediction_loss_only=False,  # We need predictions
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
            # logits_processor=logits_processor,  # REMOVE THIS LINE
        )

        # Call our custom compute_metrics function
        metrics = self.compute_metrics(output)

        # Add the eval_loss to the metrics.
        metrics["eval_loss"] = output.metrics["eval_loss"]

        self.log(metrics)
        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
        return metrics

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        How the loss is computed by Trainer.  Override to use CrossEntropy.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs, labels=labels) #CRITICAL
        loss = outputs.loss # Get loss from the outputs
        return (loss, outputs) if return_outputs else loss

    def predict(self, test_dataset, ignore_keys=None, metric_key_prefix="test"):
        """
        Override predict to use logits_processor during generation.
        """
        test_dataloader = self.get_test_dataloader(test_dataset)  # Use test dataloader

        # Call prediction_loop with prediction_loss_only=True to get the loss
        output = self.prediction_loop(
            test_dataloader,
            description="Prediction",
            prediction_loss_only=True,  # Only get the loss
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        # Now, generate predictions with constrained decoding
        with torch.no_grad():
            generated_predictions = []
            for batch in test_dataloader:
                inputs = self._prepare_inputs(batch)  # Prepare inputs
                labels = inputs.pop("labels",None)
                generated_tokens = self.model.generate(
                    **inputs,
                    max_new_tokens=64,  # Or your desired max length
                    logits_processor=self.logits_processor,  # Use logits_processor here
                )
                generated_predictions.append(generated_tokens)

            # Convert to numpy arrays.  Pad if necessary
            max_length = max(x.shape[1] for x in generated_predictions)

            # Convert list of tensors to a single tensor, padding as needed
            padded_predictions = []
            for x in generated_predictions:
                padding_size = max_length - x.shape[1]
                padding = torch.full((x.shape[0], padding_size), self.tokenizer.pad_token_id, device=x.device)
                padded_tensor = torch.cat([x, padding], dim=1)  # Concatenate along sequence dimension
                padded_predictions.append(padded_tensor)

            predictions_tensor = torch.cat(padded_predictions, dim=0)


            if labels is not None:
              labels = self._pad_across_processes(labels) # Pad
              labels = self._nested_gather(labels)
              labels = labels.cpu().numpy()
            else:
              labels = None


        return (output.metrics["test_loss"], predictions_tensor.cpu().numpy(), labels)

# --- Fine-tuning with CustomTrainer ---
# Create the LogitsProcessor
logits_processor = LogitsProcessorList([NumberLogitsProcessor(t5_tokenizer)])

trainer = CustomTrainer(
    model=t5_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    logits_processor=logits_processor,  # Pass it to the CustomTrainer
)

trainer.train()
trainer.save_model("./fine_tuned_flan_t5")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Mse,Mae,Rmse,R2
1,10.4993,7.420366,0.249414,0.482447,0.499414,-16.327134
2,8.6331,6.716347,0.260656,0.485929,0.510545,-16.436541
3,7.9309,6.225017,0.232892,0.452581,0.482589,-15.736228
4,6.772,5.782995,0.159451,0.326187,0.399313,-10.088737
5,6.8618,5.101319,0.130596,0.248729,0.36138,-5.263808
6,6.0758,4.483907,0.147968,0.335002,0.384666,-9.2294
7,5.5782,4.412287,0.177078,0.372019,0.420806,-10.055709
8,5.3806,4.353143,0.193256,0.388952,0.439608,-13.481035
9,5.1697,4.30428,0.172414,0.354474,0.415227,-11.364535
10,5.152,4.287903,0.175373,0.357633,0.418775,-11.576712


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.








































There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


In [None]:
from pytrends.request import TrendReq  # Import pytrends
import time
import random

In [None]:
# --- Keyword Extraction (Example - Refine This!) ---
def extract_keywords(domain_name, existing_keywords):
    """Extracts relevant keywords from the domain name and existing keywords."""
    keywords = []
    # Basic cleaning: Remove hyphens, split into words
    parts = domain_name.replace('-', ' ').replace('.com', '').replace('.net', '').replace('.org','').split()
    keywords.extend(parts)

    if isinstance(existing_keywords, str):
        keywords.extend(existing_keywords.split())  # Add existing keywords

    # Remove duplicates and very short keywords
    keywords = list(set(keyword.lower() for keyword in keywords if len(keyword) > 2))
    # Add a limit.  IMPORTANT!
    return keywords[:3]  # Limit to, at most, the first 3 keywords

# --- In-memory cache for Google Trends data ---
trends_cache = {}

# --- Google Trends Query Function ---
def get_google_trends_data(keywords, geo='', timeframe='today 5-y'):
    """
    Retrieves Google Trends data for a list of keywords, with caching and retries.
    """
    # Create a cache key. Sort keywords so order doesn't matter.
    keyword_key = tuple(sorted(keywords))
    cache_key = (keyword_key, geo, timeframe)

    if cache_key in trends_cache:
        print("Using cached Trends data.")
        return trends_cache[cache_key]

    pytrends = TrendReq(hl='en-US', tz=360)  # Initialize pytrends

    # Handle rate limits with retries and timeouts
    retries = 5  # Increased retries
    for attempt in range(retries):
        try:
            pytrends.build_payload(keywords, cat=0, timeframe=timeframe, geo=geo, gprop='')
            data = pytrends.interest_over_time()
            if 'isPartial' in data.columns:
              data.drop(columns='isPartial', inplace=True)
            # Cache the result
            trends_cache[cache_key] = data
            return data
        except Exception as e:
            print(f"Error retrieving Trends data (attempt {attempt+1}/{retries}): {e}")
            # Wait before retrying.  Increase delay each retry.
            time.sleep(random.uniform(30, 60) + 10 * attempt)

    print("Failed to retrieve Trends data after multiple retries.")
    return None  # Return None if all retries fail


# --- Modified Context Creation ---
def create_context_string(row, trends_data=None):
    def safe_float(value):
       try:
          return f"{float(value):.4f}" if value is not None else "N/A"
       except (ValueError, TypeError):
          return "N/A"

    price_bins = np.digitize(row['Price'], bins=[0, 0.25, 0.5, 0.75, 1]) if 'Price' in row else "N/A"
    # Use the safe_float function for numeric columns:
    monthly_searches = safe_float(row.get('Monthly Searches'))
    cpc_exact = safe_float(row.get('CPC (Exact)'))
    cpc_phrase = safe_float(row.get('CPC (Phrase)'))

    context_parts = [
      f"Domain: {row.get('Domain Name', 'N/A')}",
      f"Price: {price_bins}", #Price Bins will always format
      f"Date: {row.get('Date', 'N/A')}",
      f"Keywords: {row.get('Keywords', 'N/A')}",
      f"Monthly Searches: {monthly_searches}",
      f"CPC (Exact): {cpc_exact}",
      f"CPC (Phrase): {cpc_phrase}",
      f"Length: {row.get('Length', 'N/A')}",
      f"Hyphens: {not row.get('Excludes Hyphens', True)}",
      f"Category: {row.get('Category', 'N/A')}"
    ]
    if trends_data is not None and not trends_data.empty:
        avg_interest = trends_data.mean().to_dict()
        for keyword, interest in avg_interest.items():
            context_parts.append(f"Trends ({keyword}): {interest:.2f}") #Format to 2 decimal places.
    return ", ".join(context_parts)

In [None]:
def generate_answer(question, context, domain_data, constrain_number=False): #add constrain_number
    """Generates an answer using FLAN-T5, with explicit factor instructions."""

    # Construct the prompt, including data for the specific domain
    prompt = f"""You are an expert in domain appraisal.  Provide a detailed appraisal of the domain name,
including a numerical estimate on a new line, followed by your justification.

Consider these factors:

1.  Domain: {domain_data.get('Domain Name', 'N/A')}
2.  Category: {domain_data.get('Category', 'N/A')}
3.  Keywords: {domain_data.get('Keywords', 'N/A')}
4.  Length: {domain_data.get('Length', 'N/A')}
5.  Hyphens: {not domain_data.get('Excludes Hyphens', True)}  # Inverted for clarity
6.  Monthly Searches (Log Scale): {domain_data.get('Monthly Searches', 'N/A')}
7.  CPC (Exact Match) (Normalized): {domain_data.get('CPC (Exact)', 'N/A')}
8.  CPC (Phrase Match) (Normalized): {domain_data.get('CPC (Phrase)', 'N/A')}

Context (information about similar domains):
{context}

Question: {question}

Appraisal:
"""

    inputs = t5_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(config.device)

    with torch.no_grad():
        if constrain_number: #use constrain_number
          outputs = t5_model.generate(
              **inputs,
              max_new_tokens=64,  # Shorter length for just the number
              logits_processor=logits_processor, # Use constrained decoding
              num_beams=1,  # For constrained decoding, beam search isn't necessary
          )
        else:
          outputs = t5_model.generate(
              **inputs,
              max_new_tokens=256,  # Longer length for explanation
              num_beams=5,
              early_stopping=True,
              temperature=0.7,
              top_k=50,
              top_p=0.95,
              no_repeat_ngram_size=2,
          )
        answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return answer

def generate_appraisal(domain_name, df, question_encoder, context_encoder, question_tokenizer, context_tokenizer, t5_model, t5_tokenizer, price_scaler, num_contexts=3):

    # --- 1. DPR Retrieval --- (Same as before)
    question_encoder.eval()  # Ensure eval mode
    context_encoder.eval()

    question = f"What is the estimated value of {domain_name}?"
    query_input = question_tokenizer(question, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)

    with torch.no_grad(), torch.autocast(device_type=config.device.type, dtype=config.bf16, enabled=(config.device.type == 'cuda')):
        query_embedding = question_encoder(**query_input).pooler_output

    contexts = [create_context_string(r) for _, r in df.iterrows()]
    retrieved_contexts = []  # Initialize before the loop
    similarities = []
    for context in contexts:
        context_input = context_tokenizer(context, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
        with torch.no_grad(), torch.autocast(device_type=config.device.type, dtype=config.bf16, enabled=(config.device.type == 'cuda')):
            context_embedding = context_encoder(**context_input).pooler_output
        similarity = torch.matmul(query_embedding, context_embedding.T).squeeze()
        similarities.append(similarity.item())

    similarities = np.array(similarities)
    top_indices = np.argsort(similarities)[::-1][:num_contexts]
    retrieved_contexts = [contexts[i] for i in top_indices]

    # --- 2. Prepare Data for Prompt (Handles Missing Data) ---

    # Extract keywords from the domain name itself
    keywords = extract_keywords(domain_name, "")  # Pass empty string for existing keywords
    keywords_str = ", ".join(keywords) if keywords else "N/A"

    # Get Google Trends data (even for new domains)
    trends_data = get_google_trends_data(keywords)

    # Create a dictionary with DEFAULT values.  We'll *only* provide
    # the information we can reasonably get for a *new* domain.
    domain_data = {
        'Domain Name': domain_name,
        'Category': 'N/A',  # We don't know the category
        'Keywords': keywords_str,
        'Length': len(domain_name),
        'Excludes Hyphens': True,  # Assume no hyphens unless we know otherwise
        'Monthly Searches': 'N/A',  # We can't know this without the original data source
        'CPC (Exact)': 'N/A',      # We can't know this without the original data source
        'CPC (Phrase)': 'N/A',     # We can't know this without the original data source
    }


    # --- 3. Flan-T5 Generation (Two-Step Process) ---
    context_string = " ".join(retrieved_contexts)
    # Add trends data to the context if it exists
    if trends_data is not None and not trends_data.empty:
        context_string = create_context_string(domain_data, trends_data=trends_data)

    # First, get the numerical appraisal (constrained decoding)
    appraisal_number = generate_answer(question, context_string, domain_data, constrain_number=True)


    # Now, generate the full answer with explanation (unconstrained)
    full_answer = generate_answer(question, context_string, domain_data, constrain_number=False)

    # --- 4. Post-Processing ---
    try:
        # Use regex to find the *first* numerical value in the answer.
        match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", appraisal_number) #Search number in appraisal number
        if match:
            numerical_appraisal_str = match.group(0)  # Get matched string
            numerical_appraisal = float(numerical_appraisal_str)


            # Inverse transform the numerical appraisal
            numerical_appraisal = price_scaler.inverse_transform(np.array([[numerical_appraisal]]))[0][0]  # Reshape for scaler
            numerical_appraisal = np.expm1(numerical_appraisal)
            numerical_appraisal = max(0, numerical_appraisal)  # Ensure non-negative

            # Find the start of the explanation (after the number)
            explanation_start = match.end()
            # Use full_answer instead of answer
            explanation = full_answer[explanation_start:].strip()


            return f"Appraisal: {numerical_appraisal:.2f}\n\nExplanation:\n{explanation}"
        else: #If no number extracted.
            return f"Could not extract a numerical appraisal. Flan-T5 output:\n{full_answer}"

    except (ValueError, TypeError) as e:
        # Use full_answer instead of answer
        return f"Error during appraisal generation: {str(e)}\nFlan-T5 Output:\n{full_answer}"

In [None]:
# --- Load the best DPR model ---
best_dpr_state_dict = torch.load(config.output_model_path)
question_encoder.load_state_dict(best_dpr_state_dict['question_encoder'])
context_encoder.load_state_dict(best_dpr_state_dict['context_encoder'])

# --- Load the fine-tuned Flan-T5 model ---
t5_model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_flan_t5").to(config.device) # Make sure the path is correct

# Get user input
domain_to_appraise = input("Enter the domain name to appraise: ")

# Generate the appraisal
appraisal = generate_appraisal(
    domain_to_appraise,
    df,  # Your *preprocessed* DataFrame
    question_encoder,
    context_encoder,
    question_tokenizer,
    context_tokenizer,
    t5_model,
    t5_tokenizer,
    price_scaler,
    num_contexts=3
)

print(appraisal)

  best_dpr_state_dict = torch.load(config.output_model_path)


Enter the domain name to appraise: cookies.com
Using cached Trends data.




Appraisal: 0.00

Explanation:
A
