<a href="https://colab.research.google.com/github/angelaxli/DomainRAG/blob/main/DomainRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Project: Domain Name Appraisal using a Retrieval-Augmented Generation Pipeline**

Usage:
1. Download the historical sales dataset (https://docs.google.com/spreadsheets/d/1SmEplaRY2-a6xt_fAyK1vrtANuZHTqZnXlt5bpmEdHs/edit?gid=0#gid=0).
  - The name of your file should be "Knowledge Base Real (11).csv".
2. Then, you can click Runtime in the menu above and press Run all.
3. At the last cell, you should be prompted to input a domain name and the category of the name.
  - The category should be one of the listed or simply input Unknown.

Note: Still undergoing improvements

In [1]:
!pip install pandas faiss-cpu numpy pytrends transformers datasets torch scikit-learn evaluate wordsegment newsapi-python nltk

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvr

In [2]:
import torch
import evaluate
import faiss
import gc
import pandas as pd
import numpy as np
from transformers import (
    DPRQuestionEncoder,
    DPRContextEncoder,
    DPRQuestionEncoderTokenizer,
    DPRContextEncoderTokenizer,
    get_linear_schedule_with_warmup,
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from torch.utils.tensorboard import SummaryWriter
import logging
import os
from scipy.stats import mstats  # Import for winsorizing
from transformers import Trainer  # Import the base Trainer class
import torch.nn as nn
import re
import wordsegment
from datetime import datetime, timedelta
from newsapi.newsapi_exception import NewsAPIException
import json
import nltk
from nltk.tokenize import word_tokenize
import time
from transformers import AdamW
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import ndcg_score, average_precision_score
import logging
import gc #Garbage collection
import xgboost as xgb
from newsapi import NewsApiClient

In [3]:
# --- NLTK Setup (Corrected) ---
try:
    nltk.data.find('punkt')
except LookupError:
    nltk.download('punkt')  # Download the complete 'punkt' resource

wordsegment.load()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [17]:
# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration (Keep your Config class)
class Config:
    csv_path = "/content/Knowledge Base Real (11).csv"  # Your CSV
    question_encoder_model = "facebook/dpr-question_encoder-single-nq-base"
    context_encoder_model = "facebook/dpr-ctx_encoder-single-nq-base"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    max_length = 64  # For DPR
    output_model_path = "best_dpr_model.pth"  # DPR model path (for *loading*)
    faiss_index_path = "dpr_faiss.index"  # FAISS index path (for *loading*)
    news_api_key = "YOUR_NEWSAPI_ORG_API_KEY"   # REPLACE WITH YOUR KEY!
    flan_t5_model = "google/flan-t5-base" # Or "google/flan-t5-small", etc.
    days_past = 14
    xgb_model_path = "best_xgboost_model.json" # Path for XGBoost Model
    tensorboard_log_dir = "runs/dpr_training"

    # --- DPR Training Parameters (added) ---
    batch_size = 8       # Example batch size - adjust as needed!
    epochs = 10          # Example number of epochs
    patience = 2        # Early stopping patience
    accumulation_steps = 4 # Gradient accumulation steps
    learning_rate = 1e-5 # Example learning rate
    warmup_steps = 50    # Warmup steps for learning rate scheduler
    max_grad_norm = 1.0   # Gradient clipping
    log_every = 10     # Log training progress every N steps
    num_negatives = 5 # The amount of hard negatives we want.

config = Config()

# --- Data Preprocessing (Keep your preprocessing) ---
try:
    df = pd.read_csv(config.csv_path)
    df = df.dropna()
    df = df.drop_duplicates()

    # Convert 'Date' to datetime objects (CRITICAL for correct trend calculation)
    try:
        df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
    except ValueError:
        try:
            df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')  # Handle other formats if needed
        except ValueError:
            logger.error("Could not parse 'Date' column.  Please check the format in your CSV.")
            exit(1)

    # Keep original price BEFORE transformations
    df['original_price'] = df['Price']

    # Winsorize and Log Transform Price (for DPR training data, keep original)
    df['Price'] = mstats.winsorize(df['Price'], limits=[0.01, 0.01])
    df['Price'] = np.log1p(df['Price']) # Keep for embeddings.

    # Normalize price (and other numeric columns if you use them in the context)
    price_scaler = MinMaxScaler()
    df['Price'] = price_scaler.fit_transform(df[['Price']])

     # Normalize 'Monthly Searches' (and other numeric columns if you use them in DPR context)
    if 'Monthly Searches' in df.columns:
        other_scaler = MinMaxScaler() # Use different scalers
        df['Monthly Searches'] = other_scaler.fit_transform(df[['Monthly Searches']])
    if 'CPC (Exact)' in df.columns:
        other_scaler = MinMaxScaler()  # Reinitialize or use separate scalers for each
        df['CPC (Exact)'] = other_scaler.fit_transform(df[['CPC (Exact)']])
    if 'CPC (Phrase)' in df.columns:
        other_scaler = MinMaxScaler()
        df['CPC (Phrase)'] = other_scaler.fit_transform(df[['CPC (Phrase)']])


except FileNotFoundError:
    logger.error(f"Error: CSV file not found at {config.csv_path}")
    exit(1)

In [21]:
# --- Model and Tokenizer Initialization (Keep this) ---
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(config.question_encoder_model)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(config.context_encoder_model)

question_encoder = DPRQuestionEncoder.from_pretrained(config.question_encoder_model).to(config.device)
context_encoder = DPRContextEncoder.from_pretrained(config.context_encoder_model).to(config.device)

# --- Keyword Preprocessing (Keep your function) ---
def preprocess_keywords(keywords):
    if isinstance(keywords, str):
        keywords = keywords.lower()
        tokens = context_tokenizer.tokenize(keywords)
        tokens = [token for token in tokens if token.isalnum() and token not in context_tokenizer.all_special_tokens]
        return context_tokenizer.convert_tokens_to_string(tokens)
    else:
        return ""

df['Keywords'] = df['Keywords'].apply(preprocess_keywords)

# --- Context Creation Function (Keep your function) ---
def create_context_string_train(row):
    """Creates the context string for DPR *TRAINING*, including trend_score."""
    context_parts = [
        f"Domain: {row.get('Domain Name', 'N/A')}",
        f"Price: {row.get('Price', 'N/A')}",  # ORIGINAL price
        f"Keywords: {row.get('Keywords', 'N/A')}",
        f"Monthly Searches: {row.get('Monthly Searches', 'N/A')}",
        f"CPC (Exact): {row.get('CPC (Exact)', 'N/A')}",
        f"CPC (Phrase): {row.get('CPC (Phrase)', 'N/A')}",
        f"Length: {row.get('Length', 'N/A')}",
        f"Hyphens: {not row.get('Excludes Hyphens', True)}",
        f"Category: {row.get('Category', 'N/A')}",
        f"Trend Score: {row.get('Trend Score', 'N/A')}"  # Include trend_score
    ]
    return ", ".join(context_parts)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequence

In [6]:
def generate_similarity_based_hard_negatives(df, context_encoder, context_tokenizer, num_negatives=3):
    """
    Generates hard negatives using FAISS.
    """
    context_encoder.eval()  # Ensure eval mode
    hard_negatives_by_index = {}
    context_embeddings = []

    with torch.no_grad():
        # 1. Generate embeddings *individually* for each context
        for _, row in df.iterrows():
            context = create_context_string_train(row)  # Use the training version
            context_input = context_tokenizer(context, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
            with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
                embedding = context_encoder(**context_input).pooler_output.detach().cpu().float().numpy()
            if embedding.size > 0:
                context_embeddings.append(embedding.flatten())
            else:
                logger.warning(f"Empty embedding for row index. Skipping.")

    # 2. Build FAISS index *after* getting all embeddings
    context_embeddings = np.array(context_embeddings)

    if context_embeddings.size == 0:
        logger.warning("No valid context embeddings. Returning empty hard_negatives_by_index.")
        return hard_negatives_by_index

    index = faiss.IndexFlatL2(context_embeddings.shape[1])
    index.add(context_embeddings)

    # 3. Find hard negatives
    for i, row in df.iterrows():
        current_embedding = context_embeddings[i:i + 1]
        if current_embedding.size > 0:
          D, I = index.search(current_embedding, k=num_negatives + 1)
          similar_indices = [idx for idx in I.flatten() if idx != i][:num_negatives]
          hard_negatives_by_index[i] = similar_indices

    context_encoder.train() # Switch back to training mode
    return hard_negatives_by_index


def prepare_triplets_with_negatives(df, hard_negatives_by_index=None, epoch=None):
    """
    Prepares triplets with a focus on *much* harder negatives and a warm-up strategy.
    Includes extensive diagnostic printing to understand the generated triplets.
    """
    triplets = []
    for idx, row in df.iterrows():
        question = f"What is the estimated value of {row['Domain Name']}?"
        positive_context = create_context_string_train(row)  # Use training context

        # --- 1. Hard Negatives from FAISS (if available) ---
        hard_negatives = []
        if hard_negatives_by_index is not None and idx in hard_negatives_by_index:
            for hard_negative_idx in hard_negatives_by_index[idx]:
                hard_negative = df.iloc[hard_negative_idx]
                hard_negatives.append(create_context_string_train(hard_negative))  # Training context


        # --- 2. Stricter Fallback (if FAISS doesn't provide enough) ---
        # if  epoch is not None and epoch < 2: # Warm up. #Removed warm up
        #     random_negative = df.sample(1).iloc[0]
        #     random_negative_context = create_context_string_train(random_negative) # Use training context
        #     # During warm-up epochs, use *only* a random negative.
        #     negatives = [random_negative_context]
        #     logger.info(f"--- Triplet {idx} (WARM-UP EPOCH) ---")
        #     logger.info(f"  Question: {question}")
        #     logger.info(f"  Positive Context: {positive_context}")
        #     logger.info(f"  Random Negative Context: {random_negative_context}")
        #     logger.info("-" * 30)

        # else: #After warming up.
        if not hard_negatives: # Only if we don't have FAISS negatives
            # 1. MUST be the same Category
            candidates = df[df['Category'] == row['Category']].copy()  # USE .copy()!

            # 2. MUCH Tighter Price Range (on the TRANSFORMED scale)
            candidates = candidates[
                (candidates['Price'] >= row['Price'] - 0.02) & (candidates['Price'] <= row['Price'] + 0.02)
            ]

            # 3. Keyword Overlap (Stricter - at least 2 keywords, case-insensitive)
            if 'Keywords' in row and isinstance(row['Keywords'], str):
                def keyword_overlap_count(x):
                    if isinstance(x, str):
                        return sum(1 for k in str(x).lower().split() if k in row['Keywords'].lower())
                    else:
                        return 0
                candidates['overlap_count'] = candidates['Keywords'].apply(keyword_overlap_count)
                candidates = candidates[candidates['overlap_count'] >= 2] # Require >= 2 keywords
                candidates = candidates.drop(columns=['overlap_count'], errors='ignore')


            # 4.  Similar Monthly Searches (if available, and after other filters)
            if 'Monthly Searches' in df.columns and not candidates.empty:
                candidates = candidates[
                    (candidates['Monthly Searches'] >= row['Monthly Searches'] - 0.05) &
                    (candidates['Monthly Searches'] <= row['Monthly Searches'] + 0.05)
                ]
            # 5. Similar CPC (if available)
            if 'CPC (Exact)' in df.columns and not candidates.empty:
              candidates = candidates[
                  (candidates['CPC (Exact)'] >= row['CPC (Exact)'] - 0.05) &
                  (candidates['CPC (Exact)'] <= row['CPC (Exact)'] + 0.05)
              ]
            if 'CPC (Phrase)' in df.columns and not candidates.empty:
                candidates = candidates[
                    (candidates['CPC (Phrase)'] >= row['CPC (Phrase)'] - 0.05) &
                    (candidates['CPC (Phrase)'] <= row['CPC (Phrase)'] + 0.05)
                ]

            # NO LENGTH CHECK - it's a weak signal

            if len(candidates) > 1:  # We still have some candidates
                hard_negative = candidates.sample(1).iloc[0]
                hard_negatives.append(create_context_string_train(hard_negative)) # Use training context

            # We are not falling back to a random choice anymore

        random_negative = df.sample(1).iloc[0] #Still need random negative
        random_negative_context = create_context_string_train(random_negative) # Use training version
        negatives = [random_negative_context] + hard_negatives  # Combine for the dataset

        triplets.append({
            "question": question,
            "positive": positive_context,
            "random_negative": random_negative_context if 'random_negative_context' in locals() else "",
            "hard_negatives": hard_negatives
        })
    return triplets

In [7]:
# --- Dataset and DataLoader ---
class DomainDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        query = question_tokenizer(sample['question'], return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length)
        positive = context_tokenizer(sample['positive'], return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length)
        random_negative = context_tokenizer(sample['random_negative'], return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length)
        hard_negatives = [context_tokenizer(neg, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length) for neg in sample['hard_negatives']]

        # Move tensors to device and remove extra dimension
        return {
            'query': {k: v.squeeze(0).to(config.device) for k, v in query.items()},
            'positive': {k: v.squeeze(0).to(config.device) for k, v in positive.items()},
            'random_negative': {k: v.squeeze(0).to(config.device) for k, v in random_negative.items()},
            'hard_negatives': [{k: v.squeeze(0).to(config.device) for k, v in hard_negative.items()} for hard_negative in hard_negatives]
        }

def collate_fn(batch):
    queries = {key: torch.stack([d['query'][key] for d in batch]) for key in batch[0]['query'].keys()}
    positives = {key: torch.stack([d['positive'][key] for d in batch]) for key in batch[0]['positive'].keys()}
    random_negatives = {key: torch.stack([d['random_negative'][key] for d in batch]) for key in batch[0]['random_negative'].keys()}
    hard_negatives_list = []
    num_hard_negatives = min(len(d['hard_negatives']) for d in batch)
    for i in range(num_hard_negatives):
        hard_negatives_list.append({key: torch.stack([d['hard_negatives'][i][key] for d in batch]) for key in batch[0]['hard_negatives'][i].keys()})
    return queries, positives, random_negatives, hard_negatives_list

def train_step(query, positive, random_negative, hard_negatives, optimizer, criterion, scaler):
    optimizer.zero_grad()  # Clear gradients
    # Force float32 for both forward and backward passes
    with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
        query_embedding = question_encoder(**query).pooler_output
        positive_embedding = context_encoder(**positive).pooler_output
        random_negative_embedding = context_encoder(**random_negative).pooler_output

        scores_list = [torch.matmul(query_embedding, positive_embedding.T).diag(),
                       torch.matmul(query_embedding, random_negative_embedding.T).diag()]

        for hard_negative in hard_negatives:
            hard_negative_embedding = context_encoder(**hard_negative).pooler_output
            scores_list.append(torch.matmul(query_embedding, hard_negative_embedding.T).diag())

        scores = torch.stack(scores_list, dim=1)

        batch_size = query_embedding.size(0)
        targets = torch.zeros(batch_size, dtype=torch.long, device=config.device)

        loss = criterion(scores, targets)

    # Continue with gradient scaling and optimization
    scaler.scale(loss).backward()  # Scale the loss
    scaler.unscale_(optimizer) # Unscale before clipping
    torch.nn.utils.clip_grad_norm_(question_encoder.parameters(), max_norm=config.max_grad_norm)
    torch.nn.utils.clip_grad_norm_(context_encoder.parameters(), max_norm=config.max_grad_norm)
    scaler.step(optimizer)  # Step with scaler
    scaler.update()  # Update scaler

    with torch.no_grad(): # Calculate MRR (no gradient needed)
        ranks = torch.argsort(scores, dim=1, descending=True)
        positive_indices = (ranks == 0).nonzero(as_tuple=True)
        if len(positive_indices[0]) > 0:
            positive_ranks = positive_indices[1] + 1
            mrr = torch.mean(1.0 / positive_ranks.float())
        else:
            mrr = torch.tensor(0.0, device=config.device)

    return loss.detach(), mrr.detach() # Return detached losses


In [8]:
def evaluate_model(dataloader, question_encoder, context_encoder):
    question_encoder.eval()
    context_encoder.eval()

    all_scores = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            query, positive, random_negative, hard_negatives = batch

            query_embedding = question_encoder(**query).pooler_output
            positive_embedding = context_encoder(**positive).pooler_output

            scores_list = [torch.matmul(query_embedding, positive_embedding.T).diag()]
            for negative in [random_negative] + hard_negatives:
                negative_embedding = context_encoder(**negative).pooler_output
                scores_list.append(torch.matmul(query_embedding, negative_embedding.T).diag())

            scores = torch.stack(scores_list, dim=1)

            all_scores.extend(scores.cpu().float().numpy())  # Convert to float32
            all_labels.extend(torch.zeros(scores.shape[0], dtype=torch.int64).cpu().numpy())

    all_scores = np.array(all_scores)
    all_labels = np.array(all_labels)

    # Calculate MRR
    ranks = np.argsort(all_scores, axis=1)[:, ::-1]
    positive_ranks = (ranks == 0).nonzero()[1] + 1 if np.any(ranks == 0) else np.array([])
    mrr = np.mean(1.0 / positive_ranks) if len(positive_ranks) > 0 else 0.0

    question_encoder.train()
    context_encoder.train()
    return mrr

In [9]:
# --- Training Loop ---
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

num_training_steps = (len(train_df) // config.batch_size // config.accumulation_steps) * config.epochs
optimizer = AdamW(list(question_encoder.parameters()) + list(context_encoder.parameters()), lr=config.learning_rate)
# Re-enable warmup steps:
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=num_training_steps)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler(enabled=(config.device.type == 'cuda'))

writer = SummaryWriter(log_dir=config.tensorboard_log_dir)

best_val_loss = float('inf')
best_model_state_dict = None

for epoch in range(config.epochs):
    # Generate hard negatives every epoch
    hard_negatives_by_index = generate_similarity_based_hard_negatives(train_df, context_encoder, context_tokenizer, num_negatives=config.num_negatives)


    train_triplets = prepare_triplets_with_negatives(train_df, hard_negatives_by_index, epoch)  # Pass hard_negatives_by_index
    train_dataset = DomainDataset(train_triplets)
    train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

    #Hard negatives are optional for val.
    val_triplets = prepare_triplets_with_negatives(val_df)
    val_dataset = DomainDataset(val_triplets)
    val_dataloader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)


    for step, batch in enumerate(train_dataloader):
        loss, mrr = train_step(batch[0], batch[1], batch[2], batch[3], optimizer, criterion, scaler)

        if (step + 1) % config.accumulation_steps == 0:
            scheduler.step()
            optimizer.zero_grad() #Only zero when accumulated.

        if step % config.log_every == 0:
            logger.info(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}, MRR: {mrr.item()}")
            writer.add_scalar("Loss/train", loss.item(), epoch * len(train_dataloader) + step)
            writer.add_scalar("MRR/train", mrr.item(), epoch * len(train_dataloader) + step)

    val_mrr = evaluate_model(val_dataloader, question_encoder, context_encoder)
    val_loss = 1.0 - val_mrr # Lower is better


    logger.info(f"Epoch {epoch+1}, Validation MRR: {val_mrr:.4f}, Validation Loss: {val_loss:.4f}")
    writer.add_scalar("MRR/val", val_mrr, epoch)
    writer.add_scalar("Loss/val", val_loss, epoch) # Log val_loss


    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_state_dict = {
            'question_encoder': question_encoder.state_dict(),
            'context_encoder': context_encoder.state_dict()
        }
        torch.save(best_model_state_dict, config.output_model_path)
        logger.info(f"Best model saved at epoch {epoch+1}")

    elif epoch >= config.patience and (val_loss >= best_val_loss):
        logger.info(f"Early stopping triggered. No improvement for {config.patience} epochs.")
        break

# Load the best model after training
if best_model_state_dict is not None:
    best_model_state_dict = torch.load(config.output_model_path, map_location=config.device)
    question_encoder.load_state_dict(best_model_state_dict['question_encoder'])
    context_encoder.load_state_dict(best_model_state_dict['context_encoder'])
    logger.info("Best model loaded.")
else:
    logger.warning("Warning: No best model found. This can happen if training was interrupted.")

# --- FAISS Index Building (After Training) ---

context_encoder.eval()
index = faiss.IndexFlatL2(768)

with torch.no_grad():
    for idx, row in df.iterrows():
        context = create_context_string_train(row)
        context_input = context_tokenizer(context, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
        with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
          embedding = context_encoder(**context_input).pooler_output.detach().cpu().float().numpy()

        if embedding.size > 0:
              index.add(embedding.reshape(1, -1)) #FAISS needs 2D array.
        else:
            logger.warning(f"Empty embedding for row index {idx}. Skipping.")

faiss.write_index(index, config.faiss_index_path)
logger.info(f"FAISS index built and saved to {config.faiss_index_path}")

torch.cuda.empty_cache()
gc.collect()
writer.close()

print("Training and Index Building complete.")

  scaler = torch.cuda.amp.GradScaler(enabled=(config.device.type == 'cuda'))
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda')):
  best_model_state_dict = torch.load(config.output_model_path, map_location=config.device)
  with torch.cuda.amp.autocast(dtype=torch.float32, enabled=(config.device.type == 'cuda

Training and Index Building complete.


In [10]:
import faiss

index = faiss.read_index("dpr_faiss.index")

In [11]:
print(f"Index is trained: {index.is_trained}")  # Should be True
print(f"Number of vectors in index: {index.ntotal}")  # Should match # of rows in CSV
print(f"Dimension of vectors: {index.d}")      # Should be 768 (for DPR)

Index is trained: True
Number of vectors in index: 149
Dimension of vectors: 768


In [12]:
# --- Configuration ---
class Config:
    csv_path = "/content/Knowledge Base Real (11).csv"  # Your CSV
    question_encoder_model = "facebook/dpr-question_encoder-single-nq-base"
    context_encoder_model = "facebook/dpr-ctx_encoder-single-nq-base"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    max_length = 64  # For DPR
    output_model_path = "best_dpr_model.pth"  # DPR model path (for *loading*)
    faiss_index_path = "dpr_faiss.index"  # FAISS index path (for *loading*)
    news_api_key = "e8be2b7d92b147a686526909dcdfc648"   # REPLACE WITH YOUR KEY!
    flan_t5_model = "google/flan-t5-base" # Or "google/flan-t5-small", etc.

config = Config()

In [13]:
# --- Keyword Extraction (Final Version) ---
def extract_keywords(domain_name):
    """Extracts keywords from a domain name."""
    domain_without_tld = domain_name.split('.')[0]
    parts = re.split(r'[-_]+', domain_without_tld)
    keywords = []

    for part in parts:
        if not part.isdigit():
            try:
                segmented_words = wordsegment.segment(part)
                keywords.extend([word.lower() for word in segmented_words if len(word) > 1])
            except Exception as e:
                print(f"Word segmentation failed for '{part}': {e}.  Falling back.")
                sub_parts = re.findall(r'[a-zA-Z]+|[0-9]+', part)
                for sub_part in sub_parts:
                    if not sub_part.isdigit():
                        try:
                            segmented_sub_words = wordsegment.segment(sub_part)
                            keywords.extend([w.lower() for w in segmented_sub_words if len(w) > 1])
                        except Exception as e2:
                            print(f"  Sub-segmentation failed for '{sub_part}': {e2}. Keeping as is (if > 1).")
                            if len(sub_part) > 1:
                                keywords.append(sub_part.lower())
                    else:
                        keywords.append(sub_part)
        else:
            keywords.append(part)

    cleaned_domain_name = domain_without_tld.replace("-", " ").replace("_", " ").strip()

    if cleaned_domain_name.lower() != " ".join(keywords).lower():
        keywords.append(cleaned_domain_name)

    unique_keywords = []
    for keyword in keywords:
        if keyword not in unique_keywords:
            unique_keywords.append(keyword)

    return list(unique_keywords)  # Return as a *list*

# --- NewsAPI Integration ---
def get_news_article_count(api_key, keywords, days_past=7, cache_dir="news_counts_cache"):
    """Fetches news article count (past 14 days)."""
    newsapi = NewsApiClient(api_key=api_key)
    os.makedirs(cache_dir, exist_ok=True)

    if not keywords:
        print("No keywords extracted. Returning article count of 0.")
        return 0

    query_string = " OR ".join(keywords)
    cache_key = query_string.replace(" ", "_") + f"_{days_past}"
    cache_file = os.path.join(cache_dir, f"{cache_key}.json")

    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'r') as f:
                cached_data = json.load(f)
            if datetime.fromisoformat(cached_data['timestamp']).date() >= (datetime.now() - timedelta(days=1)).date():
                print(f"Using cached news count for query: {query_string}")
                return cached_data['count']
        except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
            print(f"Error reading cache for '{query_string}': {e}.  Ignoring cache.")

    try:
        from_date = (datetime.now() - timedelta(days=days_past)).strftime('%Y-%m-%d')
        retries = 3
        for attempt in range(retries):
            try:
                response = newsapi.get_everything(q=query_string,
                                                  from_param=from_date,
                                                  language='en',
                                                  sort_by='relevancy',
                                                  page_size=1,
                                                  page=1)

                if response['status'] == 'ok':
                    article_count = response['totalResults']
                    article_count = min(article_count, 100)

                    with open(cache_file, 'w') as f:
                        json.dump({'timestamp': datetime.now().isoformat(), 'count': article_count}, f)

                    return article_count

                elif response['code'] == 'rateLimited':
                    if attempt == retries - 1:
                        print(f"Rate limited after {retries} attempts for query: {query_string}.")
                        return -1
                    wait_time = 2 ** attempt
                    print(f"Rate limited.  Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                else:
                    print(f"NewsAPI error for '{query_string}': {response['code']} - {response['message']}")
                    return -1

            except NewsAPIException as e:
                print(f"NewsAPI Exception for '{query_string}': {e}")
                return -1
            except Exception as e:
                print(f"Unexpected error for '{query_string}': {e}")
                return -1
    except Exception as e:
        print(f"Date Calculation Error: {e}")
        return -1

    return -1

def calculate_trend_score(domain_name, article_count):
    """Calculates a trend score."""
    if article_count == -1:
        return 0.0

    score = 0
    tld = domain_name.split('.')[-1].lower()
    if tld == 'com':
        score += 3
    elif tld == '.ai':
        score += 2
    elif tld in ('net', 'org'):
        score += 1

    if '-' in domain_name:
        score -= 1
    if '_' in domain_name:
        score -= 1
    if any(char.isdigit() for char in domain_name):
        score -= 1

    normalized_article_count = min(article_count, 100) / 100.0
    score += normalized_article_count * 5
    return score

In [14]:
# --- DPR Retrieval ---
def load_dpr_components(config):
    """Loads trained DPR model and FAISS index."""
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(config.question_encoder_model)
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(config.context_encoder_model)
    question_encoder = DPRQuestionEncoder.from_pretrained(config.question_encoder_model).to(config.device)
    context_encoder = DPRContextEncoder.from_pretrained(config.context_encoder_model).to(config.device)

    checkpoint = torch.load(config.output_model_path, map_location=config.device)
    question_encoder.load_state_dict(checkpoint['question_encoder'])
    context_encoder.load_state_dict(checkpoint['context_encoder'])
    question_encoder.eval()
    context_encoder.eval()

    index = faiss.read_index(config.faiss_index_path)
    return question_encoder, context_encoder, index, question_tokenizer, context_tokenizer

def retrieve_comparable_sales(domain_name, keywords, question_encoder, context_encoder, index, context_tokenizer, user_category):
    """Retrieves comparable sales using DPR, filtering by category."""
    with torch.no_grad():
        dpr_input = domain_name + " " + " ".join(keywords)
        query_input = question_tokenizer(dpr_input, return_tensors="pt", truncation=True, padding="max_length", max_length=config.max_length).to(config.device)
        query_embedding = question_encoder(**query_input).pooler_output.cpu().numpy()

    k = 20  # Retrieve more initially, then filter
    distances, indices = index.search(query_embedding, k)

    comparable_sales = []
    for i in indices[0]:
        domain, price, category = get_domain_data_from_index(i)  # Get category
        if category == user_category:  # Filter by category!
            comparable_sales.append((domain, price))
            if len(comparable_sales) >= 5:  # Limit to 5 *after* filtering
                break

    return comparable_sales

# --- MODIFIED: Load DataFrame ONCE ---
_cached_df = None  # Global variable to store the DataFrame

def get_domain_data_from_index(index_value):
    """Retrieves original domain data from the CSV, given a FAISS index."""
    global _cached_df  # Use the global variable
    if _cached_df is None:
        # Load the DataFrame only once
        _cached_df = pd.read_csv(config.csv_path)
        print("DataFrame loaded into memory.")  # Indicate loading
    row = _cached_df.iloc[index_value]
    return row['Domain Name'], row['Price'], row['Category']  # ORIGINAL Price!


# --- Feature Engineering ---
def engineer_features(domain_name, trend_score, comparable_sales, user_category, df_row=None):
    features = {
        'domain_length': len(domain_name),
        'tld': domain_name.split('.')[-1].lower(),
        'trend_score': trend_score,  # Use the passed-in trend_score
        'contains_numbers': any(char.isdigit() for char in domain_name),
        'contains_hyphens': '-' in domain_name,
        'avg_comparable_price': 0.0,
        'max_comparable_price': 0.0,
        'min_comparable_price': 0.0,
        'monthly_searches': 0.0,  # Default values
        'cpc_exact': 0.0,
        'cpc_phrase': 0.0,
        'user_category': user_category,
    }

    if comparable_sales:
        prices = [price for _, price in comparable_sales]
        features['avg_comparable_price'] = sum(prices) / len(prices)
        features['max_comparable_price'] = max(prices)
        features['min_comparable_price'] = min(prices)

    # Add original features from the dataset (if available - during training)
    if df_row is not None:
        # features['monthly_searches'] = df_row.get('Monthly Searches', 0.0) # REMOVED
        pass # Removed

    return features

    # --- FLAN-T5 Explanation Generation ---
def generate_explanation_flan_t5(domain_name, predicted_price, features, comparable_sales):
    """Generates an explanation using FLAN-T5 (Few-Shot Prompt)."""

    # --- Few-Shot Examples (CRUCIAL for good explanations) ---
    examples = """
Example 1:
Domain: bestshoes.com
Predicted Price: $1850.00
Domain Features:
- Length: 12
- TLD: com
- Trend Score: 4.20
- Contains Numbers: False
- Contains Hyphens: False
- Category: Fashion
Comparable Sales:
- shoesite.com: $1700.00
- footwearplace.com: $2000.00
Explanation: bestshoes.com is a short and memorable .com domain, which makes it valuable.  The trend score of 4.20 indicates good interest in related topics.  The price is also consistent with comparable sales of similar domains.

Example 2:
Domain: long-domain-name-123.net
Predicted Price: $120.00
Domain Features:
- Length: 23
- TLD: net
- Trend Score: 0.80
- Contains Numbers: True
- Contains Hyphens: True
- Category: Technology
Comparable Sales:
- No comparable sales found.
Explanation:  The predicted price of $120 for long-domain-name-123.net is low due to several factors. The domain is quite long, contains hyphens and numbers (which are generally undesirable), and has a .net TLD, which is less valuable than .com. The low trend score suggests limited current interest, and no close comparable sales were found.
"""

    prompt = f"""{examples}
Now, explain the predicted price of the domain name '{domain_name}'.

Predicted Price: ${predicted_price:.2f}

Domain Features:
- Length: {features['domain_length']}
- TLD: {features['tld']}
- Trend Score: {features['trend_score']:.2f}
- Contains Numbers: {features['contains_numbers']}
- Contains Hyphens: {features['contains_hyphens']}
- Category: {features['user_category']}

Comparable Sales:
"""
    if comparable_sales:
        for comp_domain, comp_price in comparable_sales:
            prompt += f"- {comp_domain}: ${comp_price:.2f}\n"
    else:
        prompt += "- No comparable sales found.\n"

    prompt += "\nExplanation:"

    # --- Debugging: Print the prompt ---
    print("----- FLAN-T5 PROMPT -----")
    print(prompt)
    print("----- END PROMPT -----")

    # --- Tokenize and Generate ---
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
    with torch.no_grad():
        # Use beam search and adjust parameters
        outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) # Parameters
        # print(f"Output IDs: {outputs}")  # Debug Print
    explanation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return explanation

In [26]:
# --- Prediction Function (XGBoost + FLAN-T5) ---
def predict_price(domain_name, user_category, xgb_model, train_columns, question_encoder, context_encoder, index, context_tokenizer):
    """Predicts the price of a domain name."""
    keywords = extract_keywords(domain_name)
    article_count = get_news_article_count(config.news_api_key, keywords, config.days_past)
    trend_score = calculate_trend_score(domain_name, article_count)
    comparable_sales = retrieve_comparable_sales(domain_name, keywords, question_encoder, context_encoder, index, context_tokenizer, user_category)

    # --- Feature Engineering for Prediction (Corrected) ---
    features = engineer_features(domain_name, trend_score, comparable_sales, user_category)

    # Create a DataFrame for the *single* input domain
    input_df = pd.DataFrame([features])

    # --- One-Hot Encode (Consistent with Training) ---
    input_df = pd.get_dummies(input_df, columns=['tld', 'user_category'], prefix=['tld', 'cat'], dummy_na=False)

    # --- Align Columns (CRITICAL) ---
    # Add missing columns (if any) and set them to 0
    missing_cols = set(train_columns) - set(input_df.columns)
    for c in missing_cols:
        input_df[c] = 0
    # Ensure the order of columns is the same as in training
    input_df = input_df[train_columns]

    # Predict using XGBoost
    predicted_price = xgb_model.predict(input_df)[0]  # Get the single prediction


    # Generate explanation (FLAN-T5) - you'll need to implement this part
    explanation = generate_explanation_flan_t5(domain_name, predicted_price, features, comparable_sales)

    return predicted_price, explanation


# --- Data Loading and Preprocessing for XGBoost Training ---
def load_and_preprocess_data(csv_path):
    """Loads and preprocesses data for XGBoost, including original features."""
    try:
        df = pd.read_csv(csv_path)
        df = df.dropna()  # Handle missing values
        df = df.drop_duplicates()

        # --- Feature Engineering (Include original features) ---
        df['domain_length'] = df['Domain Name'].apply(len)
        df['contains_numbers'] = df['Domain Name'].apply(lambda x: any(char.isdigit() for char in x))
        df['contains_hyphens'] = df['Domain Name'].apply(lambda x: '-' in x)
        df['tld'] = df['Domain Name'].apply(lambda x: x.split('.')[-1].lower())
        # Select features and target (include original features, exclude 'Domain Name')
        features = ['domain_length', 'contains_numbers', 'contains_hyphens',
                    'Monthly Searches', 'CPC (Exact)', 'CPC (Phrase)','tld', 'Category', 'Trend Score', 'Price']  # Include original features


        # Ensure all required columns exist
        for col in features:
            if col not in df.columns:
                print(f"Error: Required column '{col}' not found in CSV.")
                exit(1)  # Or handle more gracefully

        # --- One-Hot Encode 'tld' and 'Category' *BEFORE* separating features and target ---
        df = pd.get_dummies(df, columns=['tld', 'Category'], prefix=['tld', 'cat'], dummy_na=False)

        # Drop domain name
        df = df.drop(columns=['Domain Name'])

        #Separate features and target
        y = df['Price']
        X = df.drop(columns=['Price'])

        return X, y

    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_path}")
        return None, None  # Return None, None on error
    except Exception as e:
        print(f"Error loading/preprocessing data: {e}")
        return None, None  # Return None, None on error


    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_path}")
        return None, None
    except Exception as e:
        print(f"Error loading/preprocessing data: {e}")
        return None, None

    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_path}")
        return None, None
    except Exception as e:
        print(f"Error loading/preprocessing data: {e}")
        return None, None


# --- Model Training (XGBoost) ---
def train_xgboost_model(X, y):
    """Trains an XGBoost model."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared: {r2}")

    return model, X_train.columns

In [27]:
# --- Main Execution ---
if __name__ == "__main__":
    api_key = config.news_api_key
    if not api_key:
        print("Please set your NewsAPI key in config.news_api_key")
        exit(1)

    # --- 1. Load Data and Train XGBoost Model ---
    X, y = load_and_preprocess_data(config.csv_path)  # Load and preprocess
    if X is None or y is None:
        print("Error: Could not load or preprocess data. Exiting.")
        exit(1)

    xgb_model, train_columns = train_xgboost_model(X, y)  # Train XGBoost

    # --- 2. Load FLAN-T5 Model and Tokenizer ---
    tokenizer = T5Tokenizer.from_pretrained(config.flan_t5_model)
    model = T5ForConditionalGeneration.from_pretrained(config.flan_t5_model).to(config.device)
    model.eval()
    print(f"FLAN-T5 model loaded on device: {model.device}")  # CHECK DEVICE
    print(f"Tokenizer vocab size: {tokenizer.vocab_size}")  # CHECK TOKENIZER

    # --- 3. Load DPR Model and FAISS Index ---
    question_encoder, context_encoder, dpr_index, question_tokenizer, context_tokenizer = load_dpr_components(config)

    # --- 4. Get User Input and Make Prediction ---
    domain_name = input("Enter the domain name to appraise: ")
    user_category = input("Enter the category of the domain: ") #Added input


    predicted_price, explanation = predict_price(
        domain_name, xgb_model, question_encoder, context_encoder, dpr_index, context_tokenizer,
        config.news_api_key, tokenizer, model, user_category  # Pass user_category
    )

    print("\n--- Results ---")
    print(explanation)
    print(f"Predicted price: {predicted_price}")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Date: object, Keywords: object, TLD: object, Excludes Hyphens: object, Excludes Numbers: object

In [None]:


importances = xgb_model.feature_importances_
feature_names = train_columns  # Your training data's column names
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance:.4f}")