In [1]:
# --- Install Required Libraries ---

# Install core libraries for data manipulation, machine learning, and transformer models.
# The 'transformers[torch]' syntax ensures PyTorch compatibility.
# The '-q' flag is used for a "quiet" installation, reducing verbose output.
!pip install -q pandas numpy scikit-learn xgboost transformers[torch] datasets

# Install libraries for efficient model loading and vector search.
# We force an upgrade ('-U') on bitsandbytes to ensure the latest version is used,
# which is required for 4-bit quantization of the open-source model.
!pip install -q -U bitsandbytes
!pip install -q accelerate faiss-cpu sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# --- Step 1: Force a clean installation of the LATEST compatible libraries ---
# This is the definitive fix to resolve all version conflicts.

print("Uninstalling existing libraries to ensure a clean environment...")
!pip uninstall -y transformers accelerate trl peft bitsandbytes

print("\nInstalling the latest stable versions of all required Hugging Face libraries...")
!pip install -q \
    transformers \
    trl \
    peft \
    accelerate \
    bitsandbytes

print("\n✅ Clean installation complete. Please RESTART the session now.")

Uninstalling existing libraries to ensure a clean environment...
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: accelerate 1.10.1
Uninstalling accelerate-1.10.1:
  Successfully uninstalled accelerate-1.10.1
[0mFound existing installation: peft 0.17.1
Uninstalling peft-0.17.1:
  Successfully uninstalled peft-0.17.1
Found existing installation: bitsandbytes 0.48.1
Uninstalling bitsandbytes-0.48.1:
  Successfully uninstalled bitsandbytes-0.48.1

Installing the latest stable versions of all required Hugging Face libraries...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m


In [1]:
# --- Import All Necessary Libraries ---

# Standard libraries for data handling, OS interaction, and memory management.
import os
import gc
import re
import json
import joblib

# Core data science and machine learning libraries.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# PyTorch and Hugging Face Transformers libraries.
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Libraries for the RAG (Retrieval-Augmented Generation) pipeline.
import faiss
from sentence_transformers import SentenceTransformer

# Library for Google Drive integration.
from google.colab import drive

# --- Mount Google Drive and Configure Paths ---

# Datasets are read from, and artifacts written to, the user's Google Drive,
# so it has to be mounted into the Colab filesystem first.
drive.mount('/content/drive')

# Project root inside Google Drive. It is expected to contain 'DecentraFAQ.csv';
# the 'artifacts' subfolder is created below when it is missing.
PROJECT_PATH = "/content/drive/MyDrive/DecentraBot_Project/"
ARTIFACTS_PATH = os.path.join(PROJECT_PATH, "artifacts/")

# exist_ok=True turns this into a no-op when the folder is already there.
os.makedirs(ARTIFACTS_PATH, exist_ok=True)

print(
    "Setup complete. All libraries imported and Google Drive is mounted.\n"
    f"Trained models and data samples will be saved to: {ARTIFACTS_PATH}"
)

Mounted at /content/drive
Setup complete. All libraries imported and Google Drive is mounted.
Trained models and data samples will be saved to: /content/drive/MyDrive/DecentraBot_Project/artifacts/


In [3]:
# --- Data loading configuration ---
NFT_SAMPLE_ROWS = 20000          # rows streamed from the NFT-70M dataset
FAQ_SAMPLE_ROWS = 1000           # upper bound on FAQ queries kept for fine-tuning
FAQ_FILENAME = "DecentraFAQ.csv" # single source of truth for the path AND the error message

# --- 1. Load NFT Transaction Data Sample ---
# Streaming mode yields rows lazily, so only the first NFT_SAMPLE_ROWS records are
# fetched instead of the entire multi-gigabyte dataset.
print(f"Loading a {NFT_SAMPLE_ROWS:,}-row sample from the NFT-70M dataset...")
try:
    nft_dataset = load_dataset("MLNTeam-Unical/NFT-70M_transactions", split='train', streaming=True).take(NFT_SAMPLE_ROWS)
    nft_frame = pd.DataFrame(nft_dataset)

    # Standardize the timestamp column for consistency across all data sources.
    # errors='coerce' turns unparseable timestamps into NaT, which are then dropped.
    nft_frame['timestamp'] = pd.to_datetime(nft_frame['tx_timestamp'], format='mixed', errors='coerce')
    nft_frame.dropna(subset=['timestamp'], inplace=True)

    # Publish the frame under its final name only once it is fully prepared, so a
    # failure part-way through never leaves a half-processed `df_nft_sample` behind.
    df_nft_sample = nft_frame
    print(f"Successfully loaded a sample of {len(df_nft_sample)} transactions.")
except Exception as e:
    # Deliberately best-effort: report the problem and let the FAQ loading proceed.
    print(f"Error loading NFT dataset: {e}")


# --- 2. Load the DecentraFAQ Intent Classification Dataset ---
# This dataset is used to train the intent classifier in a later step.
print("Loading the DecentraFAQ dataset...")
try:
    # The CSV lives directly in the project folder defined in the setup cell.
    df_faq = pd.read_csv(os.path.join(PROJECT_PATH, FAQ_FILENAME))
    # A smaller sample keeps fine-tuning fast. DataFrame.sample raises ValueError when
    # n exceeds the number of rows, so the request is capped at the size of the file.
    faq_sample_size = min(FAQ_SAMPLE_ROWS, len(df_faq))
    df_faq_sample = df_faq.sample(n=faq_sample_size, random_state=42).copy()
    print(f"Successfully loaded and sampled {len(df_faq_sample)} queries from DecentraFAQ.")
except FileNotFoundError:
    print(f"Error: {FAQ_FILENAME} not found. Please ensure the file is in your Google Drive project folder.")

# --- 3. Save Data Samples for Reproducibility ---
# Persisting the exact subsets used lets anyone re-run the notebook on the same data.
# Each sample is written only if its loading step above succeeded.
if 'df_nft_sample' in locals():
    df_nft_sample.to_csv(os.path.join(ARTIFACTS_PATH, "nft_sample_data.csv"), index=False)
    print("Saved nft_sample_data.csv to the artifacts folder.")
if 'df_faq_sample' in locals():
    df_faq_sample.to_csv(os.path.join(ARTIFACTS_PATH, "faq_sample_data.csv"), index=False)
    print("Saved faq_sample_data.csv to the artifacts folder.")

Loading a 20,000-row sample from the NFT-70M dataset...
Successfully loaded a sample of 20000 transactions.
Loading the DecentraFAQ dataset...
Successfully loaded and sampled 1000 queries from DecentraFAQ.
Saved nft_sample_data.csv to the artifacts folder.
Saved faq_sample_data.csv to the artifacts folder.


In [4]:
# --- Section: Data Augmentation ---
# As described in the paper, we augment the raw transaction data with external
# features to provide richer context for the predictive models.

import yfinance as yf
from datetime import datetime

print("Augmenting transaction data with economic indicators...")

# Names of the price features this cell adds to the transaction sample.
PRICE_COLUMNS = ['btc_price', 'eth_price']

# Calendar-day key used to join each transaction with the daily price table.
df_nft_sample['date'] = df_nft_sample['timestamp'].dt.date

# --- Fetch economic data for a broad, fixed date range ---
# This ensures we have data to match, regardless of where the sample comes from.
print("Fetching economic data for a broad period (2018-Today)...")
start_date_fixed = "2018-01-01"
end_date_fixed = datetime.today().strftime('%Y-%m-%d')

try:
    tickers = yf.download(
        ['BTC-USD', 'ETH-USD'], # Bitcoin and Ethereum prices
        start=start_date_fixed,
        end=end_date_fixed,
        progress=False
    )

    # An empty frame means the download failed; skip augmentation in that case.
    if tickers.empty:
        print("Error: yfinance returned no data. Skipping augmentation.")
    else:
        # Keep the closing prices and rename the ticker columns to feature names.
        df_economic = tickers['Close'].rename(columns={
            'BTC-USD': 'btc_price',
            'ETH-USD': 'eth_price'
        })

        # Fill calendar gaps on the PRICE table, before merging. Forward-filling after
        # the merge would walk over transaction rows in whatever order they happen to
        # be stored, so a gap would inherit the price of the preceding row even when
        # that row belongs to a different day. asfreq('D') needs a sorted, unique index.
        df_economic = df_economic[~df_economic.index.duplicated(keep='last')]
        df_economic = df_economic.sort_index().asfreq('D').ffill()
        df_economic['date'] = df_economic.index.date

        # Drop price columns left by an earlier run of this cell; otherwise the merge
        # would emit suffixed duplicates (btc_price_x / btc_price_y) and the lookups
        # below would fail with a KeyError.
        df_nft_sample = pd.merge(
            df_nft_sample.drop(columns=PRICE_COLUMNS, errors='ignore'),
            df_economic[['date'] + PRICE_COLUMNS],
            on='date',
            how='left'
        )

        # Drop any rows where a match was still not possible (e.g., very old transactions).
        df_nft_sample.dropna(subset=PRICE_COLUMNS, inplace=True)

        print(f"\nSuccessfully added economic indicators. New features: {PRICE_COLUMNS}")
        print("Preview of the augmented data:")
        print(df_nft_sample[['timestamp', 'price', 'btc_price', 'eth_price']].head())

except Exception as e:
    # Best-effort by design: the notebook reports the failure instead of aborting.
    print(f"An error occurred while fetching or processing economic data: {e}")

Augmenting transaction data with economic indicators...
Fetching economic data for a broad period (2018-Today)...


  tickers = yf.download(



Successfully added economic indicators. New features: ['btc_price', 'eth_price']
Preview of the augmented data:
            timestamp  price     btc_price    eth_price
0 2023-04-29 22:59:54  0.030  29248.488281  1908.916992
1 2023-04-29 22:59:48  0.470  29248.488281  1908.916992
2 2023-04-29 22:59:47  0.035  29248.488281  1908.916992
3 2023-04-29 22:59:47  0.035  29248.488281  1908.916992
4 2023-04-29 22:59:47  0.035  29248.488281  1908.916992


In [5]:
import random

# --- Section: Data Preparation for Multitask-TabLLM ---
# As a core contribution, the paper proposes a Multitask-TabLLM fine-tuned on tabular data
# that has been serialized into natural language instructions. This section implements that process.

print("Preparing multitask instruction dataset for TabLLM fine-tuning...")

# Number of rows serialized per task, and the seed that fixes the template choice.
# A dedicated Random instance keeps the run reproducible without touching the
# module-level `random` state that other cells may rely on.
INSTRUCTIONS_PER_TASK = 500
TEMPLATE_SEED = 42
template_rng = random.Random(TEMPLATE_SEED)

# --- 1. Define Professional, Varied Prompt Templates ---
# Based on the paper's methodology, we use a set of templates to convert structured
# data rows into diverse, natural-language questions for two tasks.

regression_templates = [
    "Given the following attributes for an NFT transaction: `num_sales` is {num_sales}, `price` is {price:.4f}, `btc_price` is {btc_price:,.2f}, and `eth_price` is {eth_price:,.2f}. What is the predicted `usd_price`? Respond with only the numerical value.",
    "Predict the `usd_price` for an NFT transaction with these details: num_sales: {num_sales}, price: {price:.4f}, btc_price: {btc_price:,.2f}, eth_price: {eth_price:,.2f}. Provide only the number.",
    "An NFT sale has the following data points: price is {price:.4f}, the number of sales is {num_sales}, btc price is {btc_price:,.2f}, and eth price is {eth_price:,.2f}. Calculate its estimated `usd_price`."
]

classification_templates = [
    "An NFT transaction has the following characteristics: `num_sales` is {num_sales}, `price` is {price:.4f}, `btc_price` is {btc_price:,.2f}, and `eth_price` is {eth_price:,.2f}. Would you classify its `usd_price` as 'High' or 'Low'? Respond with only the class label.",
    "Based on the data (num_sales: {num_sales}, price: {price:.4f}, btc_price: {btc_price:,.2f}, eth_price: {eth_price:,.2f}), is the transaction price 'High' or 'Low'? Give only the label.",
    "Classify the following transaction as having a 'High' or 'Low' `usd_price`. The data is: num_sales of {num_sales}, a price of {price:.4f}, a btc_price of {btc_price:,.2f}, and an eth_price of {eth_price:,.2f}."
]

# --- 2. Define Tasks and Serialize Data ---
# Create the classification target ('High' or 'Low' based on the median price).
median_price = df_nft_sample['usd_price'].median()
df_nft_sample['price_class'] = np.where(df_nft_sample['usd_price'] >= median_price, 'High', 'Low')

# Define the features to be used for the TabLLM.
tabllm_features = ['num_sales', 'price', 'btc_price', 'eth_price']

multitask_instructions = []
# One regression and one classification instruction per sampled row
# (2 * INSTRUCTIONS_PER_TASK in total) for a quick fine-tuning run.
for i in range(INSTRUCTIONS_PER_TASK):
    # random_state=i makes the i-th draw deterministic across runs.
    sample_row = df_nft_sample.sample(n=1, random_state=i).iloc[0]

    # Only the declared feature columns are handed to the templates; splatting the
    # whole row would fail on any column whose name is not a string.
    feature_values = sample_row[tabllm_features].to_dict()

    # --- Create a Regression Instruction using a random template ---
    regression_target = sample_row['usd_price']
    chosen_reg_template = template_rng.choice(regression_templates)
    regression_instruction = chosen_reg_template.format(**feature_values)

    # Format for the Mistral Instruct template.
    multitask_instructions.append({
        "text": f"[INST] {regression_instruction} [/INST] {regression_target:.4f}"
    })

    # --- Create a Classification Instruction using a random template ---
    classification_target = sample_row['price_class']
    chosen_cls_template = template_rng.choice(classification_templates)
    classification_instruction = chosen_cls_template.format(**feature_values)

    multitask_instructions.append({
        "text": f"[INST] {classification_instruction} [/INST] {classification_target}"
    })

print(f"Successfully synthesized {len(multitask_instructions)} instructions for two tasks.")
print("\n--- Example Regression Instruction ---")
print(multitask_instructions[0]['text'])
print("\n--- Example Classification Instruction ---")
print(multitask_instructions[1]['text'])

Preparing multitask instruction dataset for TabLLM fine-tuning...
Successfully synthesized 1000 instructions for two tasks.

--- Example Regression Instruction ---
[INST] An NFT sale has the following data points: price is 0.0016, the number of sales is 1, btc price is 29,248.49, and eth price is 1,908.92. Calculate its estimated `usd_price`. [/INST] 0.4958

--- Example Classification Instruction ---
[INST] Based on the data (num_sales: 1, price: 0.0016, btc_price: 29,248.49, eth_price: 1,908.92), is the transaction price 'High' or 'Low'? Give only the label. [/INST] Low


In [6]:
# --- Section: Augmenting TabLLM Data with Forecasting Instructions ---
import random
print("Augmenting multitask dataset with Time-Series Forecasting instructions...")

# Number of forecasting prompts to create, and the seed that fixes which windows and
# templates are drawn. A dedicated Random instance leaves the global RNG untouched.
FORECAST_INSTRUCTION_COUNT = 500
FORECAST_SEED = 42
forecast_rng = random.Random(FORECAST_SEED)

# --- 1. Define Professional Forecasting Templates ---
# These templates are designed to present a sequence of historical data to the LLM.
# NOTE(review): the first template speaks of "days", but the window below is made of
# consecutive transactions, which may all fall on the same day. The wording is left
# unchanged here because it is training text.
forecasting_templates = [
    "Given the transaction history for the past {sequence_length} days: {historical_data}. What is the predicted `usd_price` for the next transaction? Respond with only the numerical value.",
    "Analyze the following sequence of {sequence_length} transactions: {historical_data}. Forecast the `usd_price` for the subsequent sale. Provide only the number.",
    "Based on this time-series data: {historical_data}, what is the expected `usd_price` of the next NFT sale in the sequence?"
]

# --- 2. Prepare and Serialize Sequential Data ---
# Sequences are only meaningful in chronological order.
df_nft_sample_sorted = df_nft_sample.sort_values('timestamp').reset_index(drop=True)
sequence_length = 5  # We'll use a short sequence of 5 past transactions for the prompt.

# A window starting at s covers rows s .. s+sequence_length-1 and its target is row
# s+sequence_length, so the largest valid start is len - sequence_length - 1.
# random.randint is inclusive on both ends, hence this exact bound.
max_start_index = len(df_nft_sample_sorted) - sequence_length - 1

forecasting_instructions_count = 0
if max_start_index < 0:
    # Fewer than sequence_length + 1 rows: no window/target pair exists.
    print("Not enough transactions to build forecasting sequences; skipping.")
else:
    for _ in range(FORECAST_INSTRUCTION_COUNT):
        start_index = forecast_rng.randint(0, max_start_index)
        window_end = start_index + sequence_length

        # Extract the historical sequence and the target value.
        sequence_df = df_nft_sample_sorted.iloc[start_index:window_end]
        target_row = df_nft_sample_sorted.iloc[window_end]

        # Serialize the historical data into a readable, comma-separated string.
        historical_data_str = ", ".join(
            f"[Timestamp: {row['timestamp'].strftime('%Y-%m-%d')}, "
            f"USD Price: {row['usd_price']:.2f}, "
            f"BTC Price: {row['btc_price']:,.0f}]"
            for _row_label, row in sequence_df.iterrows()
        )

        # Choose a random template.
        chosen_template = forecast_rng.choice(forecasting_templates)
        forecasting_instruction = chosen_template.format(
            sequence_length=sequence_length,
            historical_data=historical_data_str
        )

        # Format for the Mistral Instruct template.
        multitask_instructions.append({
            "text": f"[INST] {forecasting_instruction} [/INST] {target_row['usd_price']:.4f}"
        })
        forecasting_instructions_count += 1

print(f"Successfully synthesized {forecasting_instructions_count} new forecasting instructions.")
print(f"Total instructions for TabLLM fine-tuning now: {len(multitask_instructions)}")

if forecasting_instructions_count > 0:
    print("\n--- Example Forecasting Instruction ---")
    # Print the last added instruction as an example
    print(multitask_instructions[-1]['text'])

Augmenting multitask dataset with Time-Series Forecasting instructions...
Successfully synthesized 500 new forecasting instructions.
Total instructions for TabLLM fine-tuning now: 1500

--- Example Forecasting Instruction ---
[INST] Based on this time-series data: [Timestamp: 2023-04-29, USD Price: 38.83, BTC Price: 29,248], [Timestamp: 2023-04-29, USD Price: 26.03, BTC Price: 29,248], [Timestamp: 2023-04-29, USD Price: 3.73, BTC Price: 29,248], [Timestamp: 2023-04-29, USD Price: 14.93, BTC Price: 29,248], [Timestamp: 2023-04-29, USD Price: 15.77, BTC Price: 29,248], what is the expected `usd_price` of the next NFT sale in the sequence? [/INST] 15.3507


In [7]:
# Install the Transformer Reinforcement Learning (TRL) library
# This library provides the SFTTrainer, which is a specialized tool for
# supervised fine-tuning of language models on instruction-based datasets.
!pip install trl -q

In [12]:
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from transformers import BitsAndBytesConfig
import os, gc, torch

# 0) Assumes: multitask_instructions is a list of dicts with a "text" field
ft_dataset_tabllm = Dataset.from_list(multitask_instructions)

# ARTIFACTS_PATH is defined (and its directory created) in the setup cell.
tabllm_model_path = os.path.join(ARTIFACTS_PATH, "multitask_tabllm_adapter")
# PEFT writes adapter_config.json when an adapter is saved. Checking for that file,
# rather than for the bare directory, keeps an empty or partially written folder
# from being mistaken for a finished adapter.
adapter_config_file = os.path.join(tabllm_model_path, "adapter_config.json")

if os.path.isfile(adapter_config_file):
    print("\nFound a previously fine-tuned model adapter. Skipping retraining.")
else:
    print("\nNo fine-tuned model adapter found. Proceeding with training.")

    # 1) LoRA / PEFT
    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # 2) 4-bit quantization (QLoRA)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    # 3) SFT config; model_init_kwargs carries the from_pretrained arguments
    training_args = SFTConfig(
        output_dir=os.path.join(ARTIFACTS_PATH, "tabllm_results"),
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,                   # typical for LoRA
        logging_steps=10,
        max_steps=100,
        save_strategy="steps",
        save_steps=50,
        report_to="none",
        fp16=True,

        # SFT-specific
        dataset_text_field="text",            # your dataset must have this column
        max_length=512,
        packing=True,

        # Forwarded to from_pretrained because the model is passed as a string below.
        model_init_kwargs={
            "quantization_config": quantization_config,
            # The installed transformers release warns that `torch_dtype` is
            # deprecated in favour of `dtype`.
            "dtype": torch.float16,
            "device_map": "auto",            # place shards automatically
        },
    )

    model_id = "mistralai/Mistral-7B-Instruct-v0.2"

    trainer_tabllm = SFTTrainer(
        model=model_id,
        args=training_args,
        peft_config=peft_config,
        train_dataset=ft_dataset_tabllm,
    )

    print("\nStarting lightweight fine-tuning for Multitask-TabLLM with PEFT/LoRA...")
    trainer_tabllm.train()
    print("TabLLM fine-tuning complete.")

    # Save only the adapter (Peft model)
    trainer_tabllm.save_model(tabllm_model_path)
    print(f"Fine-tuned TabLLM adapter saved to: {tabllm_model_path}")

    # Release the trainer (and the model it holds) before later cells load more models.
    del trainer_tabllm
    gc.collect()
    torch.cuda.empty_cache()
    print("Cleaned up fine-tuning resources.")



No fine-tuned model adapter found. Proceeding with training.


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Adding EOS to train dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]


Starting lightweight fine-tuning for Multitask-TabLLM with PEFT/LoRA...


  return fn(*args, **kwargs)


Step,Training Loss
10,2.0336
20,1.0268
30,0.5075
40,0.3332
50,0.3177
60,0.2951
70,0.3009
80,0.2929
90,0.291
100,0.3032


  return fn(*args, **kwargs)


TabLLM fine-tuning complete.
Fine-tuned TabLLM adapter saved to: /content/drive/MyDrive/DecentraBot_Project/artifacts/multitask_tabllm_adapter
Cleaned up fine-tuning resources.


In [14]:
import faiss
from sentence_transformers import SentenceTransformer

# 1. Synthesize a Knowledge Base
# Short, hand-written passages that the retriever can surface as context.
knowledge_base_texts = [
    "NFT prices are highly volatile and influenced by market hype. Key metrics to watch are floor price, trading volume, and unique holders.",
    "Investing in NFTs involves significant risk. Always do your own research (DYOR) before purchasing any digital asset. Never invest more than you can afford to lose.",
    "The MANA token is the primary currency used for transactions within the Decentraland ecosystem. Its price can affect the USD value of land sales.",
    "'Floor price' refers to the lowest price for an NFT within a specific collection. A rising floor price can indicate increasing demand."
]

# 2. Embed and Index the Knowledge Base
print("Building RAG pipeline...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(knowledge_base_texts)

# The flat L2 index is sized from the second axis of the embedding matrix.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# 3. Create the retrieval function
def retrieve_context(query, k=2):
    """Return up to ``k`` knowledge-base passages closest to ``query``.

    The query is embedded with the module-level ``embedding_model`` and looked up in
    the module-level FAISS ``index``; hits are mapped back to ``knowledge_base_texts``.

    Args:
        query: Text to search for.
        k: Maximum number of passages to return.

    Returns:
        A list with at most ``k`` passages, best match first. The list is shorter
        than ``k`` when the index holds fewer vectors, and empty when ``k <= 0`` or
        the index is empty.
    """
    # Never request more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query])
    _, indices = index.search(query_embedding, k)

    # FAISS reports missing neighbours as -1, which Python indexing would silently
    # turn into "the last passage"; only genuine positions are mapped back to text.
    return [knowledge_base_texts[i] for i in indices[0] if 0 <= i < len(knowledge_base_texts)]

print(" RAG pipeline is ready.")

# --- Smoke-test the retriever with a sample question ---
test_query = "What affects the value of an NFT?"
retrieved_docs = retrieve_context(test_query)
print(f"\nTest query: '{test_query}'")
print(f"Retrieved context: {retrieved_docs}")

Building RAG pipeline...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 RAG pipeline is ready.

Test query: 'What affects the value of an NFT?'
Retrieved context: ['NFT prices are highly volatile and influenced by market hype. Key metrics to watch are floor price, trading volume, and unique holders.', 'Investing in NFTs involves significant risk. Always do your own research (DYOR) before purchasing any digital asset. Never invest more than you can afford to lose.']


In [22]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

print("Fine-tuning Query Classifier (DistilBERT)...")

# 1. Prepare the FAQ Data for Training
# Keep only the query text and its label, dropping incomplete rows.
df_faq_sample = df_faq_sample.rename(columns={'query': 'text'})
df_faq_sample = df_faq_sample.loc[:, ['text', 'label']].dropna()

# Encode the string labels as integer ids and keep both lookup directions,
# which are later stored in the model config.
label_encoder = LabelEncoder()
df_faq_sample['label_id'] = label_encoder.fit_transform(df_faq_sample['label'])
id_to_label = dict(enumerate(map(str, label_encoder.classes_)))
label_to_id = {name: idx for idx, name in id_to_label.items()}

# 80/20 train/validation split with a fixed seed.
all_texts = df_faq_sample['text'].tolist()
all_label_ids = df_faq_sample['label_id'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_label_ids,
    test_size=0.2,
    random_state=42,
)

# 2. Tokenize the Text Data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
class FAQDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence aligned with it.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example ``idx`` to a tensor, then attach
        # the label under the 'labels' key.
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item
# Wrap the tokenized splits so the Trainer can index them.
train_dataset = FAQDataset(train_encodings, train_labels)
val_dataset = FAQDataset(val_encodings, val_labels)

# 3. Configure and Execute the Fine-Tuning Process
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(id_to_label),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Evaluation and checkpointing both happen once per epoch, which is what
# load_best_model_at_end needs in order to pick a checkpoint.
training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACTS_PATH, 'results'),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Starting DistilBERT fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

# 4. Save the Final Model for Later Use
# Model and tokenizer go into the same folder so they can be reloaded together.
model_path = os.path.join(ARTIFACTS_PATH, 'query_classifier_model')
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print("Classifier model saved to the artifacts folder.")

Fine-tuning Query Classifier (DistilBERT)...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting DistilBERT fine-tuning...


Epoch,Training Loss,Validation Loss
1,1.5921,1.443376
2,1.1387,1.055513
3,0.9056,0.954006


Fine-tuning complete.
Classifier model saved to the artifacts folder.


In [2]:
from peft import PeftModel
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          AutoModelForSequenceClassification, BitsAndBytesConfig, pipeline)
import torch
import joblib
from sentence_transformers import SentenceTransformer
import faiss

print("--- Initializing Full DecentraBot Pipeline ---")

# --- 1. Re-initialize the RAG Pipeline ---
# Rebuilt here so this cell does not depend on the earlier RAG cell having been run.
print("\n1. Initializing RAG pipeline...")
knowledge_base_texts = [
    "NFT prices exhibit high volatility and are significantly influenced by market sentiment and hype cycles. Important metrics include floor price, trading volume, and the number of unique asset holders.",
    "Investing in Non-Fungible Tokens (NFTs) carries substantial risk due to market fluctuations and potential illiquidity. Always conduct thorough research (DYOR) before any purchase and adhere to responsible investment principles.",
    "Different NFT collections utilize various blockchain tokens for transactions (e.g., ETH, MATIC, MANA). The price of the underlying token can impact the USD valuation of NFT sales.",
    "The 'floor price' denotes the minimum listed price for any NFT within a given collection. An increasing floor price often suggests growing demand or decreasing supply at lower price points.",
]
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(knowledge_base_texts)

# The flat L2 index is sized from the second axis of the embedding matrix.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
def retrieve_context(query, k=2):
    """Return up to ``k`` knowledge-base passages closest to ``query``.

    Args:
        query: The user's question as a string; it is embedded with
            ``embedding_model`` before searching.
        k: Maximum number of passages to return (default 2).

    Returns:
        A list of strings taken from ``knowledge_base_texts``, in the order the
        index returned them. The list is shorter than ``k`` when the knowledge
        base holds fewer than ``k`` passages, and empty when ``k <= 0``.
    """
    # Never ask for more neighbours than there are passages: FAISS pads the
    # result with -1 in that case, and -1 would silently index the LAST passage.
    k = min(k, len(knowledge_base_texts))
    if k <= 0:
        return []
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(query_embedding, k)
    # Defensive filter: drop any remaining -1 padding ids.
    return [knowledge_base_texts[i] for i in indices[0] if i >= 0]
print("   -> RAG pipeline is ready.")

# Names of components whose loading failed. The final banner is derived from
# this list so it can no longer claim success after a swallowed exception.
failed_components = []

# --- 2. Load the Main Conversational LLM (Base Mistral) ---
# This is the model used for the WNP diagnosis and final response generation.
# A failure here is NOT caught: nothing downstream can work without it.
print("\n2. Loading main conversational LLM (Mistral-7B)...")
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
local_llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16, device_map="auto")
print("   -> Main conversational LLM is ready.")

# --- 3. Load the Fine-Tuned Query Classifier ---
# This is the fine-tuned DistilBERT model for intent classification.
print("\n3. Loading fine-tuned Query Classifier...")
try:
    classifier_model_path = os.path.join(ARTIFACTS_PATH, 'query_classifier_model')
    classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_path)
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_path)
    query_classifier = pipeline("text-classification", model=classifier_model, tokenizer=classifier_tokenizer)
    print("   -> Query Classifier loaded successfully.")
except Exception as e:
    # Best-effort: keep going, but remember the failure for the final banner.
    failed_components.append("Query Classifier")
    print(f"   -> ERROR loading Query Classifier: {e}")

# --- 4. Load the Fine-Tuned Multitask-TabLLM ---
# This is the Mistral model with the TabLLM adapter weights merged into it.
print("\n4. Loading fine-tuned Multitask-TabLLM...")
try:
    # We need a separate tokenizer instance for this pipeline.
    tabllm_tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tabllm_tokenizer.pad_token is None:
        tabllm_tokenizer.pad_token = tabllm_tokenizer.eos_token

    # Reload the base model to apply the TabLLM adapter.
    base_model_for_tabllm = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
    adapter_path = os.path.join(ARTIFACTS_PATH, "multitask_tabllm_adapter")
    tabllm_model = PeftModel.from_pretrained(base_model_for_tabllm, adapter_path)
    tabllm_model = tabllm_model.merge_and_unload()

    tabllm_pipeline = pipeline("text-generation", model=tabllm_model, tokenizer=tabllm_tokenizer, torch_dtype=torch.float16, device_map="auto")
    print("   -> Fine-tuned TabLLM prediction pipeline is ready.")
except Exception as e:
    # Best-effort: keep going, but remember the failure for the final banner.
    failed_components.append("Multitask-TabLLM")
    print(f"   -> ERROR loading TabLLM: {e}")

# Report the real outcome instead of an unconditional success message.
if failed_components:
    print(f"\n--- ⚠️ Initialization incomplete. Failed components: {', '.join(failed_components)} ---")
else:
    print("\n--- ✅ All Components Initialized ---")

--- Initializing Full DecentraBot Pipeline ---

1. Initializing RAG pipeline...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   -> RAG pipeline is ready.

2. Loading main conversational LLM (Mistral-7B)...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


   -> Main conversational LLM is ready.

3. Loading fine-tuned Query Classifier...


Device set to use cuda:0


   -> Query Classifier loaded successfully.

4. Loading fine-tuned Multitask-TabLLM...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


   -> Fine-tuned TabLLM prediction pipeline is ready.

--- ✅ All Components Initialized ---


In [3]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, BitsAndBytesConfig, pipeline
import torch
import joblib

print("Loading all trained artifacts for the final pipeline...")

# --- 1. Query Classifier (fine-tuned DistilBERT, intent classification) ---
# Model and tokenizer are loaded explicitly and handed to the pipeline as objects.
try:
    classifier_model_path = os.path.join(ARTIFACTS_PATH, 'query_classifier_model')
    classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_path)
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_path)
    query_classifier = pipeline(
        "text-classification",
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
except Exception as e:
    print(f"❌ ERROR: Could not load the Query Classifier. Details: {e}")
else:
    print("✅ Query Classifier loaded successfully.")

# --- 2. Fine-Tuned Multitask-TabLLM ---
# A fresh copy of the base Mistral model receives the saved adapter weights.
print("\nLoading the fine-tuned Multitask-TabLLM...")
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_id = "mistralai/Mistral-7B-Instruct-v0.2"

    # Dedicated tokenizer instance for the TabLLM pipeline; EOS doubles as the
    # padding token when none is defined.
    tabllm_tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tabllm_tokenizer.pad_token is None:
        tabllm_tokenizer.pad_token = tabllm_tokenizer.eos_token

    base_model_for_tabllm = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")

    # Attach the PEFT adapter from the artifacts folder, then merge it into the
    # base weights so inference runs on a single plain model.
    adapter_path = os.path.join(ARTIFACTS_PATH, "multitask_tabllm_adapter")
    tabllm_model = PeftModel.from_pretrained(base_model_for_tabllm, adapter_path)
    tabllm_model = tabllm_model.merge_and_unload()
    print("✅ Fine-tuned TabLLM loaded and merged successfully.")

    # Text-generation pipeline used exclusively for TabLLM predictions.
    tabllm_pipeline = pipeline(
        "text-generation",
        model=tabllm_model,
        tokenizer=tabllm_tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    print("✅ TabLLM prediction pipeline is ready.")

except Exception as e:
    print(f"❌ ERROR: Could not load the TabLLM. Details: {e}")


# --- 3. Confirm RAG and Main Conversational LLM are Ready ---
# Both names are expected to survive from an earlier cell; this only checks
# that they are still defined in the notebook namespace.
if all(required in globals() for required in ("retrieve_context", "local_llm_pipeline")):
    print("\n✅ RAG and Main Conversational LLM pipelines are ready.")
else:
    print("❌ ERROR: Main LLM or RAG pipeline not found. Please re-run the cells where they are created.")

Device set to use cuda:0


Loading all trained artifacts for the final pipeline...
✅ Query Classifier loaded successfully.

Loading the fine-tuned Multitask-TabLLM...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


✅ Fine-tuned TabLLM loaded and merged successfully.
✅ TabLLM prediction pipeline is ready.

✅ RAG and Main Conversational LLM pipelines are ready.


In [5]:
import re
import gc
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, BitsAndBytesConfig, pipeline
import torch
import joblib
from sentence_transformers import SentenceTransformer
import faiss

print("--- Initializing Full DecentraBot Pipeline ---")

# --- 1. Load All Trained and Pre-Trained Components ---
# (Everything is rebuilt here so this cell does not depend on earlier ones.)

# A) RAG Pipeline Components
print("\n1. Initializing RAG pipeline...")

# The in-memory knowledge base: each entry is one retrievable passage.
# Long passages are wrapped with implicit string concatenation for readability.
knowledge_base_texts = [
    (
        "NFT prices exhibit high volatility and are significantly influenced by "
        "market sentiment and hype cycles. Important metrics include floor price, "
        "trading volume, and the number of unique asset holders."
    ),
    (
        "Investing in Non-Fungible Tokens (NFTs) carries substantial risk due to "
        "market fluctuations and potential illiquidity. Always conduct thorough "
        "research (DYOR) before any purchase and adhere to responsible investment "
        "principles."
    ),
    (
        "The MANA token is the primary currency for Decentraland. "
        "Its price can affect the USD value of land sales."
    ),
    (
        "'Floor price' denotes the minimum listed price for any NFT within a "
        "given collection."
    ),
]

# Embed every passage once and store the vectors in a flat L2 FAISS index.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(knowledge_base_texts)
index = faiss.IndexFlatL2(embeddings.shape[-1])  # last axis = embedding width
index.add(embeddings)
def retrieve_context(query, k=2):
    """Return up to ``k`` knowledge-base passages closest to ``query``.

    Args:
        query: The user's question as a string; it is embedded with
            ``embedding_model`` before searching.
        k: Maximum number of passages to return (default 2).

    Returns:
        A list of strings taken from ``knowledge_base_texts``, in the order the
        index returned them. The list is shorter than ``k`` when the knowledge
        base holds fewer than ``k`` passages, and empty when ``k <= 0``.
    """
    # Never ask for more neighbours than there are passages: FAISS pads the
    # result with -1 in that case, and -1 would silently index the LAST passage.
    k = min(k, len(knowledge_base_texts))
    if k <= 0:
        return []
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(query_embedding, k)
    # Defensive filter: drop any remaining -1 padding ids.
    return [knowledge_base_texts[i] for i in indices[0] if i >= 0]
print("   -> RAG pipeline is ready.")

# Names of components whose loading failed. The final banner is derived from
# this list so it can no longer claim success after a swallowed exception.
failed_components = []

# This cell reloads both Mistral copies. Drop any copies left over from earlier
# cells (or a previous run of this cell) BEFORE loading, otherwise the old and
# new weights are resident on the GPU at the same time.
# Trade-off: if a reload below fails, the stale object is gone rather than
# silently reused; the failure is reported in the final banner.
for _stale_name in ("local_llm_pipeline", "model", "tabllm_pipeline", "tabllm_model", "base_model_for_tabllm"):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# B) Main Conversational LLM (Base Mistral)
# A failure here is NOT caught: nothing downstream can work without it.
print("\n2. Loading main conversational LLM (Mistral-7B)...")
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
local_llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16, device_map="auto")
print("   -> Main conversational LLM is ready.")

# C) Fine-Tuned Query Classifier (DistilBERT)
print("\n3. Loading fine-tuned Query Classifier...")
try:
    classifier_model_path = os.path.join(ARTIFACTS_PATH, 'query_classifier_model')
    classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_path)
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_path)
    query_classifier = pipeline("text-classification", model=classifier_model, tokenizer=classifier_tokenizer)
    print("   -> Query Classifier loaded successfully.")
except Exception as e:
    # Best-effort: keep going, but remember the failure for the final banner.
    failed_components.append("Query Classifier")
    print(f"   -> ERROR loading Query Classifier: {e}")

# D) Fine-Tuned Multitask-TabLLM
print("\n4. Loading fine-tuned Multitask-TabLLM...")
try:
    # Separate tokenizer instance for the TabLLM pipeline.
    tabllm_tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tabllm_tokenizer.pad_token is None:
        tabllm_tokenizer.pad_token = tabllm_tokenizer.eos_token
    # Second copy of the base model; the saved adapter is applied and merged into it.
    base_model_for_tabllm = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
    adapter_path = os.path.join(ARTIFACTS_PATH, "multitask_tabllm_adapter")
    tabllm_model = PeftModel.from_pretrained(base_model_for_tabllm, adapter_path)
    tabllm_model = tabllm_model.merge_and_unload()
    tabllm_pipeline = pipeline("text-generation", model=tabllm_model, tokenizer=tabllm_tokenizer, torch_dtype=torch.float16, device_map="auto")
    print("   -> Fine-tuned TabLLM prediction pipeline is ready.")
except Exception as e:
    # Best-effort: keep going, but remember the failure for the final banner.
    failed_components.append("Multitask-TabLLM")
    print(f"   -> ERROR loading TabLLM: {e}")

# Report the real outcome instead of an unconditional success message.
if failed_components:
    print(f"\n--- ⚠️ Initialization incomplete. Failed components: {', '.join(failed_components)} ---")
else:
    print("\n--- ✅ All Components Initialized ---")


# --- 2. Define Professional Prompt Templates ---
# Both templates are filled with str.format(); literal braces in the JSON schema
# are therefore doubled ({{ }}) so they survive formatting.

# WNP diagnosis prompt: asks the LLM to pick a response strategy for the query
# and to answer with a single JSON object. Placeholder: {user_query}.
wnp_diagnosis_prompt = """
[INST]
As a System Architect specializing in Human-Computer Interaction, your task is to analyze the user's query and determine the optimal `Response Strategy`. You must return only a single, valid JSON object containing your analysis.
**User Query:** "{user_query}"
**JSON Schema for your output:** {{ "reasoning": "A brief justification for your chosen strategy.", "strategy": "Choose one: 'Reformulate_Full', 'Suggest_Refinement', or 'Direct_Answer'." }}
**Strategy Definitions:**
- "Reformulate_Full": For vague, low-skill queries.
- "Suggest_Refinement": For good queries missing one key detail.
- "Direct_Answer": For specific, well-formed queries.
Analyze the query and provide your response in the specified JSON format.
[/INST]
"""

# Final generation prompt. Placeholders: {user_query}, {intent}, {rag_context},
# {predicted_price_context}, {strategy_instructions}.
generation_prompt_template = """
[INST]
You are DecentraBot, a world-class AI investment analyst for the NFT and metaverse markets. Your response must be objective, data-driven, and adhere to all directives.
**CORE DIRECTIVES:**
1.  **Never give direct financial advice.** Frame insights as informational and educational.
2.  **Always use a Chain of Thought:** First, explain your step-by-step reasoning based on the provided context. Then, provide a final, conclusive answer.
3.  **Strictly adhere to the specific `STRATEGY_INSTRUCTIONS`** provided below.
**CONTEXTUAL DATA:**
- User's Query: {user_query}
- Your Analysis: The user's intent is '{intent}'.
- Knowledge Base: {rag_context}
- Predictive Insights: {predicted_price_context}
**STRATEGY_INSTRUCTIONS:**
{strategy_instructions}
Begin your response.
[/INST]
"""


# --- 3. Define the Final Pipeline Functions ---

# --- ❗️ KEY FIX 1: More robust TabLLM prediction function ---
# A number as an LLM is likely to print it: optional sign, then either a
# thousands-separated integer part ("1,234"), a plain integer part, or a bare
# fraction (".5"), with optional decimals. The prompt below itself shows prices
# with "," separators, so replies in that style must be parsed in full.
_PRICE_NUMBER = r"[-+]?(?:\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?|\.\d+)"


def _parse_price(text):
    """Extract a price from model output; return a float, or None if no number is found."""
    # Preferred: the number that follows the requested "Final Prediction:" label.
    labelled = re.search(r"Final Prediction:.*?(" + _PRICE_NUMBER + r")", text)
    if labelled:
        number = labelled.group(1)
    else:
        # Fallback: the first number anywhere in the text.
        first_number = re.search(_PRICE_NUMBER, text)
        if first_number is None:
            return None
        number = first_number.group(0)
    # Thousands separators are not accepted by float().
    return float(number.replace(",", ""))


def get_tabllm_prediction(features_dict):
    """Ask the fine-tuned TabLLM for a `usd_price` prediction.

    Args:
        features_dict: Mapping with the keys `num_sales`, `price`, `btc_price`
            and `eth_price`; the last three are formatted as floats.

    Returns:
        The predicted price as a float, or None when the model output contains
        no number. A reply such as "Final Prediction: 1,234.56" yields 1234.56.

    Raises:
        KeyError: If `features_dict` lacks one of the keys used by the prompt.
    """
    # This prompt guides the model to a structured, easily parsed output.
    prompt_template = """
    As a data analyst, your task is to perform a regression prediction.
    Analyze the following data: `num_sales` is {num_sales}, `price` is {price:.4f}, `btc_price` is {btc_price:,.2f}, and `eth_price` is {eth_price:,.2f}.
    Your goal is to predict the `usd_price`.

    Think step-by-step:
    1. Review the input features.
    2. Estimate the `usd_price` based on these features.
    3. Provide your final answer in the format: "Final Prediction: [NUMBER]".

    Do not provide any other explanation.
    """
    prompt = f"[INST] {prompt_template.format(**features_dict)} [/INST]"
    outputs = tabllm_pipeline(prompt, max_new_tokens=50, do_sample=False, pad_token_id=tabllm_tokenizer.eos_token_id)
    # Keep only the text after the closing [/INST], i.e. drop the echoed prompt.
    prediction_str = outputs[0]['generated_text'].split('[/INST]')[-1].strip()
    return _parse_price(prediction_str)

def get_decentrabot_response(user_query):
    """Run the full DecentraBot pipeline for one user query.

    Steps: classify the intent, retrieve knowledge-base context, optionally get
    a TabLLM price prediction (for 'Prediction' / 'Valuation' intents), ask the
    main LLM for a response strategy (WNP diagnosis), then generate the final
    answer with a prompt built from all of the above.

    Args:
        user_query: The user's question as a string.

    Returns:
        The generated answer: the text after the last '[/INST]' in the model
        output, stripped of surrounding whitespace.
    """
    # Local import so the function does not rely on `json` having been imported
    # by some earlier cell.
    import json

    print(f"\n{'='*20} Processing New Query: '{user_query}' {'='*20}")
    intent = query_classifier(user_query)[0]['label']
    print(f"1. Detected Intent: {intent}")
    rag_context = retrieve_context(user_query)
    print("2. Retrieved RAG Context.")
    predicted_price_context = "Prediction is not applicable for this query."
    if intent in ['Prediction', 'Valuation']:
        # NOTE: fixed demo feature values, not derived from the query.
        sample_features = {'num_sales': 5, 'price': 0.5, 'btc_price': 60000.00, 'eth_price': 3000.00}
        print("3. Getting prediction from fine-tuned TabLLM...")
        predicted_price = get_tabllm_prediction(sample_features)
        if predicted_price is not None:
            predicted_price_context = f"Based on a similar asset profile, the fine-tuned TabLLM predicts a price of approximately ${predicted_price:.2f}."
            print(f"   -> TabLLM Prediction: ${predicted_price:.2f}")
        else:
            predicted_price_context = "The TabLLM could not generate a valid price prediction."
            print("   -> TabLLM prediction failed.")

    print("\n--- WNP Phase 1 & 2: Running Query Diagnosis ---")
    prompt1 = wnp_diagnosis_prompt.format(user_query=user_query)
    outputs1 = local_llm_pipeline(prompt1, max_new_tokens=256, do_sample=False, pad_token_id=tokenizer.eos_token_id)
    try:
        # Drop the echoed prompt, then take the outermost {...} span as JSON.
        json_str = outputs1[0]['generated_text'].split('[/INST]')[-1].strip()
        match = re.search(r'\{.*\}', json_str, re.DOTALL)
        # match is None when no braces were produced -> AttributeError below.
        diagnosis = json.loads(match.group(0))
        strategy = diagnosis.get('strategy', 'Direct_Answer')
        print(f"WNP Diagnosis Complete. Selected Strategy: '{strategy}'")
    except (json.JSONDecodeError, AttributeError, IndexError):
        strategy = 'Direct_Answer'
        print("Warning: Could not parse JSON. Defaulting to 'Direct_Answer'.")

    # Explicit per-strategy instructions; any unrecognised strategy value is
    # treated as 'Direct_Answer'.
    strategy_instructions = ""
    if strategy == 'Reformulate_Full':
        strategy_instructions = "**First,** provide a brief, helpful answer to the user's question based on the general context. **Then,** explain why their query is too broad and use WNP by suggesting a fully reformulated question. Frame it as 'A better question might be:'."
    elif strategy == 'Suggest_Refinement':
        strategy_instructions = "**First,** provide a solid, direct answer based on the context. **Then,** use WNP by suggesting a specific detail the user could add. Frame it as 'To get more specific, you could also ask about...'."
    else: # Direct_Answer
        strategy_instructions = "Provide a comprehensive, direct, and data-driven answer based on the context. Conclude by using WNP to suggest a logical follow-up question."

    final_prompt = generation_prompt_template.format(
        user_query=user_query, intent=intent,
        # One bullet per retrieved passage, separated by REAL newlines
        # (the previous '\\n' put a literal backslash-n into the prompt).
        rag_context='\n'.join([' - ' + doc for doc in rag_context]),
        predicted_price_context=predicted_price_context,
        strategy_instructions=strategy_instructions
    )
    print("\n--- WNP Phase 3: Constructed Final Strategic Prompt ---")

    print("--- WNP Phase 4: Generating Guided Response... ---")
    outputs2 = local_llm_pipeline(final_prompt, max_new_tokens=1024, do_sample=True, temperature=0.7, top_p=0.95, pad_token_id=tokenizer.eos_token_id)
    final_response = outputs2[0]['generated_text'].split('[/INST]')[-1].strip()
    return final_response

# --- 4. Run Demonstration ---
# One end-to-end run of the pipeline on a deliberately vague query.
query1 = "Is it a good time to invest?"
print("\n--- Starting Final DecentraBot Demonstration ---")
response1 = get_decentrabot_response(query1)
# Header and answer are emitted on separate lines, as two prints would do.
print("\n--- FINAL RESPONSE 1 ---", response1, sep="\n")

--- Initializing Full DecentraBot Pipeline ---

1. Initializing RAG pipeline...
   -> RAG pipeline is ready.

2. Loading main conversational LLM (Mistral-7B)...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
Device set to use cuda:0


   -> Main conversational LLM is ready.

3. Loading fine-tuned Query Classifier...
   -> Query Classifier loaded successfully.

4. Loading fine-tuned Multitask-TabLLM...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


   -> Fine-tuned TabLLM prediction pipeline is ready.

--- ✅ All Components Initialized ---

--- Starting Final DecentraBot Demonstration ---

1. Detected Intent: Valuation
2. Retrieved RAG Context.
3. Getting prediction from fine-tuned TabLLM...
   -> TabLLM Prediction: $2.75

--- WNP Phase 1 & 2: Running Query Diagnosis ---
WNP Diagnosis Complete. Selected Strategy: 'Reformulate_Full'

--- WNP Phase 3: Constructed Final Strategic Prompt ---
--- WNP Phase 4: Generating Guided Response... ---

--- FINAL RESPONSE 1 ---
Investing in Non-Fungible Tokens (NFTs) can be an exciting opportunity, but it's important to note that the market carries significant risk due to price volatility and potential illiquidity. The value of NFTs is influenced by various factors such as market sentiment, hype cycles, floor price, trading volume, and the number of unique asset holders.

However, your question, "Is it a good time to invest?" is quite broad. A better question might be: "Based on the current mark