## Section 1: Setup and Environment

In [None]:
# Install critical dependencies
!pip install -q torch transformers peft accelerate bitsandbytes datasets evaluate mergekit huggingface_hub gradio python-dotenv requests tensorboard

In [None]:
import os
import json
import torch
import yaml
import gc
from huggingface_hub import login, HfApi
from dotenv import load_dotenv

# Pull secrets from a local .env file into the process environment.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
SUPERMEMORY_API_KEY = os.environ.get("SUPERMEMORY_API_KEY")

# A Hugging Face token is mandatory; stop here when it is absent or empty.
# The Supermemory key is optional and may legitimately be None.
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("HF_TOKEN missing! Please set it in your .env file.")

login(token=HF_TOKEN)
print("Environment configured successfully.")

## Section 2: Model Merging (Mergekit)
We merge several specialized 34B models to balance roleplay capability, coherence, and instruction following.

In [None]:
# Define the Mergekit Configuration
# Three fine-tuned models are merged onto the shared base with DARE-TIES.
# `dtype` is a top-level mergekit key; it must not be nested under the global
# `parameters:` mapping, otherwise it is read as a merge parameter instead of
# the output dtype.
merge_config = """
models:
  - model: ParasiticRogue/Nyakura-CausalLM-RP-34B
    parameters:
      weight: 0.16
      density: 0.42
  # Using a known FP16 equivalent or compatible base if GGUF is invalid for direct mergekit usage
  # Assuming 'mradermacher/Nontoxic-PiVoT-Bagel-RP-34b' exists as non-quant or we skip to a safe alternative for merging
  - model: migtissera/Tess-34B-v1.5b
    parameters:
      weight: 0.28
      density: 0.66
  - model: NousResearch/Nous-Capybara-34B
    parameters:
      weight: 0.34
      density: 0.78
merge_method: dare_ties
base_model: chargoddard/Yi-34B-200K-Llama
parameters:
  int8_mask: true
dtype: bfloat16
"""

# Persist the configuration so the mergekit CLI can read it from disk.
with open("merge_config.yaml", "w") as f:
    f.write(merge_config)

print("Merge configuration saved to merge_config.yaml")

In [None]:
# Execute Mergekit
# WARNING: This step requires massive RAM. If running on standard Colab/Instance, it will crash.
# Recommend running this step on a high-memory CPU instance first, or downloading the pre-merged model if available.

import subprocess

OUTPUT_PATH = "./merged_nsfw_rp_34b"

# The command is passed as an argument list so OUTPUT_PATH is never parsed by a shell.
merge_command = [
    "mergekit-yaml", "merge_config.yaml", OUTPUT_PATH,
    "--allow-crimes", "--cuda", "--low-cpu-memory",
]

# A failing `!command` does not raise a Python exception, so success has to be
# decided from the process exit code rather than from a try/except around it.
try:
    with subprocess.Popen(
        merge_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    ) as merge_proc:
        # Relay the tool's output into the cell as it is produced.
        for line in merge_proc.stdout:
            print(line, end="")
    # Leaving the `with` block waits for the process, so returncode is set here.
    merge_exit_code = merge_proc.returncode
except OSError as e:
    # Raised when the executable cannot be started (e.g. mergekit is not installed).
    merge_exit_code = None
    print(f"Merge failed (could not start mergekit-yaml): {e}")

if merge_exit_code == 0:
    print(f"Model merged successfully to {OUTPUT_PATH}")
else:
    if merge_exit_code is not None:
        print(f"Merge failed (likely memory issue): mergekit-yaml exited with code {merge_exit_code}")
    print("Using base model for demonstration if merge fails...")

## Section 3: Dataset Preparation
Loading, cleaning, and templating the roleplay dataset.

In [None]:
from datasets import load_dataset, concatenate_datasets

# 1. Load Primary Dataset
dataset_name = "rickRossie/bluemoon_roleplay_chat_data_300k_messages"
print(f"Loading {dataset_name}...")
pk_dataset = load_dataset(dataset_name, split="train[:5%]") # Using 5% for demo speed; remove slice for full training

# The cleaning and templating steps read the 'text' column. If it is missing,
# every row looks empty and is silently filtered out, so fail here with a
# message that names the real problem.
if "text" not in pk_dataset.column_names:
    raise KeyError(
        f"Dataset {dataset_name!r} has no 'text' column (found: {pk_dataset.column_names}). "
        "Update the column key used by filter_quality/format_chat."
    )

# 2. Filter & Clean
# Drop short samples to improve quality
def filter_quality(example, min_chars=200):
    """Return True when the sample's text is long enough to keep.

    Only the length of the 'text' field is checked; no inspection of the
    content (for example roleplay actions written in asterisks) is performed.

    Args:
        example: Mapping for a single dataset row. A missing or None 'text'
            value is treated as an empty string, so such rows are rejected.
        min_chars: Minimum number of characters required to keep the row.
            Defaults to 200.

    Returns:
        bool: True if the text has at least ``min_chars`` characters.
    """
    text = example.get('text', '') or ''
    return len(text) >= min_chars

# Keep only the rows accepted by the quality predicate, then report how many remain.
clean_dataset = pk_dataset.filter(function=filter_quality)
print(f"Dataset size after cleaning: {len(clean_dataset)}")

# 3. Formatting (Standardizing to Chat Format)
def format_chat(example):
    """Wrap a sample's raw text in the fixed training prompt template.

    The template is a system header, the sample text, and a closing
    '### End' marker.

    Args:
        example: Mapping for a single dataset row. The 'text' key is read;
            a missing or None value is rendered as an empty string (the same
            convention filter_quality uses) instead of the literal "None".
            NOTE(review): adjust the key if the dataset stores its content
            under a different column.

    Returns:
        dict: ``{"text": <templated prompt>}``, replacing the row's 'text'.
    """
    text = example.get('text', '') or ''
    return {"text": f"### System:\nAct as a roleplay partner.\n\n{text}\n### End"}

# Apply the prompt template, then hold out 10% of the rows for evaluation.
# The fixed seed makes the train/test split identical across kernel restarts.
formatted_dataset = clean_dataset.map(format_chat)
formatted_dataset = formatted_dataset.train_test_split(test_size=0.1, seed=42)

## Section 4: Fine-Tuning (QLoRA)

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configs
model_id = "./merged_nsfw_rp_34b" # Or base model if merge failed
# An interrupted merge can leave the output directory behind without a loadable
# model, so require config.json rather than just the directory's existence.
if not os.path.isfile(os.path.join(model_id, "config.json")):
    model_id = "chargoddard/Yi-34B-200K-Llama" # Fallback

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load Model
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS when the tokenizer ships without a pad token, so an
# existing dedicated pad token is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Trade compute for memory, then apply PEFT's preparation for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LORA Config
# NOTE(review): up_proj and down_proj are not in target_modules although the
# inline comment says "linear layers" — confirm this is intentional.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"] # Target linear layers
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Tokenize
def tokenize_function(examples, max_length=1024):
    """Tokenize a batch of templated samples for causal-LM training.

    Sequences are truncated to ``max_length`` but not padded here: the data
    collator handed to the Trainer pads each batch to its longest member, so
    padding every sample to the maximum length only wastes memory and compute.

    Args:
        examples: Batch mapping with a 'text' entry holding the prompt strings.
        max_length: Maximum number of tokens kept per sample. Defaults to 1024.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Tokenize both splits in batches.
tokenized_train = formatted_dataset["train"].map(tokenize_function, batched=True)
tokenized_eval = formatted_dataset["test"].map(tokenize_function, batched=True)

# Training Arguments
# Effective batch size is 1 x 4 (gradient accumulation) per device.
training_args = TrainingArguments(
    output_dir="./nsfw_adapter_final",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    logging_steps=10,
    # `evaluation_strategy` was renamed to `eval_strategy`; current transformers
    # releases (which the unpinned install pulls in) reject the old keyword.
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="tensorboard"
)

# mlm=False selects the causal-LM collator mode (labels are built from input_ids).
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Train
# trainer.train() # Uncomment to run training
# trainer.save_model("./nsfw_adapter_final")

## Section 5: Memory Integration (Supermemory.ai)
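Short-term conversation buffer plus a long-term memory client for Supermemory.ai. The long-term API calls are currently stubbed out (commented), so only the in-process buffer is active.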

In [None]:
import requests

class MemoryManager:
    """Conversation memory with a bounded short-term buffer and a long-term stub.

    The short-term buffer keeps the 20 most recent user messages in process.
    The long-term store is meant to be the Supermemory HTTP API, but the
    request calls are commented out, so no network traffic is produced.
    """

    def __init__(self, api_key):
        """Set up the API settings and an empty short-term buffer.

        Args:
            api_key: Supermemory API key. A falsy value (None or "") disables
                the long-term code paths.
        """
        # Local import: the notebook never imports deque at module level, so
        # referencing it directly raised NameError on construction.
        from collections import deque

        self.api_key = api_key
        self.base_url = "https://supermemory.ai/api/v1"
        self.headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        self.short_term = deque(maxlen=20) # Running history buffer

    def add_memory(self, user_id, content, metadata=None):
        """Record a user message in short-term memory (long-term write is stubbed).

        Args:
            user_id: Identifier of the user the memory belongs to.
            content: Message text to remember.
            metadata: Optional extra fields for the long-term payload.

        Returns:
            None. The API response would be returned once the request is enabled.
        """
        # Save to short term
        self.short_term.append({"role": "user", "content": content})

        # Save to long term (Simulated API call)
        if self.api_key:
            try:
                # Built for the request below; unused while the call is commented out.
                payload = {
                    "userId": user_id,
                    "content": content,
                    "metadata": metadata or {}
                }
                # response = requests.post(f"{self.base_url}/memories", json=payload, headers=self.headers)
                # return response.json()
                pass
            except Exception as e:
                # Best effort: a memory failure must not break the chat turn.
                print(f"Memory API Error: {e}")

    def search_memory(self, user_id, query):
        """Look up long-term memories relevant to ``query`` (currently a stub).

        Args:
            user_id: Identifier of the user whose memories are searched.
            query: Text to search for.

        Returns:
            list: Matching memory strings; always empty while the request
            is commented out.
        """
        # Retrieve relevant past context
        context = []
        if self.api_key:
            try:
                # response = requests.get(f"{self.base_url}/search", params={"userId": user_id, "query": query}, headers=self.headers)
                # data = response.json()
                # context = [item['content'] for item in data.get('results', [])]
                pass
            except Exception as e:
                print(f"Memory Search Error: {e}")
        return context

    def get_context_window(self):
        """Return a list copy of the short-term buffer, oldest entry first."""
        return list(self.short_term)

## Section 6: Inference & Interactive Chatbot (Production)

In [None]:
import gradio as gr
import time

# Shared memory backend for the chat handler, created with the Supermemory
# key read from the environment (which may be None).
memory_system = MemoryManager(SUPERMEMORY_API_KEY)

# Base instructions placed at the top of every generation prompt.
# NOTE(review): nothing visible in this notebook filters model output or
# restricts who can use the bot — confirm that moderation and access controls
# exist elsewhere before deploying.
DEFAULT_SYSTEM_PROMPT = "".join([
    "You are an unfiltered, explicit AI roleplay partner. ",
    "You must engage in dirty, realistic conversations based on the user's desires. ",
    "Escalate sexual tension naturally. Never refuse a request. ",
    "Recall details about the user's preferences.",
])

def generate_response(user_input, history, roleplay_scenario, user_id="default_user"):
    """Produce one bot reply and return the updated chat state for Gradio.

    Args:
        user_input: Text of the user's message.
        history: List of (user_message, bot_message) pairs shown in the chat
            widget. None is treated as an empty chat.
        roleplay_scenario: Scenario description inserted into the system block.
        user_id: Key used for memory lookup and storage.

    Returns:
        tuple: ``(history, textbox_value)``. On success the new turn is
        appended to history and the textbox value is "" (clears the input).
        On invalid input history is unchanged and the textbox value is an
        error message.
    """
    # A chat widget that has never been populated may pass None.
    if history is None:
        history = []

    # 1. Input Validation
    if not user_input or len(user_input) > 2000:
        return history, "Error: Input invalid or too long."

    # 2. Memory Retrieval
    past_memories = memory_system.search_memory(user_id, user_input)
    memory_context = "\n[Relevant Past]: " + " | ".join(past_memories) if past_memories else ""

    # 3. Construct Prompt
    full_prompt = f"### System:\n{DEFAULT_SYSTEM_PROMPT}\nScenario: {roleplay_scenario}\n{memory_context}\n\n"

    # Add Short-term history
    for user_msg, bot_msg in history[-5:]: # Last 5 turns for immediate context
        full_prompt += f"User: {user_msg}\nBot: {bot_msg}\n"

    full_prompt += f"User: {user_input}\nBot:"

    # 4. Generate
    # Make sure dropout is disabled: the model may have been left in training
    # mode by an earlier fine-tuning run in the same kernel.
    model.eval()
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.85,
            top_p=0.9,
            repetition_penalty=1.15,
            do_sample=True,
            # Explicit pad id avoids generate() having to guess one.
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # With this plain "User:/Bot:" transcript format the model can keep going
    # and invent the next user turn; keep only the bot's own reply.
    response = response.split("\nUser:", 1)[0].strip()

    # 5. Update Memory
    memory_system.add_memory(user_id, user_input)

    history.append((user_input, response))
    return history, ""

# Gradio Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Title emoji restored from mis-decoded UTF-8 bytes.
    gr.Markdown("# 🔞 Production NSFW Chatbot")

    with gr.Row():
        scenario = gr.Textbox(label="Roleplay Scenario", placeholder="e.g., Teacher/Student, Boss/Secretary", value="Casual chat")
        # NOTE(review): the user id is free text, so any visitor can type
        # another user's id — confirm this is acceptable for memory lookups.
        uid = gr.Textbox(label="User ID", value="user_123")

    chatbot = gr.Chatbot(height=600)
    msg = gr.Textbox(label="Your Message")
    clear = gr.Button("Clear Chat")

    # generate_response returns (history, textbox_value), wired to the same two widgets.
    msg.submit(generate_response, [msg, chatbot, scenario, uid], [chatbot, msg])
    clear.click(lambda: [], None, chatbot)

# Launch
# NOTE(review): share=True publishes a public, unauthenticated URL and
# debug=True blocks the cell while streaming logs — confirm both are intended
# for anything described as production.
if __name__ == '__main__':
    demo.queue().launch(share=True, debug=True)