<a href="https://colab.research.google.com/github/bhuron/bhuron.github.io/blob/main/Scripts/JanitorAI/janitor_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title <b> Steps on how to scrape Janitor AI cards. </b>
#@markdown 1. Run this cell by clicking the run button on its left side. <img src="https://i.ibb.co/zWwJzqQK/image.png" width=30> <br>
#@markdown 2. Wait for around 30 seconds and scroll all the way to the bottom until you find a link like this. <br>
#@markdown <img src="https://i.ibb.co/zWCjmq48/image.png"> <br>
#@markdown 3. Copy the link that ends with `trycloudflare.com`. <br>
#@markdown 4. Open [Janitor AI](https://janitorai.com/) in another tab, select the proxy-enabled character you want, and start a chat with it.<br>
#@markdown 5. Select "proxy" among the options. <br>
#@markdown 6. Select "custom" as the model and keep the other settings as shown here. <br>
#@markdown <img src="https://raw.githubusercontent.com/ashuotaku/sillytavern/main/Images/janitorAI/janitor-proxy.png"> <br>
#@markdown 7. In the URL box, paste the `trycloudflare` URL that you copied earlier and append `/v1/chat/completions` to the end of it. <br>
#@markdown 8. Enter `mock-model-1` as the model name and `custom-key` in the API key field. <br>
#@markdown 9. Scroll down and click on save settings. <br>
#@markdown 10. Refresh the chat page. <br>
#@markdown 11. Now send any message like "hi". <br>
#@markdown 12. After sending the message, click on the file icon on the left side of the screen. <br>
#@markdown <img src="https://i.ibb.co/605NPT0s/image.png" alt="image" /> <br>
#@markdown 13. Then click on the three dots beside the file named like `your-char_chara_card.json`. <br>
#@markdown <img src="https://i.ibb.co/5WLwtY5N/image.png"> <br>
#@markdown 14. Now, click on the `Download` button. <br>
#@markdown <img src="https://i.ibb.co/1JpMyKrK/image.png"> <br>
#@markdown 15. Now you can import that JSON file into SillyTavern using the character card import button, and enjoy. <br>
#@markdown 16. After doing all this, don't forget to shut down your Google Colab runtime by clicking on `Runtime` in the menu above. <br>
#@markdown <img src="https://i.ibb.co/274Rsp7S/image.png"> <br>
#@markdown 17. Then, click on `Disconnect and delete runtime`. <br>
#@markdown <img src="https://i.ibb.co/MD0jvcpr/image.png">

!pip install flask flask_cors flask_cloudflared
import os
import re
import json
from datetime import datetime, timezone
from flask import Flask, request, jsonify, abort
from flask_cors import CORS
from flask_cloudflared import run_with_cloudflared

# Directory where generated card files are written: the folder containing this
# file, or "/content" when `__file__` is undefined (NameError), as happens when
# the code runs in a notebook cell rather than as a script.
# NOTE(review): "/content" is presumably Colab's working directory — confirm.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = "/content"

# Port handed to app.run() in the __main__ guard at the bottom of the file.
PORT = 5000
# NOTE(review): this value is not referenced by any code visible in this file;
# no route checks the key the client sends — confirm whether that is intended.
API_KEY = "custom-key"

app = Flask(__name__)
# NOTE(review): presumably hooks app.run() so a Cloudflare tunnel is opened
# alongside the local server — confirm against the flask_cloudflared docs.
run_with_cloudflared(app)
# Enable CORS on the app with flask_cors defaults.
CORS(app)

def extract_all_characters(messages):
    """
    Collect candidate character names from the system-role messages.

    For every system message, any <system>…</system> block is removed first,
    then each <TagName>…</TagName> block is inspected. A tag name is treated
    as a character when its inner content contains "Name:", "Age:",
    "Personality:" or "Character Details", or is longer than 200 characters.
    Name("Character Name") patterns are picked up as well.

    Entries that are not dicts, and messages whose content is missing, None
    or not a string, are skipped instead of raising.

    Returns a list of candidate character names in order of appearance
    (duplicates are kept).
    """
    characters = []
    # [\s\S] already spans newlines, so no re.DOTALL flag is needed.
    system_block_pattern = re.compile(r"<system>[\s\S]*?<\/system>")
    tag_pattern = re.compile(r"<([^>]+)>([\s\S]*?)<\/\1>")
    name_quotes_pattern = re.compile(r'Name\s*\(\s*"([^"]+)"\s*\)')
    # Structural tags that never describe a character.
    ignored_tags = {
        "system", "scenario", "example_dialogs", "roleplay_guidelines", "/"
    }
    profile_markers = ("Name:", "Age:", "Personality:", "Character Details")

    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "system":
            continue
        content = msg.get("content")
        # "content" may be present but None (or a non-string payload);
        # the regex calls below only accept str.
        if not isinstance(content, str) or not content:
            continue

        # Remove nested <system>…</system> so tags inside don't get matched
        content_without_system = system_block_pattern.sub("", content)

        for match in tag_pattern.finditer(content_without_system):
            tag_name = match.group(1).strip()
            if tag_name.lower() in ignored_tags:
                continue
            tag_contents = match.group(2)
            looks_like_profile = any(
                marker in tag_contents for marker in profile_markers
            )
            if looks_like_profile or len(tag_contents) > 200:
                characters.append(tag_name)

        characters.extend(
            nm.strip()
            for nm in name_quotes_pattern.findall(content_without_system)
        )

    return characters


def normalize_name(s):
    """
    Build a comparison key for a name: drop every character that is neither
    a word character nor whitespace (this removes punctuation and emojis),
    squeeze whitespace runs into single spaces, trim, and lowercase.
    """
    without_symbols = re.sub(r"[^\w\s]", "", s)
    single_spaced = re.sub(r"\s+", " ", without_symbols)
    trimmed = single_spaced.strip()
    return trimmed.lower()


def detect_user_character(messages, all_characters):
    """
    Walk backward through the messages looking for a user-role message that
    starts with a "Name:" prefix.

    The prefix is normalized with normalize_name() and compared against each
    entry of all_characters (also normalized); the first character whose
    normalized name equals, contains, or is contained in the prefix is
    returned. If no character matches, the raw (stripped) prefix is returned.

    Messages that are not dicts, whose content is empty or not a string, or
    whose prefix normalizes to an empty string (e.g. "***:") are skipped:
    an empty key would be a substring of every name and so match any
    character. Characters whose normalized name is empty are ignored for the
    same reason.

    Returns None when no usable "X:" prefix is found.
    """
    for msg in reversed(messages):
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if not isinstance(content, str) or not content:
            continue
        prefix_match = re.match(r"^([^:]+):", content)
        if not prefix_match:
            continue

        potential = prefix_match.group(1).strip()
        normalized_pot = normalize_name(potential)
        if not normalized_pot:
            # Prefix carried no word characters, so it cannot be a name.
            continue

        for char in all_characters:
            normalized_char = normalize_name(char)
            if not normalized_char:
                continue
            # Substring containment in either direction also covers equality.
            if (
                normalized_pot in normalized_char
                or normalized_char in normalized_pot
            ):
                return char
        return potential
    return None

def determine_ai_character(all_characters, user_character):
    """
    Choose which candidate is the AI-played character.

    - No candidates: "unknown".
    - Exactly one candidate: that candidate.
    - A known user_character: the first candidate whose normalized name
      neither equals, contains, nor is contained in the normalized user name.
    - Otherwise: the first candidate longer than 2 characters, falling back
      to the very first candidate.
    """
    if not all_characters:
        return "unknown"

    if len(all_characters) == 1:
        return all_characters[0]

    if user_character:
        user_key = normalize_name(user_character)
        for candidate in all_characters:
            candidate_key = normalize_name(candidate)
            overlaps_user = (
                user_key == candidate_key
                or candidate_key in user_key
                or user_key in candidate_key
            )
            if not overlaps_user:
                return candidate

    long_enough = (name for name in all_characters if len(name) > 2)
    return next(long_enough, all_characters[0])

def extract_tag_content(messages, tag_name):
    """
    Finds the first <tag_name>…</tag_name> in any system message and
    returns its inner text (with literal "\\n" → actual newline, stripped),
    or "" if not found.

    Whitespace around the name inside the angle brackets is tolerated
    ("< Bob >…</ Bob >"), because extract_all_characters() strips tag names
    before handing them to this function. Entries that are not dicts and
    messages whose content is None or not a string are skipped; an empty or
    non-string tag_name yields "".
    """
    if not isinstance(tag_name, str) or not tag_name:
        return ""

    escaped = re.escape(tag_name)
    # [\s\S] already spans newlines, so no re.DOTALL flag is needed.
    pattern = re.compile(
        rf"<\s*{escaped}\s*>([\s\S]*?)<\/\s*{escaped}\s*>"
    )
    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "system":
            continue
        raw = msg.get("content")
        if not isinstance(raw, str):
            continue
        match = pattern.search(raw)
        if match:
            # The payload may carry escaped newlines as the two characters
            # backslash + "n"; turn them into real line breaks.
            return match.group(1).replace(r"\n", "\n").strip()
    return ""

def anonymize_for_card(text, char_name, user_name):
    """
    Replace every occurrence of char_name (case-insensitive) with {{char}}
    and every occurrence of user_name with {{user}}.

    Both names are matched literally (re.escape) in a single pass, so a user
    name such as "Char" can never rewrite the inside of a "{{char}}"
    placeholder that was just inserted. When both names match at the same
    position, char_name wins. An empty or None name is ignored rather than
    matching at every position.

    Returns "" for empty/None text, otherwise the rewritten text.
    """
    if not text:
        return ""

    # Named groups let the replacement callback know which name matched.
    alternatives = []
    if char_name:
        alternatives.append(f"(?P<char>{re.escape(char_name)})")
    if user_name:
        alternatives.append(f"(?P<user>{re.escape(user_name)})")
    if not alternatives:
        return text

    pattern = re.compile("|".join(alternatives), re.IGNORECASE)
    return pattern.sub(lambda m: "{{" + m.lastgroup + "}}", text)

def sanitize_filename(name):
    """
    Turn a character name into a filename stem: drop everything that is not
    a word character or whitespace, trim and lowercase, turn whitespace runs
    into underscores, then squeeze repeated underscores into one.
    """
    cleaned = re.sub(r"[^\w\s]", "", name).strip().lower()
    underscored = re.sub(r"\s+", "_", cleaned)
    return re.sub(r"_+", "_", underscored)

def generate_character_card(messages, character_name, user_name):
    """
    Build a character card dict from the request messages and write it as
    JSON to BASE_DIR/<sanitized character name>_chara_card.json.

    The description comes from the <character_name> tag, the scenario from
    <scenario>, the example messages from <example_dialogs>, and the first
    message from the first non-empty assistant message. Every text field is
    passed through anonymize_for_card().

    Returns a (filename, card) tuple.
    """

    def anonymized_tag(tag):
        # Pull one tag's text out of the system messages and anonymize it.
        return anonymize_for_card(
            extract_tag_content(messages, tag), character_name, user_name
        )

    description = anonymized_tag(character_name)
    scenario = anonymized_tag("scenario")
    mes_example = anonymized_tag("example_dialogs")

    # Drop the leading "Example conversations between {{char}} and {{user}}:"
    # line when present.
    prefix = "Example conversations between {{char}} and {{user}}:"
    if mes_example.strip().startswith(prefix):
        remaining_lines = mes_example.splitlines()[1:]
        mes_example = "\n".join(remaining_lines).lstrip("\n ")

    # First assistant message with non-empty content, if any.
    first_reply = next(
        (
            entry["content"]
            for entry in messages
            if entry.get("role") == "assistant" and entry.get("content")
        ),
        None,
    )
    raw_first_mes = "" if first_reply is None else first_reply.replace(r"\n", "\n")
    first_mes = anonymize_for_card(raw_first_mes, character_name, user_name)

    create_date = datetime.now(timezone.utc).isoformat()

    # These fields appear both at the top level and inside "data"; they come
    # first in both places so the serialized key order is unchanged.
    shared_fields = {
        "name": character_name,
        "description": description,
        "personality": "",
        "scenario": scenario,
        "first_mes": first_mes,
        "mes_example": mes_example,
    }

    card = {
        **shared_fields,
        "creatorcomment": (
            "Exported using JanitorAI scrapper created by ashuotaku, "
            "support my work by buying me a coffee: https://github.com/ashuotaku/"
        ),
        "avatar": "none",
        "talkativeness": "0.5",
        "fav": False,
        "tags": ["JanitorAI", "ashuotaku"],
        "spec": "chara_card_v3",
        "spec_version": "3.0",
        "data": {
            **shared_fields,
            "creator_notes": (
                "Exported using JanitorAI scrapper created by ashuotaku, "
                "support my work by buying me a [coffee](https://buymeacoffee.com/ashuotaku) and visit my github: [https://github.com/ashuotaku/](https://github.com/ashuotaku/)."
            ),
            "system_prompt": "",
            "post_history_instructions": "",
            "tags": ["JanitorAI", "ashuotaku"],
            "creator": "ashuotaku",
            "character_version": "",
            "alternate_greetings": [],
            "extensions": {
                "talkativeness": "0.5",
                "fav": False,
                "world": "",
                "depth_prompt": {
                    "prompt": "",
                    "depth": 4,
                    "role": "system",
                },
            },
            "group_only_greetings": [],
        },
        "create_date": create_date,
    }

    # Write the card into BASE_DIR; fall back to "unknown" when the
    # sanitized name is empty.
    file_stem = sanitize_filename(character_name) or "unknown"
    filename = os.path.join(BASE_DIR, f"{file_stem}_chara_card.json")
    with open(filename, "w", encoding="utf-8") as out:
        json.dump(card, out, indent=4, ensure_ascii=False)

    return filename, card

@app.errorhandler(400)
@app.errorhandler(401)
@app.errorhandler(500)
def handle_error(e):
    """
    Render 400/401/500 errors as a JSON body.

    When the error's description is itself a JSON document (as produced by
    the abort() call in chat_completions), that document is the response
    body. Otherwise the description is wrapped in an
    {"error": {"message": ..., "type": "server_error"}} envelope.
    The status is the error's `code` attribute, or 500 if it has none.
    """
    try:
        body = json.loads(e.description)
    except Exception:
        # Deliberately broad: a missing, non-string or non-JSON description
        # all fall back to the generic envelope.
        fallback_message = getattr(e, "description", "An error occurred")
        body = {
            "error": {
                "message": fallback_message,
                "type": "server_error"
            }
        }
    status = getattr(e, "code", 500)
    return jsonify(body), status

@app.route("/v1/models", methods=["GET"])
def list_models():
    """
    Return a model list containing the single mock model "mock-model-1".

    `created` is a Unix timestamp in whole seconds, matching the `created`
    field emitted by chat_completions() (it was previously multiplied by
    1000, i.e. reported in milliseconds).
    """
    response = {
        "object": "list",
        "data": [
            {
                "id": "mock-model-1",
                "object": "model",
                "created": int(datetime.now(timezone.utc).timestamp()),
                "owned_by": "ashuotaku"
            }
        ]
    }
    return jsonify(response)


@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """
    Mock chat-completions endpoint that turns the incoming prompt into a
    character card.

    The JSON body must be an object with a "messages" list; anything else
    (including a JSON body that is not an object, e.g. a bare list) is
    rejected with a 400 "invalid_request_error". The messages are scanned
    for character names, a card file is written by
    generate_character_card(), and a fixed mock completion is returned.
    """
    body = request.get_json(force=True)
    # A non-object body has no .get(); treat it like a missing messages
    # array so the client receives the 400 below instead of a 500.
    messages = body.get("messages") if isinstance(body, dict) else None
    print(messages)
    if not isinstance(messages, list):
        abort(400, description=json.dumps({
            "error": {
                "message": "Messages array is required",
                "type": "invalid_request_error"
            }
        }))

    # 1) Extract all characters
    all_characters = extract_all_characters(messages)

    # 2) Detect user character (if any)
    user_character = detect_user_character(messages, all_characters)

    # 3) Determine AI character
    ai_character = determine_ai_character(all_characters, user_character)

    # 4) Build a mock response
    mock_response = {
        "id": "mock-id",
        "object": "chat.completion",
        "created": int(datetime.now(timezone.utc).timestamp()),
        "model": "mock-model-1",
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": "This is a mock response from the custom OpenAI-compatible server."
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0
        }
    }

    # 5) Write the character card file, then answer the client
    generate_character_card(messages, ai_character, user_character)
    return jsonify(mock_response)

# Start the Flask server on all interfaces at PORT when this code runs as the
# main module (not when it is imported).
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=PORT)