In [1]:
#1. Connect Google Drive
from google.colab import drive
drive.mount('/content/drive')

#2. Install necessary libraries
!pip install flask transformers peft accelerate bitsandbytes flask-cors  pyngrok torch torchvision torchaudio --upgrade
print("ok")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ok


In [None]:
import torch


from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import logging
from pyngrok import ngrok
import time
import os

# --- 1. Setup Logging and Flask App ---
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

app = Flask(__name__)
CORS(app, resources={r"/chat": {"origins": "*"}})

# --- 2. Global Model Variables ---
model = None
tokenizer = None

LORA_PATH = "/content/drive/MyDrive/final_qwen_model"
BASE_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

NGROK_AUTH_TOKEN = "35mhbclDI3iZwDKGHtXgO70xS6N_4TLJYLv9ZFLFQADukKXCV"


# --- 3. Model Loading Function ---
def load_model():
    global model, tokenizer
    try:
        if not os.path.exists(LORA_PATH) or not os.listdir(LORA_PATH):
            print(f"Error: LORA path does not exist or is empty: {LORA_PATH}")
            print("Please verify the Google Drive path.")
            return

        print(f"Loading Base Model: {BASE_MODEL_NAME}")

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_NAME,
            device_map="auto",
            quantization_config=bnb_config,
            torch_dtype=torch.bfloat16
        )
        base_model.eval()

        print(f"Loading LORA Adapter from: {LORA_PATH}")
        model = PeftModel.from_pretrained(base_model, LORA_PATH)

        print(f"Loading Tokenizer from: {LORA_PATH}")
        tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)
        tokenizer.pad_token = tokenizer.eos_token

        print("Model and Tokenizer loaded successfully")

    except Exception as e:
        print(f"Failed to load model or tokenizer: {e}")
        import traceback
        traceback.print_exc()
        model = None
        tokenizer = None


# --- 4. Chat/Inference Function ---
@torch.no_grad()
def generate_response(question):
    if not model or not tokenizer:
        return "Error: Model not loaded."

    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    text = tokenizer.decode(output[0], skip_special_tokens=True)

    if "Answer:" in text:
        return text.split("Answer:")[-1].strip()

    return text.strip()


# --- 5. API Endpoint ---
@app.route('/chat', methods=['POST'])
def chat_endpoint():
    data = request.get_json()
    question = data.get('prompt', '')

    if not question:
        return jsonify({"error": "No question provided"}), 400

    print(f"Received question: {question}")

    response_text = generate_response(question)

    print(f"Sending response: {response_text[:50]}...")

    return jsonify({"response": response_text})


# --- 6. Server Initialization ---
if __name__ == '__main__':
    load_model()

    if model:
        try:
            ngrok.set_auth_token(NGROK_AUTH_TOKEN)
            print("ngrok authentication successful.")

            print("Starting ngrok tunnel...")
            time.sleep(1)

            public_url = ngrok.connect(5000)

            print("-----------------------------------------------------")
            print(f"Public URL: {public_url}/chat")
            print("Copy this URL into your frontend index.html file.")
            print("-----------------------------------------------------")

            app.run(host='0.0.0.0', port=5000, use_reloader=False)

        except Exception as e:
            print(f"Failed to start ngrok or Flask: {e}")
            ngrok.kill()

    else:
        print("Server cannot start because the model failed to load.")

Loading Base Model: Qwen/Qwen2.5-3B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Loading LORA Adapter from: /content/drive/MyDrive/final_qwen_model
Loading Tokenizer from: /content/drive/MyDrive/final_qwen_model
Model and Tokenizer loaded successfully
ngrok authentication successful.
Starting ngrok tunnel...
-----------------------------------------------------
Public URL: NgrokTunnel: "https://favourable-subhyoid-russel.ngrok-free.dev" -> "http://localhost:5000"/chat
Copy this URL into your frontend index.html file.
-----------------------------------------------------
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [22/Nov/2025 19:10:51] "OPTIONS /chat HTTP/1.1" 200 -


Received question: hi


INFO:werkzeug:127.0.0.1 - - [22/Nov/2025 19:10:56] "POST /chat HTTP/1.1" 200 -


Sending response: Assisted the customer with a technical issue relat...
