# Chatbot Deployment and Benchmarking Notebook

In [1]:
!pip install transformers rouge_score datasets tensorflow

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[

## Part 1: Starting the Flask App

In [2]:
%%writefile flask_chatbot_app.py

# Import necessary libraries
import time
import psutil
import requests
import tensorflow as tf
from flask import Flask, request, jsonify
from transformers import TFAutoModelForCausalLM, AutoTokenizer, pipeline

# Initialize the chatbot model
model_name = "microsoft/Phi-3-mini-4k-instruct"
chatbot = pipeline("text-generation", model=model_name, trust_remote_code=True)

# Define Flask app
app = Flask(__name__)

@app.route('/chat', methods=['POST'])
def chat():
    user_input = request.json['message']
    start_time = time.time()
    response = chatbot(user_input, max_new_tokens=250, num_return_sequences=1)[0]['generated_text']
    end_time = time.time()

    # Calculate latency
    latency = end_time - start_time

    # Measure RAM usage
    process = psutil.Process()
    ram_usage = process.memory_info().rss / (1024 * 1024)  # Convert to MB

    # Measure GPU RAM usage
    gpu_ram_usage = 0
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        gpu_ram_usage = tf.config.experimental.get_memory_info('GPU:0')['current'] / (1024 * 1024)  # Convert to MB

    return jsonify({
        'response': response,
        'latency': latency,
        'ram_usage': ram_usage,
        'gpu_ram_usage': gpu_ram_usage
    })

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


Writing flask_chatbot_app.py



Run the Flask app using `nohup` to ensure it runs in the background:

In [3]:

import subprocess

# Stop any running Flask app
subprocess.run(['pkill', '-f', 'flask_chatbot_app.py'])

CompletedProcess(args=['pkill', '-f', 'flask_chatbot_app.py'], returncode=1)

In [4]:
!nohup python flask_chatbot_app.py &

nohup: appending output to 'nohup.out'


In [7]:
!sudo lsof -i -P -n | grep LISTEN

node         7 root   21u  IPv6  21238      0t0  TCP *:8080 (LISTEN)
kernel_ma   14 root    3u  IPv4  21220      0t0  TCP 172.28.0.12:6000 (LISTEN)
colab-fil   80 root    3u  IPv4  22870      0t0  TCP 127.0.0.1:3453 (LISTEN)
jupyter-n  136 root    7u  IPv4  24038      0t0  TCP 172.28.0.12:9000 (LISTEN)
python3   1571 root   21u  IPv4  66797      0t0  TCP 127.0.0.1:35329 (LISTEN)
python3   1616 root    3u  IPv4  69972      0t0  TCP 127.0.0.1:44145 (LISTEN)
python3   1616 root    5u  IPv4  69973      0t0  TCP 127.0.0.1:58941 (LISTEN)
pt_main_t 2531 root    4u  IPv4  93722      0t0  TCP *:5000 (LISTEN)


## Part 2: Benchmarking by Making POST Requests

In [8]:

# Save this code as benchmark.py

import time
import requests
from rouge_score import rouge_scorer
from datasets import load_dataset

# Function to calculate ROUGE score
def calculate_rouge_score(reference, hypothesis):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, hypothesis)
    return scores

# Function to benchmark chatbot with OpenOrca dataset by making POST requests
def benchmark_chatbot():
    # Load the OpenOrca dataset from Hugging Face
    open_orca_dataset = load_dataset("Open-Orca/OpenOrca")["train"].shuffle(42).select(range(20))

    total_latency = 0
    total_ram_usage = 0
    total_gpu_ram_usage = 0
    rouge_scores = []

    url = 'http://localhost:5000/chat'

    for example in open_orca_dataset:
        user_input = example['question']
        reference_output = example['response']

        start_time = time.time()
        response = requests.post(url, json={'message': user_input}).json()
        end_time = time.time()

        # Calculate latency
        latency = end_time - start_time
        total_latency += latency

        # Extract RAM and GPU RAM usage from response
        ram_usage = response['ram_usage']
        gpu_ram_usage = response['gpu_ram_usage']
        total_ram_usage += ram_usage
        total_gpu_ram_usage += gpu_ram_usage

        # Get the chatbot response
        chatbot_response = response['response']

        # Calculate ROUGE score
        rouge_score = calculate_rouge_score(reference_output, chatbot_response)
        rouge_scores.append(rouge_score)

    avg_latency = total_latency / len(open_orca_dataset)
    avg_ram_usage = total_ram_usage / len(open_orca_dataset)
    avg_gpu_ram_usage = total_gpu_ram_usage / len(open_orca_dataset)

    print(f"Average Latency: {avg_latency:.2f} seconds")
    print(f"Average RAM Usage: {avg_ram_usage:.2f} MB")
    print(f"Average GPU RAM Usage: {avg_gpu_ram_usage:.2f} MB")
    print(f"ROUGE Scores: {rouge_scores}")

# Run the benchmarking script
benchmark_chatbot()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Average Latency: 43.17 seconds
Average RAM Usage: 16597.17 MB
Average GPU RAM Usage: 0.00 MB
ROUGE Scores: [{'rouge1': Score(precision=0.37799043062200954, recall=0.2958801498127341, fmeasure=0.3319327731092437), 'rouge2': Score(precision=0.11057692307692307, recall=0.08646616541353383, fmeasure=0.0970464135021097), 'rougeL': Score(precision=0.17703349282296652, recall=0.13857677902621723, fmeasure=0.15546218487394958)}, {'rouge1': Score(precision=0.12171052631578948, recall=0.9024390243902439, fmeasure=0.21449275362318843), 'rouge2': Score(precision=0.0627062706270627, recall=0.475, fmeasure=0.11078717201166181), 'rougeL': Score(precision=0.08881578947368421, recall=0.6585365853658537, fmeasure=0.1565217391304348)}, {'rouge1': Score(precision=0.18120805369127516, recall=1.0, fmeasure=0.30681818181818177), 'rouge2': Score(precision=0.1554054054054054, recall=0.8846153846153846, fmeasure=0.26436781609195403), 'rougeL': Score(precision=0.1610738255033557, recall=0.8888888888888888, fmeas