# Fine tune a Base Model (5 steps)


In [1]:
# Install necessary libraries
# LoRA fine-tuning requires PEFT, Transformers, Datasets, and Accelerate
# bitsandbytes is for 4-bit quantization (QLoRA)
!pip install -q -U transformers datasets peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!nvidia-smi

Wed Aug 27 18:00:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:

# A small 3B model for this example.
# We load it in 4-bit to save memory (QLoRA)
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

In [None]:
from huggingface_hub import login
# Hugging Face login
hf_token = "hf_iGWjRtjobCydZbVOTLOXXXXXXXX"  # Replace with your HF token
login(hf_token)


In [None]:
# w and b settins
import os
# Configure WANDB
os.environ["WANDB_API_KEY"] = "6f8cc13db6ea6485b46be0edda82256dXXXX"
import wandb
wandb.init(project="my-finetuning", name=f"qlora-{MODEL_NAME.split('/')[-1]}")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mbasharat-hussain[0m ([33mbasharat-hussain-78[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:

# Import the required libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os


In [7]:
# 1. Load the Model and Tokenizer

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

print("Loading model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Qwen tokenizer needs a pad token set explicitly
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Important for batched inference

Loading model and tokenizer...


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [8]:
# 2. Load and Preprocess the Dataset
# We'll use a small subset of the databricks-dolly-15k dataset.
print("Loading dataset...")
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Select a small number of samples for a quick demonstration
subset_size = 500
dataset = dataset.select(range(subset_size))

# Function to format the data into an instruction-following prompt
def format_prompt(sample):
    if sample["instruction"] and sample["context"]:
        # Format for instruction with context
        prompt = f"### Instruction:\n{sample['instruction']}\n\n### Context:\n{sample['context']}\n\n### Response:\n{sample['response']}{tokenizer.eos_token}"
    else:
        # Format for instruction without context
        prompt = f"### Instruction:\n{sample['instruction']}\n\n### Response:\n{sample['response']}{tokenizer.eos_token}"
    return {"text": prompt}

# Apply the formatting and tokenize the dataset
dataset = dataset.map(format_prompt, remove_columns=list(dataset.features.keys()))

def tokenize_function(examples):
    # This is the corrected part
    tokenized_output = tokenizer(examples["text"], truncation=True, padding=True, max_length=512)
    # For causal language modeling, the labels are the same as the input_ids
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Loading dataset...


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
# 3. Configure and Prepare LoRA
# Prepare the model for k-bit training and apply LoRA
print("Preparing model for LoRA fine-tuning...")
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16, # Rank of the update matrices
    lora_alpha=32, # A scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Layers to apply LoRA to
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print("LoRA-enabled model trainable parameters:")
model.print_trainable_parameters()


Preparing model for LoRA fine-tuning...
LoRA-enabled model trainable parameters:
trainable params: 4,358,144 || all params: 1,548,072,448 || trainable%: 0.2815


In [10]:
# 4. Train the Model
output_dir = "qwen1.5-1.5b-lora-dolly-finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    fp16=True, # Use FP16 for faster training on GPU
    logging_steps=10,
    save_strategy="epoch",
)

# Initialize the Trainer
print("Starting training...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.2944
20,0.6803
30,0.893
40,0.5834
50,0.5144
60,0.8174
70,0.6369
80,0.6788
90,0.7472
100,0.5913


TrainOutput(global_step=125, training_loss=0.8663704280853272, metrics={'train_runtime': 234.8866, 'train_samples_per_second': 2.129, 'train_steps_per_second': 0.532, 'total_flos': 2019377283072000.0, 'train_loss': 0.8663704280853272, 'epoch': 1.0})

In [11]:

# 5. Save the Fine-Tuned Model Locally
# The trainer automatically saves the LoRA adapter weights
print(f"Training complete. Saving model to {output_dir}")
trainer.save_model(output_dir)

print("Fine-tuning successful! The LoRA weights are saved in the specified directory.")

Training complete. Saving model to qwen1.5-1.5b-lora-dolly-finetuned
Fine-tuning successful! The LoRA weights are saved in the specified directory.


In [18]:
# 6. Save the Fine-Tuned Model Locally ->>> to Huggaing Face

from huggingface_hub import notebook_login

notebook_login()



from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# The directory where you saved your LoRA adapters
local_adapter_dir = "qwen1.5-1.5b-lora-dolly-finetuned"
# The name of your new repository on the Hugging Face Hub
hub_repo_id = "YOUR_USERNAME/your-qwen-lora-model"
hub_repo_id = "Basharat78/qwen1.5-1.5b-lora-dolly-finetuned"
# Replace YOUR_USERNAME with your actual Hugging Face username

# Push the adapter and tokenizer files to the Hub
# The push_to_hub() method handles the creation of the repo and file upload automatically
try:
    finetuned_model.push_to_hub(hub_repo_id)
    tokenizer.push_to_hub(hub_repo_id)
    print(f"Model and tokenizer successfully uploaded to {hub_repo_id}")
except Exception as e:
    print(f"An error occurred during upload: {e}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md: 0.00B [00:00, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Model and tokenizer successfully uploaded to Basharat78/qwen1.5-1.5b-lora-dolly-finetuned


# Evaluate the locally saved and Finetuned Model (3 steps)

In [19]:
# 1. Load the base model

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Load the base model in 4-bit, just like during training
model_id = "Qwen/Qwen2-1.5B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Change padding side for inference


In [20]:
# 2. Load local finetuned adapter

# Path to your saved LoRA adapter weights
adapter_path = "qwen1.5-1.5b-lora-dolly-finetuned"

print(f"Loading LoRA adapter from {adapter_path}...")
model = PeftModel.from_pretrained(base_model, adapter_path)

# You can optionally merge the adapter weights into the base model
# This is useful for saving the final model for deployment and can improve inference speed.
# model = model.merge_and_unload()

Loading LoRA adapter from qwen1.5-1.5b-lora-dolly-finetuned...


In [21]:
# 3. Test the prompt

# Test the model with a sample prompt
prompt = "### Instruction:\nWhat are the benefits of a plant-based diet?\n\n### Response:"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

print("Generating response...")
# Generate a response from the model
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode and print the generated output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the full generated text
print("\n--- Generated Output ---")
print(generated_text)

Generating response...

--- Generated Output ---
### Instruction:
What are the benefits of a plant-based diet?

### Response: A plant-based diet is beneficial for several reasons. It can help you lose weight, improve your health and reduce the risk of disease. Research has shown that a vegetarian or vegan diet may lower your blood pressure and cholesterol levels. Additionally, it may also lower your risk of type 2 diabetes, heart attack, stroke and some types of cancer.
Additionally, a plant-based diet can be easier to maintain over time because there are many more food options available than if you were eating meat. There are numerous resources available online, such as apps and websites, that can help you get started with a plant-based diet.
Another benefit is that it is healthier overall. A plant-based diet is typically higher in fiber and protein, which means it will keep you feeling full longer. It's also low in saturated fats and high in omega-3 fatty acids, which can lower infla

# Compare the Base and (Locally) Finetuned models side-by-side

In [16]:
# ###
# Side-by-Side Comparison Code
# This script will:

# Load the original base model (the same one you started with).

# Load your fine-tuned model (which has the LoRA adapter).

# Use the exact same prompt for both models.

# Print the output from each model clearly labeled, so you can see the difference.

###


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Define the model ID and adapter path
model_id = "Qwen/Qwen2-1.5B-Instruct"
adapter_path = "qwen1.5-1.5b-lora-dolly-finetuned"

# Configure 4-bit quantization for consistent loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# 1. Load the Base Model
# This is the original model without any fine-tuning.
print("Loading the original base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 2. Load the Fine-Tuned Model (Base Model + LoRA)
# This model has your fine-tuned weights applied.
print(f"Loading the fine-tuned model with LoRA adapter from {adapter_path}...")
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

# 3. Define the Prompt
# Use the exact same prompt for both models to ensure a fair comparison.
prompt = "### Instruction:\nWhat are the benefits of a plant-based diet?\n\n### Response:"

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# 4. Generate Output from Both Models
print("\n--- Generating output from the ORIGINAL base model ---")
with torch.no_grad():
    base_outputs = base_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
base_text = tokenizer.decode(base_outputs[0], skip_special_tokens=True)
print(base_text)

print("\n--- Generating output from the FINE-TUNED model ---")
with torch.no_grad():
    finetuned_outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
finetuned_text = tokenizer.decode(finetuned_outputs[0], skip_special_tokens=True)
print(finetuned_text)

Loading the original base model...
Loading the fine-tuned model with LoRA adapter from qwen1.5-1.5b-lora-dolly-finetuned...

--- Generating output from the ORIGINAL base model ---
### Instruction:
What are the benefits of a plant-based diet?

### Response: The main benefit of a plant-based diet is that it provides all the necessary nutrients, vitamins and minerals. It also provides energy and reduces weight and body fat. There are many other advantages to a plant-based diet as well such as being better for your health and environment.

Plant-based diets can be beneficial because they are lower in saturated fats and cholesterol, which are associated with heart disease and stroke. Plant-based diets are also lower in calories than their meat-based counterparts, so they can help you lose weight or maintain a healthy weight. Additionally, plant-based diets are often more environmentally sustainable because they require less land and water resources to produce food compared to meat-based die

# Load model from HF and test it
###  Loading the Model from Anywhere

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Define the full repository ID for your uploaded model
hub_repo_id = "Basharat78/qwen1.5-1.5b-lora-dolly-finetuned"

# Define the original base model's ID
base_model_id = "Qwen/Qwen2-1.5B-Instruct"

# 1. Load the base model with quantization
print("Loading the original base model...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 2. Load the fine-tuned LoRA adapter from the Hugging Face Hub
# The from_pretrained() method from PEFT will automatically download your adapter files
print(f"Loading LoRA adapter from {hub_repo_id}...")
finetuned_model = PeftModel.from_pretrained(base_model, hub_repo_id)

# 3. Use the model for inference with the same prompt
prompt = "### Instruction:\nWhat are the benefits of a plant-based diet?\n\n### Response:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

print("Generating response from the loaded Hub model...")
with torch.no_grad():
    outputs = finetuned_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n--- Generated Output from Hugging Face Hub Model ---")
print(generated_text)

Loading the original base model...
Loading LoRA adapter from Basharat78/qwen1.5-1.5b-lora-dolly-finetuned...


adapter_config.json:   0%|          | 0.00/887 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Generating response from the loaded Hub model...

--- Generated Output from Hugging Face Hub Model ---
### Instruction:
What are the benefits of a plant-based diet?

### Response: A plant-based diet is rich in nutrients, especially those that support bone health and immune function. It also contains fewer calories than diets containing animal proteins. Plant foods can also help prevent type 2 diabetes by lowering cholesterol levels and increasing insulin sensitivity.

A plant-based diet also supports a healthy weight loss, as well as a lower risk of obesity-related diseases such as heart disease, stroke, and certain cancers. It also helps to reduce inflammation, which is a key factor in many chronic diseases including cardiovascular disease, cancer, and dementia.
A plant-based diet also helps reduce greenhouse gas emissions associated with livestock production and land use, making it an environmentally-friendly option for reducing climate change impacts.


#Fast API Building Process

1. Install Libraries
First, you'll need to install all the necessary libraries for both FastAPI and your Hugging Face model.

In [22]:
!pip install fastapi "uvicorn[standard]" transformers peft accelerate bitsandbytes
!pip install pyngrok

Collecting httptools>=0.6.3 (from uvicorn[standard])
  Downloading httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting uvloop>=0.15.1 (from uvicorn[standard])
  Downloading uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting watchfiles>=0.13 (from uvicorn[standard])
  Downloading watchfiles-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (510 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.8/510.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownl

2. Create and Run the FastAPI App


In [28]:
import uvicorn

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Define the base model and adapter paths
BASE_MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
ADAPTER_MODEL_PATH = "/content/qwen1.5-1.5b-lora-dolly-finetuned"

# FastAPI setup
app = FastAPI(title="Qwen2 LoRA Inference API", version="1.0")

# Pydantic model for request validation
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100

# Load the model and tokenizer
# Use 4-bit quantization for memory efficiency, as used in your notebook
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with quantization
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

    # Load the LoRA adapter weights
    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_PATH)

except Exception as e:
    raise RuntimeError(f"Failed to load model: {e}")

@app.get("/")
def home():
    return {"message": "Qwen2 LoRA Inference API is running!"}

@app.post("/generate")
async def generate_text(request: PromptRequest):
    try:
        # Prepare the input using chat template
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": request.prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize the prompt and move to the correct device
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Generate the response
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=request.max_new_tokens,
            do_sample=True,
            temperature=0.8,
            top_p=0.9
        )

        # Decode the generated output
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # The notebook shows how to decode the output.
        # The notebook also provides a sample prompt and shows the expected response format.
        return {"response": generated_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

3. Expose the Server with ngrok
In a new notebook cell, set up and run ngrok. This will provide a public URL for your API.

In [29]:
from pyngrok import ngrok
import os
import nest_asyncio

# Apply asyncio patch
nest_asyncio.apply()

# Set your ngrok authentication token
# You can get one for free from https://dashboard.ngrok.com/get-started/your-authtoken
# It's a good practice to store this in Colab Secrets
NGROK_AUTH_TOKEN = "31sh1MF6lHFdyKZzAHZ9dDnqALa_2pbsx8aNYxAJYLZeEwdeU"  # or use os.environ['NGROK_AUTH_TOKEN'] if using secrets
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open a tunnel to your FastAPI port
public_url = ngrok.connect(8000).public_url
print(f"FastAPI app is live at: {public_url}")

# This will block the cell and run the uvicorn server
uvicorn.run(app, host="0.0.0.0", port=8000)

FastAPI app is live at: https://9be177c0b3a7.ngrok-free.app


INFO:     Started server process [1108]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     35.227.85.127:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.227.85.127:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.227.85.127:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.227.85.127:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.227.85.127:0 - "POST /generate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1108]
