# Response Generation and Comparison (Flan-T5 vs Zephyr)
This notebook takes customer review texts, finds similar examples using FAISS, and generates a short, friendly reply using either Flan-T5-small or Zephyr-7b-beta.

It then compares the responses using human-written references and evaluates them with BLEU, ROUGE-L, and Perplexity.

We use GPT-2 to estimate the **perplexity** of generated responses, which helps assess their fluency and coherence.


### Load LoRA Classification Model + Predictions CSV
We load the fine-tuned classification model (LoRA) and the CSV with predicted labels, extracted from a shared ZIP package.


In [None]:
# Upgrade to the latest version of bitsandbytes for 4-bit quantization support
!pip install -q --upgrade bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import subprocess

# List of all required packages
all_packages = [
    "bitsandbytes",                   # For 4-bit quantization (Zephyr)
    "faiss-cpu",                      # For fast similarity search
    "sentence-transformers",          # For embeddings
    "evaluate"                        # For BLEU, ROUGE, etc.
]

# Unified silent pip install
command = ["pip", "install", "-q"] + all_packages
result = subprocess.run(command, capture_output=True, text=True)

# Optional: final check message
if result.returncode == 0:
    print(" All required packages installed successfully.")
else:
    print(" Installation failed:\n", result.stderr)



 All required packages installed successfully.


In [None]:
# ===========================
# Library Imports – Generation Pipeline
# ===========================

# Standard libraries
import os                     # File/directory operations
import zipfile                # For unzipping the model/data archive
import pandas as pd           # Data manipulation (DataFrames)
import numpy as np            # Numerical operations

# PyTorch
import torch                  # Tensor operations (used by Transformers)

# ===========================
# Hugging Face Transformers
# ===========================

# Tokenizer and model for classification (LoRA)
from peft import PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Flan-T5 model for sequence-to-sequence generation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Zephyr model for causal generation (instruction-tuned model)
from transformers import AutoModelForCausalLM, AutoTokenizer

# GPT-2 model for evaluating perplexity
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# ===========================
# Evaluation Libraries
# ===========================

from evaluate import load     # For BLEU, ROUGE-L, etc.

# ===========================
# FAISS + Embedding Models
# ===========================

import faiss                  # Fast similarity search on embeddings
from sentence_transformers import SentenceTransformer  # To encode review texts


In [None]:
from google.colab import files
uploaded = files.upload()

Saving bert_sentiment_package.zip to bert_sentiment_package.zip


In [None]:
# Unzip the saved package
with zipfile.ZipFile("bert_sentiment_package.zip", 'r') as zip_ref:
    zip_ref.extractall("app")
    print(" Contenu de l'archive ZIP :")
    print(zip_ref.namelist())

 Contenu de l'archive ZIP :
['bert_sentiment_lora/', 'test_with_predictions.csv', 'bert_sentiment_lora/adapter_model.safetensors', 'bert_sentiment_lora/special_tokens_map.json', 'bert_sentiment_lora/tokenizer_config.json', 'bert_sentiment_lora/README.md', 'bert_sentiment_lora/vocab.txt', 'bert_sentiment_lora/adapter_config.json']


In [None]:
# Function to load LoRA fine-tuned model and tokenizer
def load_classification_model():
    base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model = PeftModel.from_pretrained(base_model, "app/bert_sentiment_lora")
    tokenizer = AutoTokenizer.from_pretrained("app/bert_sentiment_lora")
    model.eval()
    return tokenizer, model


In [None]:
# # Load model and CSV
# cls_tokenizer, cls_model = load_classification_model()
test_df = pd.read_csv("app/test_with_predictions.csv")
print(" LoRA model and CSV loaded.")
test_df.head(2)

 LoRA model and CSV loaded.


Unnamed: 0,text,label,predicted_label,predicted_sentiment
0,"good to have it was for my new galaxy note 2, ...",1,1,positive
1,"very good. working perfectly so far, fit my ze...",1,1,positive


### Load Flan-T5 model for generation

 We start by loading a lightweight T5 model fine-tuned by Google for general instruction-following tasks.

In [None]:
# Load Flan-T5-small tokenizer and model
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Setup FAISS Index and SentenceTransformer

We encode all the clean_texts  and build a FAISS index for fast nearest neighbor search.


In [None]:
# Load lightweight encoder model
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df = pd.read_csv("app/test_with_predictions.csv")

# Extract texts to index (use df["clean_text"] or "clean_combined")
texts = df["text"].tolist()

# Encode texts into embeddings (N x 384)
embeddings = encoder.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Create FAISS cosine similarity index
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(embeddings)
index.add(embeddings)

print(f"FAISS index created with {index.ntotal} vectors.")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

FAISS index created with 955 vectors.


###  Function to Find Similar Reviews

Given a customer comment, we search the FAISS index to retrieve the top-k most similar reviews.


In [None]:
def search_similar(user_comment, top_k=3):
    # Encode the user comment
    query_embedding = encoder.encode([user_comment], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)

    # Search for top-k most similar reviews
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the matching texts
    similar_texts = [texts[i] for i in indices[0]]

    print(" User comment:")
    print(user_comment)
    print("\n Similar reviews found:\n")
    for i, text in enumerate(similar_texts, 1):
        print(f"{i}. {text}\n")

    return similar_texts


### 💡 Sentiment-Aware Generation

### In this version of the pipeline, we incorporate the **predicted sentiment** of each user comment to help the language model generate more appropriate replies.

### This bridges our two tasks:
 - **Sentiment classification (Notebook 1)** trained using a BERT-based model
- **Response generation (Notebook 2)** using Flan-T5 and Zephyr

### For each comment:
 - We retrieve the predicted sentiment (`positive` or `negative`)
 - We search for similar reviews using FAISS
 - We build a context-enriched prompt with the sentiment explicitly added
 - We generate a tailored reply from each model


###  Prompt Construction Functions

These functions format prompts differently for Flan-T5-small and Zephyr-7b-beta.

They include the user comment and retrieved similar reviews as context.


In [None]:
def build_prompt_with_sentiment(user_comment, sentiment, similar_texts):
    context = "\n".join([f"{i+1}. {text}" for i, text in enumerate(similar_texts)])
    prompt = (
        f"You are a customer support assistant at SanDisk.\n"
        f"The user's sentiment is **{sentiment.upper()}**.\n\n"
        f"Based on their comment and similar reviews, write a short, friendly, and helpful reply.\n"
        f"Tone should match the sentiment: empathetic if negative, encouraging if positive.\n"
        f"Keep the response under 3 sentences.\n\n"
        f"User comment:\n{user_comment}\n\n"
        f"Similar reviews:\n{context}\n\n"
        f"Reply:"
    )
    return prompt
# Build prompt for Zephyr with sentiment
def build_prompt_zephyr_with_sentiment(user_comment, sentiment, similar_reviews):
    context = "\n".join([f"{i+1}. {rev}" for i, rev in enumerate(similar_reviews)])
    prompt = (
        f"You are an Amazon customer service assistant.\n"
        f"The sentiment of the review is **{sentiment.upper()}**.\n"
        f"Write a short and casual reply to the following customer review (max 2 sentences).\n"
        f"Be empathetic if the sentiment is negative, and upbeat if positive.\n\n"
        f"Customer review:\n{user_comment}\n\n"
        f"Similar reviews:\n{context}\n\n"
        f"Reply:"
    )
    return prompt

Flan-T5 Response Generation

In [None]:
# Flan-T5 generation with sentiment-aware prompt
def generate_response_flan(prompt, max_length=150):
    inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True)
    output = flan_model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=1
    )
    return flan_tokenizer.decode(output[0], skip_special_tokens=True)

Load Zephyr Model and Pipeline


In [None]:
model_id = "HuggingFaceH4/zephyr-7b-beta"

zephyr_tokenizer = AutoTokenizer.from_pretrained(model_id)
zephyr_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16
)



tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline
pipe = pipeline(
    "text-generation",
    model=zephyr_model,
    tokenizer=zephyr_tokenizer,
    device_map="auto",
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7
)


Device set to use cuda:0


Zephyr Response Generator (with cleaner reply extraction)

In [None]:
def generate_response_zephyr(prompt, max_length=100):
    raw_output = pipe(prompt, max_new_tokens=max_length)[0]["generated_text"]
    reply = raw_output.split("Reply:")[-1].strip()

    # Split at "Or:" or duplicate variants if needed
    reply = reply.split("\nOr:")[0].strip()
    return reply


Comparative Reply Generation (with Sentiment)


In [None]:
comparative_data = []

for _, row in df.sample(3, random_state=42).iterrows():
    user_comment = row["text"]
    sentiment = row["predicted_sentiment"]

    similar_reviews = search_similar(user_comment)

    flan_prompt = build_prompt_with_sentiment(user_comment, sentiment, similar_reviews)
    flan_reply = generate_response_flan(flan_prompt)

    zephyr_prompt = build_prompt_zephyr_with_sentiment(user_comment, sentiment, similar_reviews)
    zephyr_reply = generate_response_zephyr(zephyr_prompt)

    comparative_data.append({
        "User Comment": user_comment,
        "Sentiment": sentiment,
        "Flan-T5 Reply": flan_reply,
        "Zephyr Reply": zephyr_reply
    })

 User comment:
great card (32gb) i've always bought sandisk sd cards, and have never been disappointed. this one is no exception. i'm using it in my galaxy s iii and have no problems. transfer speeds are as advertised. great card, great price.

 Similar reviews found:

1. great card (32gb) i've always bought sandisk sd cards, and have never been disappointed. this one is no exception. i'm using it in my galaxy s iii and have no problems. transfer speeds are as advertised. great card, great price.

2. the best sd card i've ever had i'm using this sd card in my samsung galaxy s3 instead of another one i bought with the same manufacture "sandisk 32 gb mobile microsdhc flash memory card sdsdq-032g-affp" ( working perfectly since october 2012.

3. amazing micro sdhc card i've used this in an assortment of phones and other devices and it works flawlessly and has good transfer speeds, not once have i ever had a problem with this card or any of the many sandisk cards i've used, i will keep usi



 User comment:
very fast memory very impressed with the speed of this 32 gb micro sdhc. very good performance for data read and write .

 Similar reviews found:

1. very fast memory very impressed with the speed of this 32 gb micro sdhc. very good performance for data read and write .

2. very fast and a lot of space! this is the best micro memory for an android phone! it's very fast running as fast as 38mb/s. the 30gb was a splurge but if you have movies or videos on your phone than this is perfect! amazing product!p. s. do not get the ultra mobile version of this, it's slower than this one

3. fast and lots of capacity. this is a great microsd card. it is fast and the 16 gb gives me lots of capacity on my smartphone.

 User comment:
htc evo v works fine my virgin mobile htc evo v. the phone didn't recognize it at first, but after formatting it's all good.

 Similar reviews found:

1. htc evo v works fine my virgin mobile htc evo v. the phone didn't recognize it at first, but after fo

In [None]:
 comparison_df = pd.DataFrame(comparative_data)
 pd.set_option('display.max_colwidth', None)
# comparison_df.head()

### Perplexity-only Evaluation (Flan and Zephyr Replies)
This step computes the fluency of generated replies from both Flan-T5 and Zephyr using the GPT-2 model.

The lower the perplexity, the more fluent and natural the response is.

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Load GPT-2 model and tokenizer for perplexity scoring
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_model.eval()

def calculate_perplexity(text):
    inputs = gpt2_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt2_model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss).item()


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Compute perplexity for both Flan and Zephyr replies
flan_perplexities = [calculate_perplexity(reply) for reply in comparison_df["Flan-T5 Reply"]]
zephyr_perplexities = [calculate_perplexity(reply) for reply in comparison_df["Zephyr Reply"]]

# Add results to the DataFrame
comparison_df["Flan Perplexity"] = flan_perplexities
comparison_df["Zephyr Perplexity"] = zephyr_perplexities

# Preview final comparison table
comparison_df[["User Comment", "Sentiment", "Flan-T5 Reply", "Flan Perplexity", "Zephyr Reply", "Zephyr Perplexity"]]


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Unnamed: 0,User Comment,Sentiment,Flan-T5 Reply,Flan Perplexity,Zephyr Reply,Zephyr Perplexity
0,"great card (32gb) i've always bought sandisk sd cards, and have never been disappointed. this one is no exception. i'm using it in my galaxy s iii and have no problems. transfer speeds are as advertised. great card, great price.",positive,"i love the sandisk sd cards i have, and am very happy with it.",122.9627,"Dear valued customer,\n\nWe're thrilled to hear that you're satisfied with your Sandisk SD card. It's great to hear that you've always had positive experiences with our products, and we're glad that this card continues to meet your expectations. Thank you for choosing Sandisk, and we hope you continue to have a wonderful experience with our products. If you ever have any further questions or concerns, please don't hesitate to reach out to",9.10477
1,very fast memory very impressed with the speed of this 32 gb micro sdhc. very good performance for data read and write .,positive,great memory!,2623.444336,We're thrilled to hear that you're thoroughly impressed with the speed and performance of your new 32GB micro SDHC card! It's always a pleasure to see such positive feedback from our valued customers. Keep enjoying that lightning-fast data transfer!,25.209906
2,"htc evo v works fine my virgin mobile htc evo v. the phone didn't recognize it at first, but after formatting it's all good.",positive,"htc evo v works fine. my virgin mobile htc evo v. the phone didn't recognize it at first, but after formatting it's all good.",131.129196,(For positive reviews)\nHi there! We're thrilled to hear that your HTC EVO V is working great for you! We're glad that the formatting process helped resolve any initial recognition issues. Enjoy your device!\n\n(For similar reviews)\nHi there! We're happy to hear that your HTC EVO V is working well for you on Virgin Mobile! Don't hesitate to reach out if you have any further questions or concerns.,24.026869


### Analysis of Perplexity Results

We evaluated the fluency of generated replies using **GPT-2 perplexity scores**:

- **Lower perplexity = more fluent and natural text.**

#### Observations:
- Zephyr consistently achieves lower perplexity scores (≈13–20), indicating smoother and more coherent replies.
- Flan-T5 shows mixed results: while its output is shorter, it sometimes lacks context or generates incoherent text.
- Example 2 from Flan ("Great microsd card.") has an extremely high perplexity (268), likely due to the short, out-of-context sentence.

**Zephyr performs better in terms of fluency**, especially when combining context and sentiment.


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "HuggingFaceH4/zephyr-7b-beta"

zephyr_tokenizer = AutoTokenizer.from_pretrained(model_id)
zephyr_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")

zephyr_tokenizer.save_pretrained("zephyr_generator_fp32")
zephyr_model.save_pretrained("zephyr_generator_fp32")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [2]:
!zip -r zephyr_generator_fp32.zip zephyr_generator_fp32
from google.colab import files
files.download("zephyr_generator_fp32.zip")

  adding: zephyr_generator_fp32/ (stored 0%)
  adding: zephyr_generator_fp32/model-00005-of-00006.safetensors (deflated 53%)
  adding: zephyr_generator_fp32/special_tokens_map.json (deflated 76%)
  adding: zephyr_generator_fp32/tokenizer_config.json (deflated 68%)
  adding: zephyr_generator_fp32/model-00004-of-00006.safetensors (deflated 53%)
  adding: zephyr_generator_fp32/model-00006-of-00006.safetensors (deflated 53%)
  adding: zephyr_generator_fp32/model-00002-of-00006.safetensors (deflated 53%)
  adding: zephyr_generator_fp32/tokenizer.model (deflated 55%)
  adding: zephyr_generator_fp32/tokenizer.json (deflated 85%)
  adding: zephyr_generator_fp32/config.json (deflated 47%)
  adding: zephyr_generator_fp32/model-00003-of-00006.safetensors (deflated 53%)
  adding: zephyr_generator_fp32/model.safetensors.index.json (deflated 95%)
  adding: zephyr_generator_fp32/model-00001-of-00006.safetensors (deflated 53%)
  adding: zephyr_generator_fp32/generation_config.json (deflated 21%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>