# Zephyr 7b beta

Uses Zephyr 7B beta to answer all 100 questions related to documents on policies.

Memory: 11 GB VRAM
Precision: float16

Ref:
https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha

https://huggingface.co/collections/HuggingFaceH4/zephyr-7b-6538c6d6d5ddd1cbb1744a66

https://github.com/huggingface/alignment-handbook

https://huggingface.co/TheBloke/zephyr-7B-alpha-GPTQ

Model Memory Requirements
https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha/discussions/21

new improved model
https://huggingface.co/HuggingFaceH4/zephyr-7b-beta




## setup environment

TODO: this cell should print the specs of the computer; for now it only configures logging to stdout.

In [None]:
import logging
import sys

# Send INFO-level (and above) log records to stdout so they appear in the
# notebook output.
# One basicConfig call is enough: it installs a single stdout handler on the
# root logger. Adding a second StreamHandler on top of it makes every record
# print twice. force=True (Python 3.8+) replaces whatever handlers are already
# attached to the root logger, so re-running this cell does not stack up
# additional handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)


## load documents

read documents

In [None]:
# LOADING
# os gives access to environment variables; json parses the config file
import os
import json
from llama_index import SimpleDirectoryReader

# Read the Hugging Face API token from the local config file.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open("../CONFIG_LIST.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# Hugging Face API key: exported as an environment variable and also kept in
# a module-level name for later cells.
os.environ['HUGGING_FACE_API_KEY'] = config["huggingFaceToken"]
HUGGING_FACE_API_KEY = os.environ.get("HUGGING_FACE_API_KEY")

# Load the docs into memory (reads the whole directory).
documents = SimpleDirectoryReader('../documents').load_data()

## Download the LLM

TODO:
- download the model to the `.cache` directory
- download the model to a folder

https://huggingface.co/docs/transformers/v4.35.0/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline


In [None]:
# from transformers import pipeline

# pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")




In [None]:

from huggingface_hub import hf_hub_download


# select a model
model_id = "HuggingFaceH4/zephyr-7b-beta"

# The model repository's file list offers the weights in two formats:
#   .safetensors - a framework-agnostic tensor serialization format
#   .bin         - PyTorch checkpoint files
# This cell downloads the PyTorch .bin shards.

# The .bin weights are split into numbered shards
# (pytorch_model-00001-of-00008.bin ... pytorch_model-00008-of-00008.bin).
# The shard count changes from model to model, so build the names from it
# rather than typing each one out.
num_shards = 8
weight_shards = [
    f"pytorch_model-{shard:05d}-of-{num_shards:05d}.bin"
    for shard in range(1, num_shards + 1)
]

# Files to fetch from the repository: model/generation config, the shard
# index, the weight shards, and the tokenizer files. Each name appears once
# ("added_tokens.json" was previously listed twice).
filenames = [
    "added_tokens.json",
    "config.json",
    "generation_config.json",
    "special_tokens_map.json",
    "pytorch_model.bin.index.json",
    *weight_shards,
    "tokenizer_config.json",
    "tokenizer.model",
    "tokenizer.json",
]

# download it
# TODO: modify hf_hub_download to save in a folder ./models (git-ignored)
for filename in filenames:
    downloaded_model_path = hf_hub_download(
        repo_id=model_id,
        filename=filename,
        token=HUGGING_FACE_API_KEY
    )
    # Show where the file ended up on disk
    print(downloaded_model_path)

## 

## PromptTemplate gives the LLM a personality

In [None]:
# setup prompts
from llama_index.prompts import PromptTemplate

# Earlier drafts of the wrapper prompt, kept for reference.

# This will wrap the default prompts that are internal to llama-index
# query_wrapper_prompt = PromptTemplate(
#     "As a Human Resources expert"
#     "write a response that appropriately completes the request.\n\n"
#     "### Instruction:\n{query_str}\n\n### Response:"
# )

# This will wrap the default prompts that are internal to llama-index
# query_wrapper_prompt = PromptTemplate(
#     "As a Human Resources expert who tactfully answers questions with professionalism, accuracy, integrity ethics and honesty"
#     "write a response that appropriately completes the request.\n\n"
#     "### Instruction:\n{query_str}\n\n### Response:"
# )

# This will instruct the model on what to do and how to behave.
# Adjacent string literals are joined with nothing in between, so the first
# sentence must end with a space; without it the prompt reads
# "...honesty.Keep the answer short...".
# NOTE(review): the "### Instruction / ### Response" layout may not be the
# prompt format Zephyr was trained with - check the model card's chat
# template before relying on it.
query_wrapper_prompt = PromptTemplate(
    "As a Human Resources professional tactfully answer all questions clearly and concisely with professionalism, accuracy, integrity, ethics and honesty. "
    "Keep the answer short and respond with 'Unsure about answer' if not sure about the answer.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)



# Prompt engineering - gets better answers.
# Might be able to give few-shot examples so it understands how to answer questions like an HR expert.
# Warm, friendly demeanor - add to the personality.

## embedding 

In [None]:
import torch
from llama_index.llms import HuggingFaceLLM
from llama_index import GPTVectorStoreIndex, PromptHelper, ServiceContext, LLMPredictor
from llama_index.embeddings import InstructorEmbedding
# from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sizing values, written once and shared by the LLM and the prompt helper.
# max_input_size and the LLM's context_window must be the same size.
max_input_size = 2048      # context window, in tokens
num_output = 256           # room reserved for the generated answer
max_chunk_overlap = 0.1    # 10% overlap to keep context of where everything came from
chunk_size_limit = 600     # tokens

# Set up the LLM together with its context settings; these differ per model.
# temperature 0 / do_sample False are set for repeatability.
# device_map depends on the model.
llm = HuggingFaceLLM(
    context_window=max_input_size,
    max_new_tokens=num_output,
    generate_kwargs={"temperature": 0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_id,
    model_name=model_id,
    device_map="auto",
    tokenizer_kwargs={"max_length": max_input_size},
    # float16 weights reduce memory usage when running on CUDA
    model_kwargs={"torch_dtype": torch.float16},
)

# Embedding model used to generate the vector indexes.
# TODO: save index to vector DB (Chroma)
embed_model = InstructorEmbedding(model_name="hkunlp/instructor-base")

prompt_helper = PromptHelper(
    max_input_size,
    num_output,
    max_chunk_overlap,
    chunk_size_limit=chunk_size_limit,
)
service_context = ServiceContext.from_defaults(
    llm=llm,
    prompt_helper=prompt_helper,
    embed_model=embed_model,
)

# Index the content of the documents folder; the index is held in memory.
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

# Save your index to a directory called storage
# index.storage_context.persist(persist_dir="../storage")

## answer questions

In [None]:

# Other questions that were tried against the index:
# prompt = "What is the policies in the folder"
# prompt = "how many files are being refernced"
# prompt = "when was DEWC founded"
# prompt = "when was DEWC launched"

prompt = "How many females work for DEWC"

# Ask the index a single question through its query engine.
query_engine = index.as_query_engine()
response = query_engine.query(prompt)

# Question and answer on consecutive lines, followed by a blank line.
print(f"Question: {prompt}\nAnswer: {response}\n")

# answers to csv file


In [None]:
import pandas as pd
import warnings

# Suppress warnings from the transformer model
warnings.filterwarnings('ignore')

# Read the contents of the questionnaire.txt file (one question per line).
# Decode explicitly as UTF-8 rather than using the platform default encoding.
with open('../questionnaire.txt', 'r', encoding='utf-8') as file:
    questions = file.readlines()

# Initialize the query engine
query_engine = index.as_query_engine()

# One [model id, question, answer] row per question
results_model = []

# Iterate over the questions and get answers from the query engine
for prompt in questions:
    prompt_cleaned = prompt.strip()

    # Skip blank lines (e.g. a trailing newline at the end of the file) so an
    # empty prompt is never sent to the model or written to the results.
    if not prompt_cleaned:
        continue

    # Query the engine and get the response
    response = query_engine.query(prompt_cleaned)

    # Keep the answer text rather than the response object: this is the same
    # text the print below shows, and it is what ends up in the CSV.
    answer = str(response)

    # 'model_id' is defined in the model download cell
    results_model.append([model_id, prompt_cleaned, answer])

    # Print the question and the response
    print(f"Question: {prompt_cleaned}")
    print(f"Answer: {answer}\n")

# Convert the results list into a pandas DataFrame
df = pd.DataFrame(results_model, columns=['Model ID', 'Question', 'Answer'])

# Save the DataFrame to a CSV file (without the DataFrame's row index)
df.to_csv('../results_model.csv', index=False)

print("Results saved to CSV file.")


In [None]:
# Show the collected [model id, question, answer] rows as the cell's output
results_model

# TODO: print time taken
# TODO: report memory usage
# 

In [None]:
# 1: panda to csv
# merge
# new branch zepher