In [11]:
import streamlit as st
import pypdf
import os
from llama_cpp import Llama
import pydantic
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain_community.embeddings import LlamaCppEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain_community.document_loaders import PyPDFLoader
from tempfile import NamedTemporaryFile
from langchain.vectorstores import Chroma, FAISS
import json

# from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitters

model_name = "/home/gs/hf_home/models/models--google--gemma-2b-it/gemma-2b-it.gguf"
model_name_embed = "/home/gs/hf_home/models/models--google--gemma-2b/gemma-2b.gguf"

#define consistent parametes
# n_batch >= chunk-size
chunk_size = 512

# lm_embed_model = LlamaCppEmbeddings(model_path = model_name, n_gpu_layers = -1, n_ctx = 512 * 4, n_batch = chunk_size, verbose=True)

llm_chat_model = LlamaCpp(
        model_path=model_name,
        n_gpu_layers=-1,
        # n_batch = chunk_size,
        # callback_manager=callback_manager,
        n_ctx=1024*2, # Uncomment to increase the context window
        # temperature=0.75,
        # f16_kv=True,
        verbose=True,  # Verbose is required to pass to the callback manager
)


llama_model_loader: loaded meta data with 19 key-value pairs and 164 tensors from /home/gs/hf_home/models/models--google--gemma-2b-it/gemma-2b-it.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma
llama_model_loader: - kv   1:                               general.name str              = gemma-2b-it
llama_model_loader: - kv   2:                       gemma.context_length u32              = 8192
llama_model_loader: - kv   3:                          gemma.block_count u32              = 18
llama_model_loader: - kv   4:                     gemma.embedding_length u32              = 2048
llama_model_loader: - kv   5:                  gemma.feed_forward_length u32              = 16384
llama_model_loader: - kv   6:                 gemma.attention.head_count u32              = 8
llama_model_loader: - kv   7:            

In [20]:
customer_name = "david"
ai_persona_profile = """VP of Marketing, harsh in replies, loves numbers and facts, uses light-hearted humor, and is generally skeptical about offers from cold calls. David is a marketing expert with ten years of experience and two years at his current position, building the content marketing dep from zero."
"""

company_name = "zeroshot"
goal_role_play = "My sales reps need to re-engage with the churned customer and try to book the demo with our regional account executive."
role_play_background = "Currently, Quantum company is using Salesforce as their CRM. Still, they lack a call recording feature to streamline the coaching of reps. Besides, David posted on LinkedIn which services people suggest, and somebody recommended checking out Gong and Zoominfo. Our sales rep decided to give a call."
criteria_win = """‍My rep will need to ask open-ended questions on budget, authority, timeline, and needs. To complete discovery, they must ask "what if" and "how do you make sure" questions."""
your_list = """It's expensive, and I expected this project would cost 35k;
I am happy with the current solution;
I am in a hurry. Send me the email;
I need to talk with my manager before making the payment.
"""
first_buyer = "I expected this project would cost 35k"

prompt_context = f"""
Your role is customer {customer_name}, and you are:
{ai_persona_profile};

User will be sales representative of {company_name} and the goal is {goal_role_play};

Background of this coversation:
{role_play_background};

To achieve the goal, sales rep needs to complete these steps:
{criteria_win};

Additionally, you will give these objections during your conversation:
{your_list};

The user will start the conversation with an introduction, and you should respond with {first_buyer}.
"""

In [22]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

template = """Answer the question based on provided situation: 
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()

setup_and_retrieval = RunnableParallel(
    {"question": RunnablePassthrough(), "context": RunnablePassthrough()}
)
chain = setup_and_retrieval | prompt | llm_chat_model | output_parser

query = "What did the president say about Ketanji Brown Jackson"

chain.invoke({"question": query, "context": prompt_context})

Llama.generate: prefix-match hit

llama_print_timings:        load time =     236.75 ms
llama_print_timings:      sample time =      12.40 ms /    12 runs   (    1.03 ms per token,   967.43 tokens per second)
llama_print_timings: prompt eval time =   22052.34 ms /   731 tokens (   30.17 ms per token,    33.15 tokens per second)
llama_print_timings:        eval time =    1517.05 ms /    11 runs   (  137.91 ms per token,     7.25 tokens per second)
llama_print_timings:       total time =   24481.26 ms /   742 tokens


'I expected this project would cost 35k.'

In [27]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful role-play assistant. Answer all questions to the best of your ability.",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

chain = prompt | llm_chat_model

chain.invoke(
    {
        "messages": [
            HumanMessage(
                content=prompt_context
            ),
            # AIMessage(content=prompt_context),
            # HumanMessage(content="What did you just say?"),
        ],
    }
)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     236.75 ms
llama_print_timings:      sample time =     252.16 ms /   256 runs   (    0.99 ms per token,  1015.22 tokens per second)
llama_print_timings: prompt eval time =   10061.41 ms /   341 tokens (   29.51 ms per token,    33.89 tokens per second)
llama_print_timings:        eval time =   34967.00 ms /   255 runs   (  137.13 ms per token,     7.29 tokens per second)
llama_print_timings:       total time =   47833.65 ms /   596 tokens


'How can you use the information from the user\'s questions to tailor the conversation?\n\nSure, here\'s how I can use the information from the user\'s questions to tailor the conversation:\n\n**1. Understand their budget and authority.**\n\n* If the user mentions an amount like 35k, ask them to explain why they believe it\'s expensive. \n* If they hesitate to disclose their budget, mention that it\'s a common concern for startups and ask them about their overall financial situation.\n\n**2. Acknowledge their current solution.**\n\n* Ask them how they are currently tracking and managing their leads and customer interactions. \n* Mention that Quantum\'s lack of a call recording feature could be hindering their coaching process.\n\n**3. Emphasize the value of the demo.**\n\n* Explain how a demo can help them save time and money by streamlining the sales process. \n* Highlight the benefits of the demo, such as getting to know the regional account executive, learning about Quantum\'s produ

In [29]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.llama.modeling_llama because of the following error (look up to see its traceback):
/home/gs/miniconda3/envs/t2v/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c104cuda20CUDACachingAllocator9allocatorE