In [3]:
import os

from llama_cpp import Llama
# Keep the GGUF file location in one variable so the path is written only once.
model_path = "./models/llama-2-13b-chat.Q4_K_M.gguf"
# Load the model with the library defaults; the loader prints its metadata
# dump while loading (see the output below).
model: Llama = Llama(model_path=model_path)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ./models/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32      

In [6]:
# We want to reduce the verbosity of the output a bit.

model.verbose = False

# Now that we have a model, let's actually do some inference! The
# method we use for this is called create_completion(), and by
# default we only need to pass it a prompt to complete; it
# returns a Completion object, which is a TypedDict.
# Import the one name this notebook uses instead of `import *`, so it is
# obvious where `Completion` comes from and the namespace stays clean.
from llama_cpp.llama_types import Completion

result: Completion = model.create_completion(prompt="the capital of iran is ")
# The Completion type has a "choices" key which holds the list of
# responses the LLM generated; let's take a look.
print(result["choices"])

[{'text': '425 miles from the city of esfahan.\nThe capital of', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}]


In [10]:
# Temperature controls how deterministic sampling is: values close to 0 make
# the model behave more deterministically, while values close to 1 make it
# behave more nondeterministically ("creatively").
######################################
# Sweep three temperatures for the demo: 0, 0.5 and 1.
temps: list[float] = [0.0, 0.5, 1.0]

# Each call below is a plain completion: the prompt is turned into a sequence
# of tokens and the model simply appends more tokens to that sequence.
# Every temperature is run three times so run-to-run variation is visible.
prompt: str = "the planet in the solar system include "

# Flatten the (temperature, run) grid up front; the iteration order is
# unchanged: all runs of the first temperature, then the next, and so on.
runs = [(t, r) for t in temps for r in range(3)]
for temp, i in runs:
    result: Completion = model.create_completion(
        prompt=prompt,
        temperature=temp,
    )
    # Only the first choice is generated/printed here.
    first_choice = result["choices"][0]
    print(f'temp={temp}, run={i}, result: {first_choice["text"]}')



temp=0.0, run=0, result: 1) Mercury, 2) Venus, 3) Earth,
temp=0.0, run=1, result: 1) Mercury, 2) Venus, 3) Earth,
temp=0.0, run=2, result: 1) Mercury, 2) Venus, 3) Earth,
temp=0.5, run=0, result: 1) Mercury, 2) Venus, 3) Earth,
temp=0.5, run=1, result: 8 planets, dwarf planets and other smaller bodies. The 
temp=0.5, run=2, result: 1) Mercury, the closest planet to the sun, with a highly ecc
temp=1.0, run=0, result: 1. Mercury: Mercury is closest planet to the Sun. It is
temp=1.0, run=1, result: 28 moons, five dwarf planets, asteroids, and
temp=1.0, run=2, result: 90,934 and 106,400.


In [11]:
# Because generating tokens is slow, create_completion() by default caps the
# number of returned tokens at just 16.
# Passing max_tokens=-1 lifts that cap, so generation continues for as many
# tokens as the model and the context window make available.
# A single run is enough here; the temperature is left at its default
# value, which is 0.8.
result: Completion = model.create_completion(
    max_tokens=-1,
    prompt=prompt,
)

# Show the text of the first choice that came back.
generated_text = result["choices"][0]["text"]
print(generated_text)

8 planets, dwarf planets, asteroids, comets and other smaller bodies.
The largest planet in our solar system is Jupiter which has a diameter of more than 142,984 km. The smallest planet is Mercury with a diameter of approximately 4,879km.
The distance between the sun and Earth is approximately 93 million miles or 150 million kilometers. It takes Earth approximately 365 days to complete one orbit around the sun.
The surface temperature on Venus is about 462 degrees Celsius while the average temperature on Mars is -67 degrees Celsius. Mercury has no atmosphere and can reach temperatures as high as 427 degrees Celsius during the day and as low as -173 degrees Celsius at night.
The largest moon in our solar system is Ganymede, which orbits Jupiter and has a diameter of approximately 5,262km. The smallest planet in our solar system is Mercury with no moons.
In conclusion, there are numerous interesting facts about the planets in our solar system that make them unique and fascinating. Studyi

In [15]:
############# cell for live chat ##################


# When you create a new Llama object, the default context length is set to
# 512 tokens.
# For this last demo we also don't wait for the whole query to finish;
# instead we use the streaming features of llama-cpp-python to see the
# tokens as they are completed.


# Create a new model, very similar to the first one, but quiet and with a
# nice big context window. The path comes from `model_path`, defined in the
# first cell, so the file location is not repeated here.
model: Llama = Llama(model_path=model_path, verbose=False, n_ctx=4096)

# Passing stream=True makes create_completion() return an iterator over
# completion-stream response objects. Like Completion, these are just typed
# dictionaries with a similar layout.

# Iterate over the streamed pieces. `enumerate` supplies the running count,
# and the loop variable is called `chunk` so that it does not rebind the
# `result` name that earlier cells annotated as a full Completion.
for chunk_index, chunk in enumerate(
    model.create_completion(
        prompt="some fun things to do for vacation in the iran includes ",
        max_tokens=-1,
        stream=True,
    )
):
    # Emit a line break before every 50th chunk (including the very first);
    # otherwise everything would end up on one line.
    if chunk_index % 50 == 0:
        print("")
    # flush=True pushes each piece to the output immediately instead of
    # leaving it in the stdout buffer.
    print(chunk["choices"][0]["text"], end="", flush=True)


 visiting historical sites, explore natural wonders, and experiencing the local culture.
1- Visit Historical Sites: Iran is home to many ancient historical sites that are UNESCO World Heritage Sites, including Persepolis
, Pasargadae, and Meidan Emam (Naqsh-e Jahan) in Isfahan. These sites offer a glimpse into the country's rich history and culture.
2- Explore Natural
 Wonders: Iran is home to many natural wonders such as Dasht-e Kavir, a vast desert; Alborz Mountains, the highest mountain range in the country; and the Caspian Sea, the world's largest
 saltwater lake. These natural wonders offer opportunities for hiking, camping, and water sports.
3- Experience Local Culture: Iran is known for its rich cultural heritage, including traditional music, dance, literature,
 and art. Visitors can experience the local culture by attending a traditional Persian wedding, watching a performance of classical music and dance, or visiting a local bazaar.
4- Try Local Cuisine: Iranian cu
isine is k

In [None]:
############# cell for live chat ##################


# Same streaming demo as the previous cell, with a different prompt.
# The previous cell already loaded the model with verbose=False and
# n_ctx=4096, so that `model` object is reused here: loading the same 13B
# GGUF file again with identical settings only costs time and memory.
# NOTE: run the previous cell first; the model from the very first cell
# still has the default 512-token context window.

# Passing stream=True makes create_completion() return an iterator over
# completion-stream response objects. Like Completion, these are just typed
# dictionaries with a similar layout.

# Iterate over the streamed pieces. `enumerate` supplies the running count,
# and the loop variable is called `chunk` so that it does not rebind the
# `result` name that earlier cells annotated as a full Completion.
for chunk_index, chunk in enumerate(
    model.create_completion(
        prompt="write me in persian",
        max_tokens=-1,
        stream=True,
    )
):
    # Emit a line break before every 50th chunk (including the very first);
    # otherwise everything would end up on one line.
    if chunk_index % 50 == 0:
        print("")
    # flush=True pushes each piece to the output immediately instead of
    # leaving it in the stdout buffer.
    print(chunk["choices"][0]["text"], end="", flush=True)