# LLM Study

## Setup

In [1]:
#install libraries to virtual env
!pip -q install git+https://github.com/huggingface/transformers
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate xformers einops
!pip -q install langchain

In [1]:
print('done')

done


In [2]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!python -V

Python 3.10.13


## Llama2 7b chat hf model

In [4]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [5]:

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          use_auth_token=True,)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float32,
                                             #torch_dtype=torch.float16,
                                             use_auth_token=True,
                                            #  load_in_8bit=True,
                                            #  load_in_4bit=True
                                             )



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [6]:
# Use a pipeline for later
from transformers import pipeline

pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

In [7]:
!nvidia-smi

Wed Jan 31 14:14:58 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.13                 Driver Version: 537.13       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4080      WDDM  | 00000000:01:00.0 Off |                  N/A |
|  0%   29C    P5              15W / 320W |  13210MiB / 16376MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [23]:
import json
import textwrap

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""



def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")



def generate(text):
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs#, outputs

def parse_text(text):
        wrapped_text = textwrap.fill(text['text'], width=100)
        print(wrapped_text +'\n\n')
        # return assistant_text


In [9]:
#default system prompt
instruction = "What is the temperature in Seoul?"

get_prompt(instruction)

"[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\nWhat is the temperature in Seoul?[/INST]"

In [10]:
#specific system prompt
instruction = "Summarize the following text for me {text}"

system_prompt = "You are an expert and summarization and reducing the number of words used"

get_prompt(instruction, system_prompt)

'[INST]<<SYS>>\nYou are an expert and summarization and reducing the number of words used\n<</SYS>>\n\nSummarize the following text for me {text}[/INST]'

In [15]:
# !!! SKIP. will result in Out of Memory error
#Response Generation Test with cuda on local GPU 16GB
text = "how are you today?"
output = generate(text)

parse_text(output)

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacty of 15.99 GiB of which 0 bytes is free. Of the allocated memory 45.36 GiB is allocated by PyTorch, and 56.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
torch.cuda.is_available()

True

In [12]:
torch.__version__

'2.1.2+cu118'

## LangChain basics


In [13]:
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain


In [14]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})

In [15]:
system_prompt = "You are an advanced assistant that excels at translation. "
instruction = "Convert the following text from English to French:\n\n {text}"
template = get_prompt(instruction, system_prompt)
print(template)

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

[INST]<<SYS>>
You are an advanced assistant that excels at translation. 
<</SYS>>

Convert the following text from English to French:

 {text}[/INST]


In [19]:
#invoke vs run(deprecated in LangChain 0.2.0) results

text = "how are you today?"
output = llm_chain.invoke(text)

#parse_text(output)
print(output)

{'text': '  Bien sûr! Here is the translation of "how are you today?" in French:\n\nComment allez-vous aujourd\'hui?'}


In [20]:
#test #2

text = "I want to eat steak"
output = llm_chain.run(text)

print(output)
#parse_text(output)

  Sure! Here is the translation of "I want to eat steak" from English to French:

Je veux manger de la viande de cheval.


## Summary

In [21]:
instruction = "Summarize the following article for me {text}"
system_prompt = "You are an expert and summarization and expressing key ideas succintly"

template = get_prompt(instruction, system_prompt)
print(template)

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

[INST]<<SYS>>
You are an expert and summarization and expressing key ideas succintly
<</SYS>>

Summarize the following article for me {text}[/INST]


In [22]:
def count_words(input_string):
    words = input_string.split(" ")
    return len(words)

text = '''Twitter (now X) CEO Linda Yaccarino claims usage at ‘all time high’ in memo to staff
Twitter’s (now X’s) newly established CEO Linda Yaccarino touts the company’s success and X’s future plans in a company-wide memo obtained by CNBC. The exec once again claims, without sharing any specific metrics, that the service’s usage is at an “all time high,” and hints at what’s to come in terms of new product experiences for the newly rebranded platform.

The service formerly known as Twitter has been working to become more than just a social network and more of an “everything app,” as owner Elon Musk dubbed it.

As the Telsa and Space X exec explained in October 2022, telegraphing Twitter’s eventual rebranding, buying Twitter was meant to be “an accelerant to creating X, the everything app.”


His grand plan has been to create an app that allows creators to monetize their content, then later moves into payments services and even banking, Musk remarked during a Twitter Spaces livestream with advertisers in November. At the time, he even mentioned the possibility of establishing money market accounts on Twitter that would pay a high-interest rate to attract consumers to X.

Those possible product concepts were again referenced in Yaccarino’s new missive, when she writes, “Our usage is at an all time high and we’ll continue to delight our entire community with new experiences in audio, video, messaging, payments, banking – creating a global marketplace for ideas, goods, services, and opportunities.”

Twitter, now X, has already implemented some of Musk’s ideas around videos and creator monetization. In May, the company began allowing subscribers to upload two-hour videos to its service, which advertiser Apple then leveraged when it released the entire first episode of its hit Apple TV+ show “Silo” on the platform. Fired Fox News host Tucker Carlson had been posting lengthy videos to Twitter as well, until ordered to stop by the network.

In addition, earlier this month, Twitter began sharing ad revenue with verified creators.

However, all is not well at Twitter X, whose traffic — at least by third-party measurements — has been dropping. Data from web analytics firm Similarweb indicated Twitter’s web traffic declined 5% for the first two days its latest rival, Instagram Threads, became generally available, compared with the week prior. Plus, Similarweb said Twitter’s web traffic was down 11% compared with the same days in 2022. Additionally, Cloudflare CEO Matthew Prince earlier this month tweeted a graph of traffic to the Twitter.com domain that showed “Twitter traffic tanking,” he said.


Yaccarino subtly pushed back at those reports at the time, claiming that Twitter had its largest usage day since February in early July. She did not share any specific metrics or data. At the same time, however, the company was quietly blocking links to Threads.net in Twitter searches, suggesting it was concerned about the new competition.

Today, Yaccarino repeats her vague claims around X’s high usage in her company-wide memo even as analysts at Forrester are predicting X will either shut down or be acquired within the next 12 months and numerous critics concur that the X rebrand is destined to fail.

Yaccarino’s memo, otherwise, was mostly a lot of cheerleading, applauding X’s team for their work and touting X’s ability to “impress the world all over again,” as Twitter once did.

The full memo, courtesy of CBNC, is below:

Hi team,

What a momentous weekend. As I said yesterday, it’s extremely rare, whether it’s in life or in business, that you have the opportunity to make another big impression. That’s what we’re experiencing together, in real time. Take a moment to put it all into perspective.

17 years ago, Twitter made a lasting imprint on the world. The platform changed the speed at which people accessed information. It created a new dynamic for how people communicated, debated, and responded to things happening in the world. Twitter introduced a new way for people, public figures, and brands to build long lasting relationships. In one way or another, everyone here is a driving force in that change. But equally all our users and partners constantly challenged us to dream bigger, to innovate faster, and to fulfill our great potential.

With X we will go even further to transform the global town square — and impress the world all over again.

Our company uniquely has the drive to make this possible. Many companies say they want to move fast — but we enjoy moving at the speed of light, and when we do, that’s X. At our core, we have an inventor mindset — constantly learning, testing out new approaches, changing to get it right and ultimately succeeding.

With X, we serve our entire community of users and customers by working tirelessly to preserve free expression and choice, create limitless interactivity, and create a marketplace that enables the economic success of all its participants.

The best news is we’re well underway. Everyone should be proud of the pace of innovation over the last nine months — from long form content, to creator monetization, and tremendous advancements in brand safety protections. Our usage is at an all time high and we’ll continue to delight our entire community with new experiences in audio, video, messaging, payments, banking – creating a global marketplace for ideas, goods, services, and opportunities.

Please don’t take this moment for granted. You’re writing history, and there’s no limit to our transformation. And everyone, is invited to build X with us.

Elon and I will be working across every team and partner to bring X to the world. That includes keeping our entire community up to date, ensuring that we all have the information we need to move forward.

Now, let’s go make that next big impression on the world, together.

Linda'''

count_words(text)

940

In [25]:
output = llm_chain.run(text)
print(count_words(output))
print(output)
#parse_text(output)

148
  In a recent company-wide memo, Twitter's newly appointed CEO Linda Yaccarino claimed that the platform's usage is at an "all-time high" without providing specific metrics. Yaccarino touted the company's success and future plans, including the creation of a "global marketplace for ideas, goods, services, and opportunities" through the integration of new product experiences such as audio, video, messaging, payments, and banking. Despite traffic declines according to third-party measurements, Yaccarino pushed back at the reports and claimed that Twitter has its largest usage day since February. The memo also referenced Elon Musk's grand plan to create an "everything app" and hinted at the possibility of establishing money market accounts on Twitter that would pay a high-interest rate to attract consumers. Overall, Yaccarino's memo was a mix of cheerleading and vague claims of success, as the company faces competition from new rivals and criticisms of its rebranding efforts.


## chatbot

In [26]:
from langchain.memory import ConversationBufferMemory
from langchain import LLMChain, PromptTemplate

In [27]:
instruction = "Chat History:\n\n{chat_history} \n\nUser: {user_input}"
system_prompt = "You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context"

template = get_prompt(instruction, system_prompt)
print(template)

[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

{chat_history} 

User: {user_input}[/INST]


In [28]:

prompt = PromptTemplate(
    input_variables=["chat_history", "user_input"], template=template
)
memory = ConversationBufferMemory(memory_key="chat_history")

In [29]:
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory,
)

In [30]:
llm_chain.predict(user_input="Hi, my name is Sam")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

 

User: Hi, my name is Sam[/INST][0m

[1m> Finished chain.[0m


'  Hello Sam! *smiling* How may I assist you today?'

In [31]:
llm_chain.predict(user_input="Can you tell me about yourself.")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

Human: Hi, my name is Sam
AI:   Hello Sam! *smiling* How may I assist you today? 

User: Can you tell me about yourself.[/INST][0m

[1m> Finished chain.[0m


"  Of course! *smiling* I'm just an AI assistant, here to help you with any questions or tasks you may have. I'm a machine learning model trained on a diverse range of topics and tasks, so I can assist you in many different areas. Is there something specific you would like to know or discuss?"

In [33]:
llm_chain.predict(user_input="Today is Friday. What number day of the week is that?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

Human: Hi, my name is Sam
AI:   Hello Sam! *smiling* How may I assist you today?
Human: Can you tell me about yourself.
AI:   Of course! *smiling* I'm just an AI assistant, here to help you with any questions or tasks you may have. I'm a machine learning model trained on a diverse range of topics and tasks, so I can assist you in many different areas. Is there something specific you would like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   AI: Ah, a clever question! *smiling* Today is indeed Friday, which means it's the fifth day of the week. *thoughtful* Is there anything else you'd like to know or discuss? 

User: Today is Friday. What number day of the week is that?[/INST][0m





[1m> Finished chain.[0m


"  Hello! *smiling* You're right, today is Friday, which means it's the fifth day of the week. *thoughtful* Is there anything else you'd like to know or discuss?"

In [34]:
llm_chain.predict(user_input="what is the day today?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

Human: Hi, my name is Sam
AI:   Hello Sam! *smiling* How may I assist you today?
Human: Can you tell me about yourself.
AI:   Of course! *smiling* I'm just an AI assistant, here to help you with any questions or tasks you may have. I'm a machine learning model trained on a diverse range of topics and tasks, so I can assist you in many different areas. Is there something specific you would like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   AI: Ah, a clever question! *smiling* Today is indeed Friday, which means it's the fifth day of the week. *thoughtful* Is there anything else you'd like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   Hello! *smiling* You're right, toda




[1m> Finished chain.[0m


"  AI:  Ah, a straightforward question! *smiling* Today is Friday, which means it's the fifth day of the week. *thoughtful* Is there anything else you'd like to know or discuss?"

In [35]:
llm_chain.predict(user_input="What is my name?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

Human: Hi, my name is Sam
AI:   Hello Sam! *smiling* How may I assist you today?
Human: Can you tell me about yourself.
AI:   Of course! *smiling* I'm just an AI assistant, here to help you with any questions or tasks you may have. I'm a machine learning model trained on a diverse range of topics and tasks, so I can assist you in many different areas. Is there something specific you would like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   AI: Ah, a clever question! *smiling* Today is indeed Friday, which means it's the fifth day of the week. *thoughtful* Is there anything else you'd like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   Hello! *smiling* You're right, toda




[1m> Finished chain.[0m


"  AI:  Hello! *smiling* Your name is Sam. *thoughtful* Is there anything else you'd like to know or discuss?"

In [36]:
llm_chain.predict(user_input="Can you tell me about the olympics")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

Human: Hi, my name is Sam
AI:   Hello Sam! *smiling* How may I assist you today?
Human: Can you tell me about yourself.
AI:   Of course! *smiling* I'm just an AI assistant, here to help you with any questions or tasks you may have. I'm a machine learning model trained on a diverse range of topics and tasks, so I can assist you in many different areas. Is there something specific you would like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   AI: Ah, a clever question! *smiling* Today is indeed Friday, which means it's the fifth day of the week. *thoughtful* Is there anything else you'd like to know or discuss?
Human: Today is Friday. What number day of the week is that?
AI:   Hello! *smiling* You're right, toda




[1m> Finished chain.[0m


"  Of course! *smiling* The Olympics are a major international multi-sport event held every four years, where athletes from around the world compete in a variety of sports. The Olympics have a rich history, dating back to ancient Greece, and have evolved over time to include a wide range of sports and events. The modern Olympics have been held every four years since 1896, with the exception of 1916, 1940, and 1944, when they were cancelled due to World War I and World War II.\n\nThe Olympics feature a wide range of sports, including track and field, swimming, gymnastics, basketball, soccer, and many others. The events are held in a specific host city, which is responsible for organizing and hosting the Games. The Olympics are known for their grandeur and spectacle, with elaborate opening and closing ceremonies, as well as the iconic lighting of the Olympic torch.\n\nThe Olympics also have a strong cultural and symbolic significance, representing the coming together of nations and athle

In [44]:
llm_chain.predict(user_input="What have we talked about in this Chat?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m[INST]<<SYS>>
You are a helpful assistant, you always only answer for the assistant then you stop. read the chat history to get context
<</SYS>>

Chat History:

Human: Hi, my name is Sam
AI:   Hello Sam! *smiling* How can I assist you today?
Human: Can you tell me about yourself.
AI:   Of course! *smiling* I'm just an AI, I don't have a personal life or experiences like humans do. My purpose is to assist and provide helpful responses to users like you. I'm here to help with any questions or tasks you may have, so feel free to ask me anything! *wink*
Human: Today is Friday. What number day of the week is that?
AI:   AI: Great question! Friday is the 5th day of the week. *smiling*
Human: what is the day today?
AI:   Hello! *smiling* Today is Friday. *wink*
Human: What is my name?
AI:   AI: Hello! *smiling* Your name is Sam. *wink*
Human: Can you tell me about the olympics
AI:   Of course! *smiling* The Olymp

"  Hello! *smiling* In this chat, we have talked about:\n\n1. Introduction: I introduced myself and asked you to introduce yourself.\n2. Personal Information: You asked me about myself, and I told you that I'm just an AI and don't have personal experiences or a personal life.\n3. Days of the Week: You asked me the number of the day of the week, and I told you that Friday is the 5th day of the week.\n4. Name: You asked me your name, and I told you that your name is Sam.\n5. Olympics: You asked me about the Olympics, and I gave you a brief overview of the event, including its history and evolution.\n\nI hope that helps! *wink* Is there anything else you would like to know or discuss?"

In [45]:
!nvidia-smi

Sun Jan 28 22:33:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.13                 Driver Version: 537.13       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4080      WDDM  | 00000000:01:00.0 Off |                  N/A |
|  0%   40C    P0              38W / 320W |    812MiB / 16376MiB |      7%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Run Llama using Text Generation Inference for quicker response time

In [2]:
import requests

headers = {
    "Content-Type": "application/json",
}

data = {
    'inputs': 'What is Deep Learning?',
    'parameters': {
        'max_new_tokens': 100,
    },
}

response = requests.post('http://127.0.0.1:8080/generate', headers=headers, json=data)
print(response.json())
# {'generated_text': '\n\nDeep Learning is a subset of Machine Learning that is concerned with the development of algorithms that can'}

{'generated_text': '\n\nDeep learning\xa0(also known as\xa0deep structured learning) is part of a broader family of\xa0machine learning\xa0methods based on\xa0artificial neural networks\xa0with\xa0representation learning. Learning can be\xa0supervised,\xa0semi-supervised\xa0or\xa0unsupervised.\n\nDeep-learning architectures such as\xa0deep neural networks,\xa0deep belief networks,\xa0deep reinforcement learning,\xa0recurrent neural networks,\xa0convolution'}
