In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import torch
torch.cuda.get_device_name(0)
#ablation tool: https://transformer-circuits.pub/2021/garcon/index.html


'NVIDIA A100 80GB PCIe'

In [None]:
with open("requirements.txt", "r") as f:
    r = [x.strip().split("==")[0].split(">=")[0].split("[")[0] for x in f.readlines()][:-1]
    
import importlib
for name in r:
    module = importlib.import_module(name)
    try:
        print(f"{name}=={module.__version__}")
    except Exception as e:
        print(name)


accelerate==0.25.0
appdirs==1.4.4
loralib
bitsandbytes
black==23.12.0
black==23.12.0
datasets==2.15.0
fire==0.5.0
peft==0.7.1
transformers==4.36.2
sentencepiece==0.1.99
py7zr==0.20.8
scipy==1.11.4
optimum
openai==1.5.0
spacy==3.6.1


In [4]:
# ENV SET UP
# conda create -n llm_chat2 python=3.10 ipykernel ipywidgets
# conda activate llm_chat2
# conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.7 -c pytorch -c nvidia
# git clone https://github.com/cbjrobertson/llm_chat.git
# cd llm_chat
# pip install -r requirements.txt
# python setup.py install --record files.txt

In [6]:
# MODEL hf creation
# The first three model_names are local huggingface versions of the llama models, i.e. after downloading the meta weights, they were created following 
# the llama-recipes quickstart.ipynb directions, from: https://github.com/facebookresearch/llama-recipes/blob/main/quickstart.ipynb

# i.e.:
# %%bash
# pip install transformers datasets accelerate sentencepiece protobuf==3.20 py7zr scipy peft bitsandbytes fire torch_tb_profiler ipywidgets
# TRANSFORM=`python -c "import transformers;print('/'.join(transformers.__file__.split('/')[:-1])+'/models/llama/convert_llama_weights_to_hf.py')"`
# print(TRANSFORM)
# python ${TRANSFORM} --input_dir ./models/llama-2-70b-chat --model_size 70B --output_dir ./models_hf/70B_chat

# NB, for the above code to work, meta weights need to be stored in a `models` dir using the following structure, where
# each model is stored in a sub-directory in <<input_dir>> which matches the --model_size param, e.g. 7B, 13B, 70B:
# ├── models
# │   ├── llama-2-13b
# │   │   └── 13B
# │   ├── llama-2-13b-chat
# │   │   └── 13B
# │   ├── llama-2-70b
# │   │   └── 70B
# │   ├── llama-2-70b-chat
# │   │   └── 70B
# │   ├── llama-2-7b
# │   │   └── 7B
# │   └── llama-2-7b-chat
# │       └── 7B

In [3]:
# import from psy_llm_chat class to chat with GPT and Llama
import psy_llm_chat as plc
import openai

In [2]:
# Define a simple dialog
test_dialog = [
    # {'role': 'system', 'content': 'Always answer with Haiku'},
    {'role': 'user', 'content': "If you can hear me say hi."}
]

In [7]:
#  #local models 
# model_name ="./llama_models/models_hf/7B_chat"
# model_name ="./llama_models/models_hf/13B_chat"
# model_name ="./llama_models/models_hf/70B_chat"

# # huggingface
# models can also be loaded from HF hub (once access has been granted), e.g.:
model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_name = "meta-llama/Llama-2-13b-chat-hf"
# model_name = "meta-llama/Llama-2-70b-chat-hf"

# load the model defined by <<model_name>>, must set output_hidden_states and ...attentions to True to access these later.
lc = plc.LlamaChat(model_name, True, output_hidden_states=True, output_attentions=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# run the chat by calling the class
# to end the chat, input `END_THIS_NOW`; this terminates the chat and returns the dialog 
result = lc(test_dialog)

User: If you can hear me say hi.
Assistant: 
Hi there! *giggles* It's great to hear from you! How are you doing today? Is there anything on your mind that you want to talk about? I'm here to listen and help in any way I can. *smiles*


User:  END_THIS_NOW


In [10]:
# dialog is a list of dicts
result

[{'role': 'user', 'content': 'If you can hear me say hi.'},
 {'role': 'assistant',
  'content': "\nHi there! *giggles* It's great to hear from you! How are you doing today? Is there anything on your mind that you want to talk about? I'm here to listen and help in any way I can. *smiles*"},
 {'role': 'user', 'content': 'END_THIS_NOW'}]

In [11]:
# allign spacy token boundaries with  embeddings with sentence piece embeddings and expose as the spacy.Token._.llama_vec Token property
import spacy
nlp = spacy.load("en_core_web_sm")

# pass spacy nlp object, result[:-1] (to avoid encoding END_THIS_NOW) and layer number (i.e. the layer from which to extract hidden state
doc = lc.get_spacy_doc(nlp,result[:-1],layer=0)

Returning encoding for:

--------------------

[INST] If you can hear me say hi. [/INST]
 Hi there! *giggles* It's great to hear from you! How are you doing today? Is there anything on your mind that you want to talk about? I'm here to listen and help in any way I can. *smiles* 

--------------------


In [12]:
# demonstration of how to access Token._.llama_vec embedding
word_index = 9
print(f"""Average of sentence-piece embeddings for: `{doc[word_index]}`
at doc index: `{word_index}`
is of type: `{type(doc[5]._.llama_vec)}`
of shape: `{doc[5]._.llama_vec.shape}`""")

Average of sentence-piece embeddings for: `hi`
at doc index: `9`
is of type: `<class 'torch.Tensor'>`
of shape: `torch.Size([4096])`


In [13]:
# extract hidden states from result
hidden = lc.extract_embeddings(result[:-1],kind="hidden")
hidden[0].squeeze()

tensor([[ 0.0018, -0.0038,  0.0010,  ..., -0.0090,  0.0027, -0.0038],
        [ 0.0277, -0.0060,  0.0035,  ...,  0.0005, -0.0082,  0.0064],
        [-0.0299, -0.0148, -0.0530,  ...,  0.0047, -0.0173, -0.0120],
        ...,
        [ 0.0133,  0.0078,  0.0081,  ...,  0.0051,  0.0067,  0.0248],
        [ 0.0085,  0.0002,  0.0320,  ...,  0.0081, -0.0081, -0.0270],
        [ 0.0003,  0.0029, -0.0126,  ...,  0.0014, -0.0019, -0.0064]],
       dtype=torch.float16, grad_fn=<SqueezeBackward0>)

In [14]:
# extract attentions from result
attentions = lc.extract_embeddings(result,kind="attention")
attentions[0].squeeze()[0]

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [6.6260e-01, 3.3716e-01, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.4248e-01, 1.7432e-01, 2.8296e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.1169e-02, 5.6343e-03, 1.1162e-02,  ..., 1.1650e-02, 0.0000e+00,
         0.0000e+00],
        [6.0749e-04, 1.2960e-03, 4.7326e-04,  ..., 2.6436e-03, 2.0447e-01,
         0.0000e+00],
        [2.2984e-03, 1.6270e-03, 1.9798e-03,  ..., 6.3515e-03, 8.8013e-02,
         1.8066e-01]], dtype=torch.float16, grad_fn=<SelectBackward0>)

In [15]:
# extract both from result
both = lc.extract_embeddings(result,kind="both")
both[0][0]

tensor([[[ 1.8387e-03, -3.8147e-03,  9.6130e-04,  ..., -9.0332e-03,
           2.6550e-03, -3.7537e-03],
         [ 2.7710e-02, -6.0425e-03,  3.4943e-03,  ...,  5.1117e-04,
          -8.1787e-03,  6.4087e-03],
         [-2.9907e-02, -1.4771e-02, -5.2979e-02,  ...,  4.7302e-03,
          -1.7334e-02, -1.2024e-02],
         ...,
         [-2.9907e-02, -1.4771e-02, -5.2979e-02,  ...,  4.7302e-03,
          -1.7334e-02, -1.2024e-02],
         [-2.3315e-02, -1.8677e-02,  5.8594e-03,  ...,  1.7578e-02,
          -2.0266e-05,  1.2390e-02],
         [-1.4954e-03, -3.5095e-04, -1.7624e-03,  ...,  8.8882e-04,
           1.6937e-03, -6.9427e-04]]], dtype=torch.float16,
       grad_fn=<ToCopyBackward0>)

In [None]:
# openai.api_base -> openai.base_url
# openai.proxy -> openai.proxies (docs)
# openai.InvalidRequestError -> openai.BadRequestError
# openai.Audio.transcribe() -> client.audio.transcriptions.create()
# openai.Audio.translate() -> client.audio.translations.create()
# openai.ChatCompletion.create() -> client.chat.completions.create()
# openai.Completion.create() -> client.completions.create()
# openai.Edit.create() -> client.edits.create()
# openai.Embedding.create() -> client.embeddings.create()
# openai.File.create() -> client.files.create()
# openai.File.list() -> client.files.list()
# openai.File.retrieve() -> client.files.retrieve()
# openai.File.download() -> client.files.retrieve_content()
# openai.FineTune.cancel() -> client.fine_tunes.cancel()
# openai.FineTune.list() -> client.fine_tunes.list()
# openai.FineTune.list_events() -> client.fine_tunes.list_events()
# openai.FineTune.stream_events() -> client.fine_tunes.list_events(stream=True)
# openai.FineTune.retrieve() -> client.fine_tunes.retrieve()
# openai.FineTune.delete() -> client.fine_tunes.delete()
# openai.FineTune.create() -> client.fine_tunes.create()
# openai.FineTuningJob.create() -> client.fine_tuning.jobs.create()
# openai.FineTuningJob.cancel() -> client.fine_tuning.jobs.cancel()
# openai.FineTuningJob.delete() -> client.fine_tuning.jobs.create()
# openai.FineTuningJob.retrieve() -> client.fine_tuning.jobs.retrieve()
# openai.FineTuningJob.list() -> client.fine_tuning.jobs.list()
# openai.FineTuningJob.list_events() -> client.fine_tuning.jobs.list_events()
# openai.Image.create() -> client.images.generate()
# openai.Image.create_variation() -> client.images.create_variation()
# openai.Image.create_edit() -> client.images.edit()
# openai.Model.list() -> client.models.list()
# openai.Model.delete() -> client.models.delete()
# openai.Model.retrieve() -> client.models.retrieve()
# openai.Moderation.create() -> client.moderations.create()
# openai.api_resources -> openai.resources

In [3]:
import os
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI"),
)

[mod['id'] for mod in client.models.list().model_dump()['data'] if mod["id"].startswith("gpt-4")]

['gpt-4-0613',
 'gpt-4',
 'gpt-4-vision-preview',
 'gpt-4-1106-preview',
 'gpt-4-0314']

In [4]:
import psy_llm_chat as plc
# analgous chat class to LlamaChat but for GPT 4
# gpt-4 models
model_name = 'gpt-4-0314' #from march 14
# model_name = 'gpt-4-0613' #from june 13
# model_name = 'gpt-4' #most recent

# load chat class
gptc = plc.GptChat(client, model=model_name)


In [5]:
# chat with <<model_name>>, again input `END_THIS_NOW` to terminate the dialog and return the (List[Dict]) result
result = gptc(test_dialog)

User: If you can hear me say hi.
Assistant: As an AI text model, I cannot hear you. However, I can understand your text input and respond accordingly. So, hi!


User:  END_THIS_NOW


In [6]:
gptc.get_response(test_dialog)

User: If you can hear me say hi.
Assistant: As an AI language model, I cannot hear or speak. However, I can understand and respond to text-based messages. If you have any questions or need assistance, feel free to ask.


[{'role': 'user', 'content': 'If you can hear me say hi.'},
 {'role': 'assistant',
  'content': 'As an AI language model, I cannot hear or speak. However, I can understand and respond to text-based messages. If you have any questions or need assistance, feel free to ask.'}]