In [None]:
!pip install torch==2.1.0
!pip install transformers==4.31.0
!pip install sentence-transformers==2.2.2
!pip install pinecone-client==2.2.2
!pip install datasets==2.14.0
!pip install accelerate==0.21.0
!pip install einops==0.6.1
!pip install langchain==0.0.240
!pip install xformers==0.0.20
!pip install bitsandbytes==0.41.0
!pip install llama-cpp-python==0.1.78
!pip install tiktoken

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.0
    Uninstalling tokenizers-0.15.0:
      Successfully uninstalled tokenizers-0.15.0
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed tokenizers-0.13.3 transformers-4.31.0
Collecting sentence-transforme

In [None]:
#@title Vector Tokenizer

from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed documents and queries.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is handed to both the model loader and the encoder.
embedding_load_options = {'device': device}
embedding_encode_options = {'device': device, 'batch_size': 32}

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=embedding_load_options,
    encode_kwargs=embedding_encode_options,
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
import os
from getpass import getpass

import pinecone

# Credentials are never stored in the notebook: the API key is read from the
# PINECONE_API_KEY environment variable, and when that is unset the user is
# prompted for it (the input is not echoed and not saved in the cell output).
# NOTE(review): an API key was previously committed in this cell -- revoke and
# rotate it in the Pinecone console (app.pinecone.io).
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')

# The environment name is not a secret; fall back to the free-tier default.
pinecone_environment = os.environ.get('PINECONE_ENVIRONMENT') or 'gcp-starter'

pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_environment
)

In [None]:
# Two throwaway sentences used to probe what the embedding model returns.
docs = ["this is one document", "and another document"]

embeddings = embed_model.embed_documents(docs)

# Report how many vectors came back and the length of the first one.
embedding_count = len(embeddings)
embedding_dim = len(embeddings[0])
print(f"We have {embedding_count} doc embeddings, each with "
      f"a dimensionality of {embedding_dim}.")

We have 2 doc embeddings, each with a dimensionality of 384.


In [None]:
import time

index_name = 'pookie-bear'

# Create the index only when Pinecone does not already list it.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # The index dimension must equal the length of the embedding vectors.
    embedding_dimension = len(embeddings[0])
    pinecone.create_index(index_name, dimension=embedding_dimension, metric='cosine')

    # Poll once per second until the new index reports itself as ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

In [None]:
# Open a handle to the index and display its current statistics.
index = pinecone.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 384,
 'index_fullness': 0.10241,
 'namespaces': {'': {'vector_count': 10241}},
 'total_vector_count': 10241}

In [None]:
from datasets import load_dataset

# Fetch the 'train' split of the counseling-conversations dataset and show
# its summary as the cell output.
dataset_id = 'Amod/mental_health_counseling_conversations'
data = load_dataset(dataset_id, split='train')
data

Downloading readme:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Context', 'Response'],
    num_rows: 3512
})

In [None]:
import pandas as pd

# Convert the loaded dataset to a DataFrame exactly once. The guard keeps this
# cell re-runnable: on a second run `data` is already a DataFrame, which has
# no `.to_pandas()` method.
if not isinstance(data, pd.DataFrame):
    data = data.to_pandas()

batch_size = 32   # rows embedded and upserted per request
id_offset = 0     # number of rows already processed in earlier batches

for batch_start in range(0, len(data), batch_size):
    # `.iloc` slicing clamps at the end of the frame, so the last batch may be
    # shorter than `batch_size`.
    batch = data.iloc[batch_start:batch_start + batch_size]

    # Pull both text columns straight from the frame (no row-by-row iteration).
    context_texts = batch['Context'].tolist()
    response_texts = batch['Response'].tolist()

    # Only the context text is embedded: it is the vector stored for each
    # record. The response travels along as metadata and is not embedded.
    context_embeds = embed_model.embed_documents(context_texts)

    vectors = []
    rows = zip(context_texts, response_texts, context_embeds)
    for position, (context, response, context_embed) in enumerate(rows, start=1):
        # Metadata holds only the two text fields.
        metadata = {'context': context, 'response': response}

        # ID = 1-based running row number followed by the texts, with
        # non-ASCII characters dropped and the result cut to 512 characters.
        # The numeric prefix is what keeps IDs distinct; the format is kept
        # unchanged so re-running overwrites existing records instead of
        # duplicating them.
        raw_id = f"{id_offset + position}_{context}_{response}"
        vector_id = raw_id.encode('ascii', 'ignore').decode('ascii')[:512]

        vectors.append((vector_id, context_embed, metadata))

    # One request per batch rather than one request per vector.
    index.upsert(vectors=vectors)

    id_offset += len(batch)

In [None]:
# Query the index statistics again and display them as the cell output.
index_stats_after_upsert = index.describe_index_stats()
index_stats_after_upsert

{'dimension': 384,
 'index_fullness': 0.10241,
 'namespaces': {'': {'vector_count': 10241}},
 'total_vector_count': 10241}

In [None]:
import os
from getpass import getpass

from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# The Hugging Face access token is never stored in the notebook: it is read
# from the environment, and when unset the user is prompted for it (the input
# is not echoed and not saved in the cell output).
# NOTE(review): a token was previously committed in this cell -- revoke it in
# the Hugging Face account settings and issue a new one.
hf_auth = (
    os.environ.get('HF_TOKEN')
    or os.environ.get('HUGGING_FACE_HUB_TOKEN')
    or getpass('Hugging Face access token: ')
)

# begin initializing HF items, need auth token for these
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# `trust_remote_code` is intentionally not passed: the Llama architecture is
# implemented inside `transformers`, so there is no need to allow code from
# the model repository to execute locally.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',  # let accelerate decide where each layer is placed
    use_auth_token=hf_auth
)
model.eval()  # inference only
print(f"Model loaded on {device}")

config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [None]:
# Load the tokenizer published under `model_id`, authenticating with `hf_auth`.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Text-generation pipeline around the quantized model and its tokenizer.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    # Deterministic (greedy) decoding. This is stated explicitly instead of
    # via `temperature=0.0`: a zero temperature is not a valid sampling value
    # and only "worked" because sampling was disabled by default anyway.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [None]:
# Smoke-test the raw pipeline and print the text of its first result.
fission_prompt = "Explain to me the difference between nuclear fission and fusion."
res = generate_text(fission_prompt)
print(res[0]["generated_text"])

Explain to me the difference between nuclear fission and fusion.

Nuclear fission is a process in which an atomic nucleus splits into two or more smaller nuclei, releasing a large amount of energy in the process. This process typically occurs when an atom is bombarded with a high-energy particle, such as a neutron. When the nucleus splits, it releases a large amount of energy in the form of kinetic energy of the fragments and gamma radiation.

Nuclear fusion, on the other hand, is the process by which two or more atomic nuclei combine to form a single, heavier nucleus. This process also releases a large amount of energy, but it does so at much higher temperatures than those required for fission. In order to achieve fusion, the atoms must be heated to incredibly high temperatures, typically over 100 million degrees Celsius.

One key difference between fission and fusion is the direction of the energy release. In fission, the energy is released outward from the nucleus, while in fusion, 

In [None]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain can call it as an LLM.
llm = HuggingFacePipeline(
    pipeline=generate_text,
)

In [None]:
# Send a prompt through the LangChain wrapper; the returned string is the
# cell output.
wrapped_prompt = "Explain to me the difference between nuclear fission and fusion."
llm(prompt=wrapped_prompt)

'\n\nNuclear fission is a process in which an atomic nucleus splits into two or more smaller nuclei, releasing a large amount of energy in the process. This process typically occurs when an atom is bombarded with a high-energy particle, such as a neutron. When the nucleus splits, it releases a large amount of energy in the form of kinetic energy of the fragments and gamma radiation.\n\nNuclear fusion, on the other hand, is the process by which two or more atomic nuclei combine to form a single, heavier nucleus. This process also releases a large amount of energy, but it does so at much higher temperatures than those required for fission. In order to achieve fusion, the atoms must be heated to incredibly high temperatures, typically over 100 million degrees Celsius.\n\nOne key difference between fission and fusion is the direction of the energy release. In fission, the energy is released outward from the nucleus, while in fusion, the energy is released inward towards the center of the n

In [None]:
from langchain.vectorstores import Pinecone

# Metadata key whose value becomes the page content of each retrieved document.
# The upsert loop in this notebook stores only 'context' and 'response' in the
# metadata, so the key has to be one of those; with the previous value 'text'
# the freshly upserted records had no page content to return.
# 'response' is used so the retrieved passages are the counselor answers, while
# the matching itself still happens on the embedded context.
# NOTE(review): records written by an earlier notebook version under a
# different key will not be returned with this setting -- confirm the index
# content.
text_field = 'response'

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone

# The tokenizer is not reloaded here: it was already created from the same
# `model_id` earlier in the notebook and the pipeline holds that instance.

# Metadata key whose value becomes the page content of each retrieved document.
# The upsert loop in this notebook stores only 'context' and 'response' in the
# metadata, so 'text' (the previous value) matched nothing it wrote.
# 'response' makes the retrieved passages the counselor answers.
text_field = 'response'

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

# Retrieval-augmented QA chain: the retriever's hits are "stuffed" into a
# single prompt for the LLM.
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever()
)

query = 'what makes llama 2 special?'

# Kept as the LAST expression of the cell so the notebook actually displays
# the hits; as a mid-cell expression the result was silently discarded.
vectorstore.similarity_search(
    query,  # the search query
    k=3  # returns top 3 most relevant chunks of text
)



In [None]:
# Ask the retrieval-augmented chain a question; its result is the cell output.
llama_question = 'what is so special about llama 2?'
rag_pipeline(llama_question)



{'query': 'what is so special about llama 2?',
 'result': ' Llama 2 is a popular concentration game where players are expected to focus on one specific thing (such as a red dot) for a set amount of time without getting distracted by other stimuli. The game is designed to help improve concentration and self-discipline.'}

In [None]:
# A counseling-style query, written as adjacent string literals that Python
# joins into one string before it is passed to the chain.
counseling_query = (
    'Ive never been able to talk with my parents. '
    'My parents are in their sixties while I am a teenager. '
    'I love both of them but not their personalities. '
    'I feel that they do not take me seriously whenever I talk about a serious event in my life. '
    'If my dad doesnt believe me, then my mom goes along with my dad and acts like she doesnt believe me either. '
    'Im a pansexual, but I cant trust my own parents. '
    'Ive fought depression and won; however, stress and anxiety are killing me.'
)
rag_pipeline(counseling_query)

{'query': 'Ive never been able to talk with my parents. My parents are in their sixties while I am a teenager. I love both of them but not their personalities. I feel that they do not take me seriously whenever I talk about a serious event in my life. If my dad doesnt believe me, then my mom goes along with my dad and acts like she doesnt believe me either. Im a pansexual, but I cant trust my own parents. Ive fought depression and won; however, stress and anxiety are killing me.',
 'result': ' Sorry to hear that you\'re struggling with your parents and feeling unheard and untrusted. It\'s understandable that you feel this way, especially since you\'ve fought depression and won, but stress and anxiety are still affecting you. It\'s important to remember that you deserve to be heard and understood, and it\'s okay to communicate your needs and feelings to your parents. One suggestion is to talk to your parents during a non-emotional moment when the discussion is not already heated. Prepar