In [1]:
!pip install -U langchain-dartmouth > /dev/null
!pip install faiss-cpu > /dev/null

from langchain_dartmouth.llms import ChatDartmouth, DartmouthLLM
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_dartmouth.embeddings import DartmouthEmbeddings

from langchain.docstore.document import Document
from langchain_dartmouth.retrievers.document_compressors import DartmouthReranker

from langchain_community.vectorstores import FAISS
import faiss
import json

from sentence_transformers import SentenceTransformer

import os
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [3]:
os.environ["DARTMOUTH_API_KEY"] = "MbJN6Jit4xtqt466MZOyiKQVYaILO2WJqhUe9xnAs0vffS70hc0GH6Q8EkSimrJMvoXAmclDm3kNOSj0lcpHJIVsywulrmAg1bDfem3bNjqS1fG8GKgwYYPq6ZMIg0RvoD2YAOL9Nmx56rFTE4N3kK7ZmT4qo5SOufcvMzMy9B32lH4JpPsSYEgB0ywTUGN5iOYDukf5dWZnEyxpfl63RePT4cv3VpZbNDBkD4EDl79zkvi0NQUJxOa4szFV7hqEmIfYdP9615o9Eyi7PVvA330f9zSP2oBtDRnmYLInZ48OLgCjJkS3XyO2ypk5cvtT1JYXSHlawmXCy7eBfCxH8l3e"

In [4]:
def format_prompt(prompt):
    """
    This function will format a prompt into what is needed for LangChain to produce ChatML.
    Args:
       prompt: the text to be embedded as human prompt.
    """
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=prompt),
    ]
    return messages

In [5]:
max_new_tokens = 1024
top_p = 0.95 # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature = .95 #  Strictly positive float value used to modulate the logits distribution. A value smaller than 1 decreases randomness (and vice versa), with 0 being equivalent to shifting all probability mass to the most likely token
repetition_penalty = None

system_prompt = "Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity."
kwargs = dict()

llm_chat = ChatDartmouth(model_name="llama-3-1-8b-instruct",
                    temperature = temperature,
                    top_p = top_p,
                    max_tokens = max_new_tokens,
                    model_kwargs=kwargs)

In [6]:
# download LayupList data
!wget -O /content/old_reviews.json https://raw.githubusercontent.com/jeddobson/ENGL64.05-22F/refs/heads/main/data/LayupList/old_reviews.json

--2024-11-15 19:43:24--  https://raw.githubusercontent.com/jeddobson/ENGL64.05-22F/refs/heads/main/data/LayupList/old_reviews.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18938407 (18M) [text/plain]
Saving to: ‘/content/old_reviews.json’


2024-11-15 19:43:24 (194 MB/s) - ‘/content/old_reviews.json’ saved [18938407/18938407]



In [7]:
# open older format reviews and extract comments
# reviews = json.loads(open("old_reviews.json").read())
reviews = json.loads(open("/content/sample_data/policy.json").read())
reviews_text= [r["comments"]["oldReview"] for r in reviews if 'comments' in r]
# how many did we find?
print("found {0} reviews".format(len(reviews_text)))

found 0 reviews


In [8]:
# load a smaller embedding model that will quickly embed all our documents
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# Create document embeddings with embedding model
doc_embeddings = embedding_model.encode(reviews_text)

In [10]:
# display number of documents and embedding width
doc_embeddings.shape

(0,)

In [11]:
# Build FAISS index
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(np.array(doc_embeddings))

# this will retrieve five closest neighbors using document similarity
def retrieve_documents(query, k=5):
    query_embedding = embedding_model.encode([query])[0]
    distances, indices = index.search(np.array([query_embedding]), k)
    return [reviews_text[i] for i in indices[0]]

IndexError: tuple index out of range

In [None]:
query = "I am interested in the very best courses in Chemistry Department. What are the really good courses taught by awesome professors?"

# Retrieve relevant documents for our query
retrieved_docs = retrieve_documents(query,k=10)

# Join query with context.
context = query + "\n" + "\n".join(retrieved_docs)

In [None]:
# Generate!
output = llm_chat.invoke(format_prompt(context))
print(output.content)

In [None]:
# Display context sent to LLM
context

In [29]:
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from langchain_dartmouth.llms import ChatDartmouth

# Set parameters for the language model
max_new_tokens = 1024
top_p = 0.95
temperature = 0.95
repetition_penalty = None

# System prompt for the language model
system_prompt = "Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity."

# Initialize the language model
llm_chat = ChatDartmouth(
    model_name="llama-3-1-8b-instruct",
    temperature=temperature,
    top_p=top_p,
    max_tokens=max_new_tokens,
)

# Load the policies from the JSON file (adjust the path as needed)
with open("/content/sample_data/policy.json") as f:
    reviews = json.load(f)

# Collect all relevant text from the JSON structure
reviews_text = []
for goal in reviews:
    # Append goal description
    reviews_text.append(goal["description"])
    # Collect major activities text
    for activity in goal["majorActivities"]:
        reviews_text.append(activity["description"])
        reviews_text.extend(activity["activities"])  # Add all activities

# Count the total number of loaded reviews
print("Found {0} reviews".format(len(reviews_text)))

# Load a smaller embedding model for document embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create document embeddings
doc_embeddings = embedding_model.encode(reviews_text)

# Build a FAISS index with the document embeddings
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(np.array(doc_embeddings))

# Function to retrieve documents based on a query
def retrieve_documents(query, k=5):
    query_embedding = embedding_model.encode([query])[0]
    distances, indices = index.search(np.array([query_embedding]), k)
    return [reviews_text[i] for i in indices[0]]

# Example query
query = "what is massachusetts doing in regard to outdoor education"

# Retrieve relevant documents for the query
retrieved_docs = retrieve_documents(query, k=10)

# Combine query with the context from the retrieved documents
context = query + "\n" + "\n".join(retrieved_docs)

# Function to format the context prompt for the model
def format_prompt(context):
    return f"{system_prompt}\n{context}"

# Generate output from the language model
output = llm_chat.invoke(format_prompt(context))
print(output.content)

Found 56 reviews
Massachusetts is actively implementing various initiatives to enhance outdoor education, in line with the provided goals, focusing on equity, inclusion, and student success. Here are some key developments:

1.  **Outdoor Education Initiative**: The Massachusetts Department of Elementary and Secondary Education (DESE) has launched an Outdoor Education Initiative, which aims to increase access to outdoor education for all students. This initiative includes providing funding for outdoor education programs, promoting partnerships between schools and outdoor education organizations, and developing a statewide outdoor education framework.

2.  **STEM Education and Outdoor Learning**: Massachusetts is also emphasizing the connection between STEM education and outdoor learning. The state's STEM Education Plan includes outdoor learning as a key component, recognizing the value of hands-on, experiential learning in nature for student engagement and academic achievement.

3.  **M