<a href="https://colab.research.google.com/github/acdc-digital/acdc.cooksite/blob/master/colab_files/solomon_chat_v3_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### a. import dependencies & statements

In [None]:
! pip install "deeplake[enterprise]"
! pip install llama-index
! pip install langchain
! pip install nltk
! pip install openai
! pip install pandas
! pip install pdfminer
! pip install pdfminer.six
! pip install plotly
! pip install -U scikit-learn
! pip install torch
! pip install transformers
! pip install tqdm

In [None]:
# MASTER-CODEBLOCK
##################################

import os
import json
import matplotlib.pyplot as plt
import nltk
import numpy as np
import openai
import pandas as pd
import re
import torch
from collections import Counter
from deeplake.core.vectorstore import VectorStore
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationTokenBufferMemory
from langchain.prompts import PromptTemplate
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake
import matplotlib
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
from pdfminer.high_level import extract_text, extract_pages
from sklearn.cluster import KMeans
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast

In [None]:
# Download necessary NLTK data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Set the max cell size for text (32,767 characters = true limit)
MAX_CELL_SIZE = 11250

# Directory containing your PDFs
pdf_directory = '/content/source_docs'

# List to store data
data = []

# Wrap the loop with tqdm for a progress bar
for pdf_file in tqdm(os.listdir(pdf_directory)):
    if pdf_file.endswith('.pdf'):
        file_path = os.path.join(pdf_directory, pdf_file)
        try:
            print(f"Processing {pdf_file}...")
            text = extract_text(file_path)

            if not text:
                print(f"Extracted text is empty for {pdf_file}")
                continue

            text_words_set = set(text.lower().split())
            filtered_words_set = text_words_set - stop_words
            filtered_text = ' '.join(filtered_words_set)

            # Basic heuristic: Assuming title is the first line and summary is the second line
            lines = text.split('\n')
            title = lines[0] if len(lines) > 0 else ''
            summary = lines[1] if len(lines) > 1 else ''

            # Metadata extraction
            file_size = os.path.getsize(file_path)
            number_of_pages = len(list(extract_pages(file_path)))

            # Filter Stopwords
            text_words = text.split()
            filtered_words = [word for word in text_words if word.lower() not in stop_words]
            filtered_text = ' '.join(filtered_words)

            # Text normalization
            text = text.lower()

            # Chunking the content
            chunks = [filtered_text[i:i+MAX_CELL_SIZE] for i in range(0, len(filtered_text), MAX_CELL_SIZE)]
            for chunk in chunks:
                data.append({
                    'filename': pdf_file,
                    'title_or_heading': title,
                    'content_summary': summary,
                    'content_chunk': chunk,
                    'file_size': file_size,
                    'number_of_pages': number_of_pages
                })

        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")
            continue

# Convert the list to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV for further analysis with escapechar
df.to_csv('/content/source_csv/source_docs.csv', index=False, escapechar='\\')

## print("Listing directory contents:")
## print(os.listdir(pdf_directory))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  0%|          | 0/5 [00:00<?, ?it/s]

Processing Contrast-Context-Scaling.pdf...


 40%|████      | 2/5 [00:04<00:07,  2.46s/it]

Processing Concepts-all-Need.pdf...


 60%|██████    | 3/5 [00:06<00:04,  2.15s/it]

Processing Prompt-You-Need.pdf...


 80%|████████  | 4/5 [00:12<00:03,  3.38s/it]

Processing AI-Evolution-Education.pdf...


100%|██████████| 5/5 [00:19<00:00,  3.94s/it]


generate the jsonl file, and store the output in the source_csv directory

In [None]:
import pandas as pd
import json
import os  # Import the os module

# Define the directory where you want to save the .jsonl file
save_directory = '/content/source_csv/'

# Read the CSV file into a DataFrame
df = pd.read_csv(os.path.join(save_directory, 'source_docs.csv'), escapechar='\\')

# Extract the 'content_chunk' column
content_chunks = df['content_chunk']

# Tokenize the text (you can use a more advanced tokenizer if you wish)
tokenized_chunks = [chunk.split() for chunk in content_chunks]

# Open a JSONL file to write the JSON objects
with open(os.path.join(save_directory, 'embeddings.jsonl'), 'w') as jsonl_file:
    for idx, tokens in enumerate(tokenized_chunks):
        # Create JSON object and write to JSONL file
        json_obj = {
            "model": "text-embedding-ada-002",
            "input": tokens,
            "metadata": {"row_id": idx}
        }
        jsonl_file.write(json.dumps(json_obj) + '\n')

In [None]:
import os
import subprocess

os.environ['OPENAI_API_KEY'] = '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'

completed_process = subprocess.run([
    "python", "/content/parallel_processor/api_request_parallel_processor.py",
    "--requests_filepath", "/content/source_csv/embeddings.jsonl",
    "--save_filepath", "/content/parallel_processor/output_embeddings.jsonl",
    "--request_url", "https://api.openai.com/v1/embeddings",
    "--max_requests_per_minute", "1500",
    "--max_tokens_per_minute", "6250000",
    "--token_encoding_name", "cl100k_base",
    "--max_attempts", "5",
    "--logging_level", "20"
], capture_output=True, text=True)

print("Return code:", completed_process.returncode)

Return code: 0
Have 0 bytes in stdout:

Have 843 bytes in stderr:
INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Starting request #3
INFO:root:Starting request #4
INFO:root:Starting request #5
INFO:root:Starting request #6
INFO:root:Starting request #7
INFO:root:Starting request #8
INFO:root:Starting request #9
INFO:root:Starting request #10
INFO:root:Starting request #11
INFO:root:Starting request #12
INFO:root:Starting request #13
INFO:root:Starting request #14
INFO:root:Starting request #15
INFO:root:Starting request #16
INFO:root:Starting request #17
INFO:root:Starting request #18
INFO:root:Starting request #19
INFO:root:Starting request #20
INFO:root:Starting request #21
INFO:root:Starting request #22
INFO:root:Starting request #23
INFO:root:Parallel processing complete. Results saved to /content/parallel_processor/output_embeddings.jsonl



In [None]:
from google.colab import files
files.download('/content/parallel_processor/output_embeddings.jsonl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import json

# Initialize an empty list to store the JSON objects
json_list = []

# Read the JSONL file line by line
with open('/content/parallel_processor/output_embeddings.jsonl', 'r') as f:
    for line in f:
        json_obj = json.loads(line.strip())
        json_list.append(json_obj)

# Convert the list of JSON objects to a DataFrame
df_embeddings = pd.json_normalize(json_list)

# Extract the actual numerical embeddings and row_ids
df_embeddings['actual_embedding'] = df_embeddings.apply(lambda row: row[1]['data'][0]['embedding'], axis=1)
df_embeddings['row_id'] = df_embeddings.apply(lambda row: row[2]['row_id'], axis=1)

# Drop unnecessary columns
df_embeddings = df_embeddings[['row_id', 'actual_embedding']]

# DF is your existing DataFrame
df = pd.read_csv('/content/source_csv/source_docs.csv')

# Merge the existing DataFrame with the embeddings DataFrame based on row_id
df_merged = pd.merge(df, df_embeddings, left_index=True, right_on='row_id', how='left')

print(df_merged.head())

# Save the merged DataFrame to a new CSV file
df_merged.to_csv('/content/source_ada/source_ada.csv', index=False)
print(df_merged.columns)

                        filename title_or_heading content_summary  \
3   Contrast-Context-Scaling.pdf                3               2   
11  Contrast-Context-Scaling.pdf                3               2   
13  Contrast-Context-Scaling.pdf                3               2   
2   Contrast-Context-Scaling.pdf                3               2   
20  Contrast-Context-Scaling.pdf                3               2   

                                        content_chunk  file_size  \
3   3 2 0 2 l u J 6 ] L C . c [ 1 v 0 7 1 3 0 . 7 ...     819956   
11  level data loading pipeline minor self-attenti...     819956   
13  text length. FOT demonstrates high accuracy ev...     819956   
2   rplexity PG19 References Joshua Ainslie, Tao L...     819956   
20  Katherine Lee, Sharan Narang, Michael Matena, ...     819956   

    number_of_pages  row_id                                   actual_embedding  
3                27       0  [0.00064371835, 0.00905348, 0.004182098, -0.02...  
11            

In [None]:
# MASTER-CODEBLOCK
##################################

os.environ['ACTIVELOOP_TOKEN'] = 'eyJhbGciOiJIUzUxMiIsImlhdCI6MTY5MDIwMDcxNCwiZXhwIjoxNzA0MDI4MjU5fQ.eyJpZCI6ImFjZGNkaWdpdGFsIn0.RwLAU6QDB2GrMGyu2XImbHajwsEpb6PMDe_IGQ8pzE4tEKCQHXUZCAdry4f9KUtt2eHktNpxBq7XI6AkDA9Mnw'
# Load DataFrame from CSV
df = pd.read_csv('/content/source_ada/source_ada.csv')

# Prepare data
chunked_text = df['content_chunk'].tolist()
source_texts = df['filename'].tolist()
precomputed_embeddings = df['actual_embedding'].apply(eval).tolist()  # Assuming embeddings are stored as strings

# Initialize Vector Store with the Hub URL
vector_store_path = "hub://solomon/solov3-enginetest1"
vector_store = VectorStore(
    path=vector_store_path,
)

# Add data to Vector Store
vector_store.add(
    text=chunked_text,
    embedding=precomputed_embeddings,
    metadata=[{"source": source_text} for source_text in source_texts]
)

Your Deep Lake dataset has been successfully created!


100%|██████████| 24/24 [00:01<00:00, 23.71it/s]
/

Dataset(path='hub://solomon/solov3-enginetest1', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (24, 1)      str     None   
 metadata     json      (24, 1)      str     None   
 embedding  embedding  (24, 1536)  float32   None   
    id        text      (24, 1)      str     None   


 

In [None]:
print(os.environ.get('OPENAI_API_KEY'))


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


In [None]:
# MASTER-CODEBLOCK
##################################

# Initialize OpenAI
import os
import openai
import openai
openai.api_key = "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"

# Your embedding function
def embedding_function(texts, model="text-embedding-ada-002"):
    if isinstance(texts, str):
        texts = [texts]
    texts = [t.replace("\n", " ") for t in texts]
    return [data['embedding'] for data in openai.Embedding.create(input=texts, model=model)['data']]

# Wrap your function in a class with an embed_query method
class MyEmbeddingFunction:
    def __init__(self, func):
        self.func = func

    def embed_query(self, query):
        return self.func(query)

# Initialize DeepLake database with the embedding_function
embedding_function_obj = MyEmbeddingFunction(embedding_function)
db = DeepLake(dataset_path="hub://solomon/solov3-enginetest1", embedding=embedding_function_obj, read_only=False)

# Initialize Retriever with parameters
retriever = db.as_retriever()
retriever.search_kwargs.update({
    'distance_metric': 'cos',
    'k': 1
})

# Define the PromptTemplate
template = """You are Solomon, a specialized personal assistant. Your expertise spans all areas of life, including technical documents and complex arguments. Use the following pieces of retrieved context to answer any questions that come up. If you don't know the answer, just say that you don't know.
{context}
Question: {question}
Helpful Answer:
"""

# Create a PromptTemplate object
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Initialize LLM for QA
model = ChatOpenAI(model='gpt-4')

# Initialize Langchain Memory with Token Buffer
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
os.environ["LANGCHAIN_API_KEY"] = "ls__fbfe7701decf42138ac5d036eb60afc5"
os.environ["LANGCHAIN_PROJECT"] = "solomon.v2.2"

memory = ConversationTokenBufferMemory(  # <-- Changed to ConversationTokenBufferMemory
    llm=model,
    max_token_limit=450,
    memory_key="chat_history",
    return_messages=True
)

# Initialize Conversational Retrieval Chain with Memory
qa = ConversationalRetrievalChain.from_llm(
    llm=model,
    retriever=retriever,
    memory=memory
)

# Define your search query
search_query = 'How is AI being used in the evolution of education?'

# Count the number of tokens in the search query and prompt
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
search_query_tokens = tokenizer.encode(search_query, truncation=True)
prompt_tokens = tokenizer.encode(template, truncation=True)
num_search_query_tokens = len(search_query_tokens)
num_prompt_tokens = len(prompt_tokens)

# Run the QA model with top-k documents
result = qa({"question": search_query})
response = result['answer']
print("QA Response:")
print(response)

# Count the number of tokens in the generated response
response_tokens = tokenizer.encode(response, truncation=True)
num_response_tokens = len(response_tokens)

# Print token counts
print(f"Number of tokens in the search query: {num_search_query_tokens}")
print(f"Number of tokens in the prompt: {num_prompt_tokens}")
print(f"Number of tokens in the generated response: {num_response_tokens}")

# Extract and print unique sources (Top 3)
print("\nUnique Sources:")
docs = retriever.get_relevant_documents(search_query)
unique_sources = set(doc.metadata.get('source', 'N/A') for doc in docs)
unique_sources_top3 = list(unique_sources)[:3]
print(unique_sources_top3)

Deep Lake Dataset in hub://solomon/solov3-enginetest1 already exists, loading from the storage
QA Response:
I'm sorry, but the provided information does not contain details on how AI is being used in the evolution of education.
Number of tokens in the search query: 11
Number of tokens in the prompt: 71
Number of tokens in the generated response: 24

Unique Sources:
['Prompt-You-Need.pdf']
