In [1]:
pip install -qU pypdf langchain langchain_community langchain_experimental langchain_openai

Collecting vertexai
  Downloading vertexai-1.60.0-py3-none-any.whl.metadata (10 kB)
Collecting google-cloud-aiplatform==1.60.0 (from google-cloud-aiplatform[all]==1.60.0->vertexai)
  Downloading google_cloud_aiplatform-1.60.0-py2.py3-none-any.whl.metadata (31 kB)
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3 (from google-cloud-aiplatform==1.60.0->google-cloud-aiplatform[all]==1.60.0->vertexai)
  Downloading google_cloud_resource_manager-1.12.5-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting shapely<3.0.0dev (from google-cloud-aiplatform==1.60.0->google-cloud-aiplatform[all]==1.60.0->vertexai)
  Downloading shapely-2.0.5-cp39-cp39-macosx_10_9_x86_64.whl.metadata (7.0 kB)
Collecting docstring-parser<1 (from google-cloud-aiplatform==1.60.0->google-cloud-aiplatform[all]==1.60.0->vertexai)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Downloading vertexai-1.60.0-py3-none-any.whl (7.3 kB)
Downloading google_cloud_aiplatform-1.60.0-py2.py3-none-any.whl (

### Create link to external models in BigQuery

Based on this article:

https://cloud.google.com/blog/products/data-analytics/how-to-use-rag-in-bigquery-to-bolster-llms

In [20]:
# import pandas as pd
import google.auth
from google.oauth2 import service_account
from google.cloud import bigquery

# Build the BigQuery client for this project from a local service-account key file.
GOOGLE_PROJECT = 'gristmill5'
credentials = service_account.Credentials.from_service_account_file(
    "creds/gristmill5-e521e2f08f35.json"
)
client = bigquery.Client(project=GOOGLE_PROJECT, credentials=credentials)

def create_remote_model(model_name, endpoint,
                        connection="projects/gristmill5/locations/us/connections/vertex_ai"):
    """Create or replace a BigQuery remote model in the `rag_test` dataset.

    Both model links in this notebook use the same DDL and differ only in the
    model name and the endpoint, so the statement is built in one place.

    Args:
        model_name: Name of the model inside `<GOOGLE_PROJECT>.rag_test`.
        endpoint: Endpoint name placed in the `OPTIONS (ENDPOINT = ...)` clause.
        connection: Connection path used in the `REMOTE WITH CONNECTION` clause.

    Returns:
        The value of `.result()` on the submitted query job, i.e. the call
        blocks until the DDL statement has finished.
    """
    ddl = f"""
CREATE OR REPLACE MODEL `{GOOGLE_PROJECT}.rag_test.{model_name}`
REMOTE WITH CONNECTION `{connection}`
OPTIONS (ENDPOINT = '{endpoint}');
"""
    return client.query(ddl, project=GOOGLE_PROJECT).result()


# create link to embedding model
create_remote_model("gecko_embedding_model", "textembedding-gecko")

# create link to LLM
create_remote_model("gemini_llm_model", "gemini-1.0-pro")

# Create the table function `rag_test.rag_query`, which accepts a user query, finds similar
# chunks, and passes those chunks to the LLM.
#
# Arguments of the SQL function:
#   querys            - the user's question; embedded with `rag_test.gecko_embedding_model`
#   route_type        - compared against `embedding_type`; 'details' feeds the VECTOR_SEARCH
#                       branch (`v_search`), 'summary' feeds the `a_embeddings` branch
#   words             - word limit interpolated into the prompt text
#   doc_source        - `source` values that rows of `rag_test.embeddings` are restricted to
#   selected_distance - only search hits with `distance` below this value are kept
#
# The two branches are combined with UNION ALL and sent to ML.GENERATE_TEXT using
# `rag_test.gemini_llm_model`. `\\n` is written doubled so that the two-character sequence
# `\n` (not a Python newline) reaches BigQuery inside the SQL string literals.
#
# NOTE(review): the first branch aggregates over `v_search` without GROUP BY, so it presumably
# still yields one row (with a NULL prompt) when `v_search` is empty, e.g. on the 'summary'
# route — verify whether that extra row is wanted.

sql = """
CREATE OR REPLACE TABLE FUNCTION rag_test.rag_query(querys STRING, route_type STRING, words INT64, doc_source ARRAY <STRING>, selected_distance FLOAT64) AS (
with q_embeddings as (
  SELECT
    text_embedding,
    content
  FROM
    ML.GENERATE_TEXT_EMBEDDING(
      MODEL `rag_test.gecko_embedding_model`,
      (
        SELECT
        CAST(querys AS STRING) AS content
      )
    )
),

a_embeddings as (
  select * 
  from `rag_test.embeddings` 
  where source in UNNEST(doc_source)
  and embedding_type = FORMAT('%s', route_type)
  and FORMAT('%s', route_type) = 'summary'
),

v_search as (
  SELECT *
  FROM
    VECTOR_SEARCH( 
      (
        select * 
        from `rag_test.embeddings` 
        where source in UNNEST(doc_source)
        -- and statistics is not null
        and embedding_type = FORMAT('%s', route_type)
        and FORMAT('%s', route_type) = 'details'
      ),
      'text_embedding',
      (select * from q_embeddings where 1=1),
      top_k => 5
    )
  WHERE distance < selected_distance
)

SELECT *
FROM 
  ML.GENERATE_TEXT(
    MODEL
      `rag_test.gemini_llm_model`, 
      (
        -- query for when an answer needs to contain specific details
        SELECT
          CONCAT(FORMAT('Answer this question in less than %d words:\\n\\n %s \\n\\n', words, querys), '\\n\\n by using these text chunks: \\n\\n', STRING_AGG(base.chunk, '\\n')) AS prompt, 
          -- CONCAT(FORMAT('Summarize these text chunks in less than %d words:\\n\\n', words), STRING_AGG(base.chunk, '\\n')) AS prompt, 
          ARRAY_AGG(
            STRUCT(
              base.id as id,
              base.chunk as chunk,
              -- base.statistics as statistics, 
              base.embedding_type,
              -- base.ml_embed_text_status as status,
              distance as distance
            )
          ) source_ids
        FROM v_search

        -- query for when answer needs to be a summary
        UNION ALL SELECT 
          CONCAT(FORMAT('Summarize this text in less than %d words:\\n\\n', words), SUBSTRING(chunk, 1, 32760)) AS prompt, 
          [
            STRUCT(
              id,
              chunk,
              -- statistics, 
              embedding_type,
              -- {} as status,
              0.1 as distance
            )
          ] source_ids
        FROM a_embeddings
      ),
      STRUCT(
        0.4 AS temperature,
        300 AS max_output_tokens,
        0.5 AS top_p,
        5 AS top_k,
        TRUE AS flatten_json_output
      )
  )
)
"""

# Submit the DDL and block until it has completed.
client.query(sql, project=GOOGLE_PROJECT).result()



<google.cloud.bigquery.table._EmptyRowIterator at 0x7f8af97e17f0>

### Ask Hacker News - 2024

In [6]:
# Embed Hacker News stories and write them to `rag_test.embeddings`.
# The inner SELECT takes rows of `bigquery-public-data.hacker_news.full` with type 'story',
# non-null text and a timestamp after 2024-01-01, and concatenates title and text into `content`.
# ML.GENERATE_TEXT_EMBEDDING embeds that content with `rag_test.gecko_embedding_model`, and a
# constant 'Ask HN' value is prepended as the `source` column.
# NOTE(review): `INSERT INTO TABLE ... as SELECT` does not look like GoogleSQL's documented
# INSERT form (`INSERT [INTO] target [(columns)] SELECT ...`) — confirm the statement runs as
# written and that the selected columns line up with the target table.
sql = """
INSERT INTO TABLE `rag_test.embeddings` as
SELECT 'Ask HN' as source, *
FROM ML.GENERATE_TEXT_EMBEDDING(
  MODEL `rag_test.gecko_embedding_model`, (
    SELECT cast(id AS STRING) id, concat(title, ': ', text) as content 
    FROM `bigquery-public-data.hacker_news.full` 
    where text is not null
    and type = 'story'
    and timestamp > '2024-01-01'
    )
  )
"""

# Submit the statement and block until it has completed.
client.query(sql, project=GOOGLE_PROJECT).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f7d88bcff10>

In [None]:
import sqlite3
import pandas as pd

# Notebook display: show every row, and cut wide text columns off at 50 characters.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 50)

# Open the local copy of the chat database.
conn = sqlite3.connect('/Users/anthonychamberas/chat.db')
# conn = sqlite3.connect('~/Library/Messages/chat.db')

# Join handles, chats and messages. `hex_message` is the hex dump of `attributedBody`, and
# `date_utc` is `m.date` divided by 1e9, offset from 2001-01-01 and rendered with the
# "localtime" modifier.
messages = pd.read_sql_query(
    sql='''
    select distinct h.id phone_number, chj.chat_id, m.ROWID message_id, m.text, m.attributedBody, HEX(m.attributedBody) hex_message, m.date, m.handle_id, datetime(m.date/1000000000 + strftime("%s", "2001-01-01") ,"unixepoch","localtime") as date_utc 
    from chat_handle_join chj 
    inner join chat_message_join cmj 
        on chj.chat_id = cmj.chat_id 
        -- and chj.handle_id in (7,8,9)
    inner join message m 
        on cmj.message_id = m.ROWID 
    inner join handle h 
        on chj.handle_id = h.ROWID 
''',
    con=conn,
)

# Translation table mapping code points 0-31 to None, so str.translate drops them.
mapping = {code_point: None for code_point in range(32)}

# Hex dump -> bytes -> leniently decoded text -> control characters removed -> text between
# the first '+' and the last 'iI'.
messages['hex_message'] = messages['hex_message'].apply(bytes.fromhex)
messages['decoded'] = messages['hex_message'].str.decode("utf-8", "ignore")
messages['cleaned'] = messages['decoded'].str.translate(mapping)
messages['stripped'] = messages['cleaned'].str.extract(r'\+(.*)iI')

# One derived column per capture pattern, all taken from `stripped`.
extract_patterns = {
    'reps': r'(\dx\d*)',
    'dots': r'[….|…|..|…|..\s|.. ](\d*)',
    'comma': r'(\d*)[,]',
}
for column_name, pattern in extract_patterns.items():
    messages[column_name] = messages['stripped'].str.extract(pattern)

#messages[['date_utc', 'chat_id', 'handle_id','stripped', 'reps', 'dots']].to_csv('cleaned.csv')
messages[['date_utc', 'phone_number', 'chat_id', 'handle_id','stripped', 'reps', 'dots']]

In [15]:
import os
import pandas as pd
import streamlit as st  # Streamlit provides `st.secrets`; the module name is `streamlit`
from langchain_openai.embeddings import OpenAIEmbeddings

# Expose the OpenAI key held in Streamlit secrets to the OpenAI client via the environment.
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]

embed = OpenAIEmbeddings(model="text-embedding-3-large")

# Two sample sentences, one per row of the frame.
chunk = ['embed this sentence.', 'search for this sentence']
df = pd.DataFrame(chunk, columns=['chunk'])

# embed_documents takes a list of strings, so pass a plain list rather than a pandas Series.
vectors = embed.embed_documents(df['chunk'].tolist())
# Store one embedding (a list of floats) per row.
df['vectors'] = pd.Series(vectors).to_numpy()



from google.cloud import bigquery
from google.oauth2 import service_account

# Build the BigQuery client from the local service-account key file.
GOOGLE_PROJECT = 'gristmill5'
credentials = service_account.Credentials.from_service_account_file(
    "creds/gristmill5-e521e2f08f35.json"
)
client = bigquery.Client(project=GOOGLE_PROJECT, credentials=credentials)

# Let BigQuery infer the schema, then load `df` and wait for the load job to finish.
job_config = bigquery.LoadJobConfig(autodetect=True)
load_job = client.load_table_from_dataframe(
    df,
    "gristmill5.rag_test.table_id",
    job_config=job_config,
)
job = load_job.result()



In [168]:
from utils.connectors import *

# Fetch every stored row for one source document.
sql = """
    select * 
    from `rag_test.embeddings` 
    where source in UNNEST(['Tableau Zen - Visual Analytics Maturity Assessment.docx'])
"""
data = bq_conn(sql)

# Embed the question (single quotes are backslash-escaped first) with the OpenAI model.
query = 'How does it work?'.replace("'", "\\'")
embed = OpenAIEmbeddings(model="text-embedding-3-large")
vector = embed.embed_documents([query])

# Cosine similarity between each stored vector and the query vector.
vectors = np.array(data['vectors'].to_list())
scores = cosine_similarity(vectors, vector)
similarities = pd.DataFrame([row[0] for row in scores], columns=['similarity'])

# Put the scores next to the rows they belong to.
df = pd.concat([data, similarities], axis=1)

# Keep the n rows with the highest similarity (ascending order, best match last).
n = 2
top_n_idx = np.argsort(df['similarity'])[-n:]
references = df.iloc[top_n_idx][['source', 'page', 'chunk', 'similarity']]

display(references)


Unnamed: 0,source,page,chunk,similarity
8,Tableau Zen - Visual Analytics Maturity Assess...,0,Why does it matter to me?,0.280168
4,Tableau Zen - Visual Analytics Maturity Assess...,0,How does it work?,0.999999


In [None]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

# Chat model with temperature 0.
llm = ChatOpenAI(temperature=0)

# Map step: prompt asking for the main themes of the documents substituted for {docs}.
map_template = (
    "The following is a set of documents\n"
    "{docs}\n"
    "Based on this list of docs, please identify the main themes \n"
    "Helpful Answer:"
)
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(prompt=map_prompt, llm=llm)

In [8]:
pip install ics-vtimezones

Collecting ics-vtimezones
  Downloading ics_vtimezones-2020.2-py3-none-any.whl.metadata (2.6 kB)
Collecting importlib_resources>=1.4 (from ics-vtimezones)
  Downloading importlib_resources-6.4.4-py3-none-any.whl.metadata (4.0 kB)
Downloading ics_vtimezones-2020.2-py3-none-any.whl (184 kB)
Downloading importlib_resources-6.4.4-py3-none-any.whl (35 kB)
Installing collected packages: importlib_resources, ics-vtimezones
Successfully installed ics-vtimezones-2020.2 importlib_resources-6.4.4
Note: you may need to restart the kernel to use updated packages.


In [11]:
from ics import Calendar, Event
from datetime import datetime
import pytz

# Define the timezone (e.g., for Eastern Time)
timezone = pytz.timezone('America/New_York')

# List of events with their respective dates and descriptions.
# "start" is the first day; "end", when present, is the last day of the event (inclusive).
events = [
    {"name": "Teachers' Professional Learning", "start": "2024-08-26", "end": "2024-08-27"},
    {"name": "First day for students (K-7, 9)", "start": "2024-08-28"},
    {"name": "First day for students (8, 10-12)", "start": "2024-08-29"},
    {"name": "Labor Day: NO SCHOOL", "start": "2024-09-02"},
    {"name": "Rosh Hashanah: NO SCHOOL", "start": "2024-10-03"},
    {"name": "Indigenous Peoples' Day: NO SCHOOL", "start": "2024-10-14"},
    {"name": "Diwali: NO SCHOOL", "start": "2024-11-01"},
    {"name": "Professional Day: NO SCHOOL", "start": "2024-11-05"},
    {"name": "Veterans Day: NO SCHOOL", "start": "2024-11-11"},
    {"name": "Thanksgiving: NO SCHOOL", "start": "2024-11-28"},
    {"name": "Schools closed; Offices closed", "start": "2024-11-29"},
    {"name": "Winter Vacation (Offices open)", "start": "2024-12-23", "end": "2025-01-01"},
    {"name": "Christmas: NO SCHOOL", "start": "2024-12-25"},
    {"name": "New Year's Day: NO SCHOOL", "start": "2025-01-01"},
    {"name": "Martin Luther King Jr. Day: NO SCHOOL", "start": "2025-01-20"},
    {"name": "Lunar New Year: NO SCHOOL", "start": "2025-01-29"},
    {"name": "Presidents' Day: NO SCHOOL", "start": "2025-02-17"},
    {"name": "February Vacation (Offices open)", "start": "2025-02-17", "end": "2025-02-21"},
    {"name": "Eid al-Fitr: NO SCHOOL", "start": "2025-03-31"},
    {"name": "Good Friday: NO SCHOOL", "start": "2025-04-18"},
    {"name": "Patriots Day: NO SCHOOL", "start": "2025-04-21"},
    {"name": "April Vacation (Offices open)", "start": "2025-04-21", "end": "2025-04-25"},
    {"name": "Memorial Day: NO SCHOOL", "start": "2025-05-26"},
    {"name": "ABRHS Graduation", "start": "2025-06-06"},
    {"name": "180th Day - Last Day/Early Release if no cancellations", "start": "2025-06-18"},
    {"name": "Juneteenth: NO SCHOOL", "start": "2025-06-19"},
    {"name": "185th Day (hold for possible cancellations)", "start": "2025-06-26"}
]

# Create a new calendar
calendar = Calendar()

# Add events to the calendar
for event in events:
    e = Event()
    e.name = event["name"]
    e.begin = timezone.localize(datetime.strptime(event["start"], "%Y-%m-%d"))
    if "end" in event:
        e.end = timezone.localize(datetime.strptime(event["end"], "%Y-%m-%d"))
    # Every entry above is date-only, so it becomes an all-day event whether or not it has an
    # "end". Without this, a multi-day entry stayed a timed event that stopped at 00:00 on its
    # last day, leaving that day out; make_all_day() spans from the begin day through the end day.
    e.make_all_day()
    calendar.events.add(e)

# Add Early Dismissal events (timed, 13:15-14:15 local time on each listed date)
early_dismissal_dates = [
    "2024-09-16", "2024-10-07", "2024-10-21", "2024-11-18", "2024-12-09",
    "2025-01-06", "2025-01-27", "2025-02-10", "2025-02-24", "2025-03-10",
    "2025-03-24", "2025-04-07", "2025-04-28", "2025-05-05", "2025-05-19"
]

for date in early_dismissal_dates:
    e = Event()
    e.name = "Early Dismissal"
    e.begin = timezone.localize(datetime.strptime(date + " 13:15", "%Y-%m-%d %H:%M"))
    e.end = timezone.localize(datetime.strptime(date + " 14:15", "%Y-%m-%d %H:%M"))
    calendar.events.add(e)

# Write the calendar to an .ics file
with open('2024-2025_ABRHS_Academic_Calendar.ics', 'w') as f:
    f.writelines(calendar)


In [None]:
            # Disabled snippet kept as a bare string literal; evaluating it has no effect.
            # The quoted code reads an uploaded file, writes it to `docs/tmp_<name>` for the
            # plain-text, .docx and PDF MIME types, loads it with TextLoader / Docx2txtLoader /
            # PyPDFLoader, and builds `docs` as a list of {'page', 'page_content'} dicts. For JSON
            # uploads it builds the records from a DataFrame limited to the keys chosen in a
            # Streamlit multiselect.
            # NOTE(review): the leading indentation suggests this was pasted from inside a function
            # body; as a standalone cell it presumably does not parse — confirm before running.
            """
            upload_file_name = uploaded_file.name
            upload_file_path = f'docs/tmp_{upload_file_name}'
            upload_file_data = uploaded_file.read()
            upload_file_id = str(uuid.uuid4())


            if uploaded_file.type == 'text/plain':
                # save file locally
                with open(upload_file_path, 'wb') as uf:
                    uf.write(upload_file_data)

                loader = TextLoader(upload_file_path)
                loader_docs = loader.load()

                docs = []
                for d in loader_docs:
                    docs.append({'page':0, 'page_content':d.page_content})

            elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                # save file locally
                with open(upload_file_path, 'wb') as uf:
                    uf.write(upload_file_data)

                loader = Docx2txtLoader(upload_file_path)
                loader_docs = loader.load()

                docs = []
                for d in loader_docs:
                    docs.append({'page':0, 'page_content':d.page_content})

            elif uploaded_file.type == 'application/pdf':
                # save file locally
                with open(upload_file_path, 'wb') as uf:
                    uf.write(upload_file_data)

                reader = PdfReader(upload_file_path)
                extract_images = False if len(reader.pages[0].extract_text()) > 0 else True

                loader_message = "Loading document with OCR..." if extract_images else "Loading document..."
                loader = PyPDFLoader(upload_file_path, extract_images=extract_images)
                loader_docs = loader.load()

                docs = []
                for d in loader_docs:
                    docs.append({'page':d.metadata['page'], 'page_content':d.page_content})

            elif uploaded_file.type == 'application/json':
                docs = json.loads(uploaded_file.getvalue())
                df = pd.DataFrame(docs)

                st.multiselect(label='Select the keys from the JSON file that you\'d like included in the documents.', options=df.columns, key='json_keys')
                selected_columns = st.session_state['json_keys']
                df = df[selected_columns]
                df['page_content'] = df.apply(lambda x: '\n'.join([selected_columns[i]+': '+x[i] for i in range(0, len(x))]), axis=1)
                df['page'] = df.index
                df['metadata'] = df['page'].apply(lambda x: {'page':x})

                docs = df.to_dict(orient='records')
            """


                    # Disabled snippet kept as a bare string literal; evaluating it has no effect.
                    # The quoted code splits each entry of `docs` with a SemanticChunker
                    # (percentile breakpoint, amount 0.1) and collects one record per split
                    # (id, source, page, embedding_type 'details', split_type, chunk) into a DataFrame.
                    """
                    text_splitter = SemanticChunker(
                        OpenAIEmbeddings(), 
                        breakpoint_threshold_type="percentile",
                        breakpoint_threshold_amount=0.1,
                        # number_of_chunks=30
                    )

                    splits_text = []

                    for d in docs:
                        page_content = d['page_content']
                        page = d['page']

                        splits = text_splitter.create_documents([page_content])
                        splits_text.extend([{'id': upload_file_id, 'source':upload_file_name, 'page':page, 'embedding_type':'details', 'split_type':split_type, 'chunk': d.dict()['page_content']} for d in splits])

                    df = pd.DataFrame(splits_text)
                    """

                    # Disabled snippet kept as a bare string literal; evaluating it has no effect.
                    # The quoted code embeds `df['chunk']` with the "text-embedding-3-large" model,
                    # stores the vectors in `df['vectors']`, and resets the index.
                    """
                    # create embeddings
                    embed = OpenAIEmbeddings(model="text-embedding-3-large")

                    vectors = embed.embed_documents(df['chunk'])
                    df['vectors'] = pd.Series(vectors).to_numpy()
                    df = df.reset_index()
                    """


                    # Disabled snippet kept as a bare string literal; evaluating it has no effect.
                    # The quoted code runs an LLMChain over `docs` with a "main themes in 300 or
                    # fewer words" prompt on the "gpt-4o" chat model and stores the output in `summary`.
                    # Inside the quoted code the prompt is delimited by doubled quotes (`""`), which
                    # would need restoring to triple quotes if the snippet is re-enabled.
                    """
                    # summarize document using map-reduce
                    llm = ChatOpenAI(model="gpt-4o", temperature=0)
                    
                    # Map
                    map_template = ""The following is a set of documents
                    {docs}
                    Based on this list of docs, please identify the main themes in 300 or fewer words
                    Helpful Answer:""
                    map_prompt = PromptTemplate.from_template(map_template)
                    map_chain = LLMChain(llm=llm, prompt=map_prompt)
                    summary = map_chain.run(docs)
                    """

                    # Disabled snippet kept as a bare string literal; evaluating it has no effect.
                    # The quoted code loads `df` into `gristmill5.rag_test.embeddings`, loads a one-row
                    # frame (id, name, filename, filetype, contents, summary) into
                    # `gristmill5.rag_test.documents`, and then deletes the temporary upload file if
                    # it exists.
                    """
                    # load embeddings to big query
                    GOOGLE_PROJECT = 'gristmill5'
                    credentials = service_account.Credentials.from_service_account_file("creds/gristmill5-e521e2f08f35.json")
                    client = bigquery.Client(GOOGLE_PROJECT, credentials)
                    job_config = bigquery.LoadJobConfig(autodetect=True)

                    job = client.load_table_from_dataframe(df,"gristmill5.rag_test.embeddings",job_config=job_config).result()

                    # load document contents and summary to big query
                    docs_df = pd.DataFrame([[upload_file_id, friendly_name, upload_file_name, uploaded_file.type, upload_file_data, summary]], columns=['id', 'name', 'filename', 'filetype', 'contents', 'summary'])
                    client.load_table_from_dataframe(docs_df,"gristmill5.rag_test.documents",job_config=job_config).result()
                    if os.path.exists(upload_file_path):
                        os.remove(upload_file_path)
                    """