In [0]:
# load articles
result = spark.sql("SELECT * FROM hackathon_schema.articles")

df = result.toPandas()
print(df.head(5))

                                                 url  ...                                          sentiment
0  https://www.msn.com/en-us/news/technology/tesl...  ...  {"layoffs": null, "restructuring": null, "boar...
1  https://www.wired.com/story/zhidou-rainbow-ev-...  ...  {"layoffs": null, "restructuring": 0.2, "board...
2  https://www.forbes.com/sites/brookecrothers/20...  ...  {"layoffs": null, "restructuring": null, "boar...
3  https://www.msn.com/en-us/autos/news/tesla-is-...  ...  {"layoffs": null, "restructuring": null, "boar...
4  https://www.msn.com/en-us/news/technology/tesl...  ...  {"layoffs": null, "restructuring": null, "boar...

[5 rows x 5 columns]


In [0]:
%pip install databricks-genai-inference langchain databricks-vectorsearch mlflow transformers
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting mlflow
  Downloading mlflow-2.12.1-py3-none-any.whl (20.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.2/20.2 MB 44.2 MB/s eta 0:00:00
Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.0/9.0 MB 83.3 MB/s eta 0:00:00
Collecting markdown<4,>=3.3
  Downloading Markdown-3.6-py3-none-any.whl (105 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.4/105.4 kB 20.3 MB/s eta 0:00:00
Collecting Flask<4
  Downloading flask-3.0.3-py3-none-any.whl (101 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 101.7/101.7 kB 23.2 MB/s eta 0:00:00
Collecting querystring-parser<2
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting graphene<4
  Downloading graphene-3.3-py2.py3-none-any.whl (128 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 128.2/128.2 kB 20.4 MB/s eta 0:0

In [0]:
%sql
CREATE TABLE IF NOT EXISTS hackathon_schema.source_table (
  id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  content STRING
) TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true'); 

In [0]:
# verify
result = spark.sql("SELECT * FROM hackathon_schema.source_table")
result.show()

+---+-------+
| id|content|
+---+-------+
+---+-------+



Splitting docs

In [0]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_dataframe_into_chunks(df, chunk_size=1000, chunk_overlap=200):
    """
    Splits the text in each row of the DataFrame into chunks, keeping the URL and date as metadata.

    Parameters:
    - df (pandas.DataFrame): The DataFrame with 'url', 'date', and 'content' columns.
    - chunk_size (int): The maximum size of each chunk.
    - chunk_overlap (int): The overlap between chunks.

    Returns:
    - pandas.DataFrame: A new DataFrame with columns 'content' and 'metadata' where metadata is a dictionary containing the 'url' and 'date'.
    """

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunk_list = []

    for _, row in df.iterrows():
        url = row['url']
        date = row['published_date']
        content = row['content']
        chunks = text_splitter.create_documents([content], [{"url": url, "published_date": date}])
        for chunk in chunks:
            chunk_dict = {"content": chunk.page_content, "metadata": chunk.metadata}
            chunk_list.append(chunk_dict)

    return pd.DataFrame(chunk_list)

# Apply the function
chunked_df = split_dataframe_into_chunks(df)

# Display the result
print(chunked_df.head())

                                             content                                           metadata
0  Tesla's 'apocalypse-proof' Cybertruck thwarted...  {'url': 'https://www.msn.com/en-us/news/techno...
1  As Elon Musk Abandons the $25K Tesla, This EV ...  {'url': 'https://www.wired.com/story/zhidou-ra...
2  priced from 31,900 yuan before subsidiesthat's...  {'url': 'https://www.wired.com/story/zhidou-ra...
3  Rainbow is fitted with a 30-kW 40-horsepower m...  {'url': 'https://www.wired.com/story/zhidou-ra...
4  millimeters 64.2 inches tall. This is, of cour...  {'url': 'https://www.wired.com/story/zhidou-ra...


In [0]:
# Combine content and metadata into one column as a dict
chunked_df['content'] = chunked_df.apply(lambda row: {"content": row['content'], "metadata": {"url": row['url'], "published_date": row['published_date']}}, axis=1)
chunked_df.drop(columns=['metadata'], inplace=True)
print(chunked_df['content'])

0     {'content': 'Tesla's 'apocalypse-proof' Cybert...
1     {'content': 'As Elon Musk Abandons the $25K Te...
2     {'content': 'Longer-Range Tesla Model Y Debuts...
3     {'content': 'Tesla is facing major competition...
4     {'content': 'Tesla's Optimus video fub is lead...
5     {'content': 'Tesla plans to charge some Model ...
6     {'content': 'Hyundai antes up $1B for AV start...
7     {'content': 'These are the kinds of San Franci...
8     {'content': 'Musk just slashed Tesla's Superch...
9     {'content': 'Is Musk Crazy Smart for Axing Tes...
10    {'content': 'Can Elon Musk's Tesla keep stradd...
11    {'content': ''I'm Still in Denial': Tesla Layo...
12    {'content': 'Have the wheels come off for Tesl...
13    {'content': '$299 Tesla Model 3 'Too Good' Lea...
14    {'content': 'Tesla pushes to legalise driverle...
15    {'content': 'Tesla's Supercharger layoffs coul...
16    {'content': 'What's happening at Tesla? Here's...
17    {'content': 'How U.S. safety regulators ha

In [0]:
# push to the source table
spark_df = spark.createDataFrame(chunked_df)

# Write data to the articles table
spark_df.write.format("delta").mode("append").saveAsTable("hackathon_schema.source_table")

# Verify the insertion
result = spark.sql("SELECT * FROM hackathon_schema.source_table LIMIT 10")
result.show()

+---+--------------------+
| id|             content|
+---+--------------------+
|  1|Tesla's 'apocalyp...|
|  2|As Elon Musk Aban...|
|  3|priced from 31,90...|
|  4|Rainbow is fitted...|
|  5|millimeters 64.2 ...|
|  6|the 60s, their ti...|
|  7|revealed by Formu...|
|  8|and anything else...|
|  9|effort led by Gee...|
| 10|attempts to build...|
+---+--------------------+



Create Vector Index

In [0]:
# create vs endpoint
from databricks.vector_search.client import VectorSearchClient
vsc = VectorSearchClient()

VECTOR_SEARCH_ENDPOINT_NAME = "hackathon_vs_endpoint"
vsc.create_endpoint(name=VECTOR_SEARCH_ENDPOINT_NAME, endpoint_type="STANDARD")
print(f"Endpoint named {VECTOR_SEARCH_ENDPOINT_NAME} is ready.")

[NOTICE] Using a notebook authentication token. Recommended for development only. For improved performance, please use Service Principal based authentication. To disable this message, pass disable_notice=True to VectorSearchClient().
Endpoint named hackathon_vs_endpoint is ready.


In [0]:
# create index
index = vsc.create_delta_sync_index(
    endpoint_name="hackathon_vs_endpoint",
    index_name="my_test_workspace.hackathon_schema.vs_index",
    source_table_name="my_test_workspace.hackathon_schema.source_table",
    pipeline_type="CONTINUOUS",
    primary_key="id",
    embedding_source_column='content',
    embedding_model_endpoint_name="databricks-bge-large-en"
)

In [0]:
# retrieve the index
index = vsc.get_index(
    endpoint_name="hackathon_vs_endpoint",
    index_name="my_test_workspace.hackathon_schema.vs_index"
)

In [0]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatDatabricks
from databricks.vector_search.client import VectorSearchClient
from langchain_community.vectorstores import DatabricksVectorSearch
from langchain_community.embeddings import DatabricksEmbeddings

embedding_model = DatabricksEmbeddings(endpoint="databricks-bge-large-en")

# Create the retriever
vectorstore = DatabricksVectorSearch(index, text_column="content", embedding=embedding_model).as_retriever()

chat_model = ChatDatabricks(endpoint="databricks-dbrx-instruct", max_tokens = 1000)

TEMPLATE = """You are a helpful assistant.
{context}
Question: {question}
Answer:
"""
prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])

chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=vectorstore,
    chain_type_kwargs={"prompt": prompt}
)

question = {"query": "What's going on with layoffs at Tesla?"}
answer = chain.run(question)
print(answer)



Tesla has announced plans to cut more than 10% of its global workforce, which would affect around 14,000 employees. This week, Tesla let go about 500 employees on its EV Supercharger team. Additionally, some offers of summer internships have been revoked. These layoffs come after Tesla reported the largest quarterly revenue drop in more than a decade and a 20% decrease in EV deliveries in the January-March period compared to the previous quarter. Despite these challenges, Tesla remains committed to reducing costs and increasing efficiency.


In [0]:
# Truncate the table
#spark.sql("TRUNCATE TABLE hackathon_schema.users")
#spark.sql("TRUNCATE TABLE hackathon_schema.articles")

#spark.sql("DROP TABLE IF EXISTS hackathon_schema.source_table")

DataFrame[]