
<img src="https://cdn.oreillystatic.com/images/sitewide-headers/oreilly_logo_mark_red.svg"/>&nbsp;&nbsp;<font size="16"><b>AI, ML and GenAI in the Lakehouse</b></font>
<img style="float: left; margin: 0px 15px 15px 0px; width:30%; height: auto;" src="https://i.imgur.com/pQvJTVf.jpeg"   />   


 
  
   Name:          09-01-Machine Learning at Scale with the NYC Taxi Dataset
 
   Author:    Bennie Haelen
   Date:      7-5-2025

   Purpose:   This notebook demonstrates how to prepare data for a RAG solution
                 
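      An outline of the different sections in this notebook:
        1 - Data Ingestion: fetch Wikipedia articles and load them into a DataFrame
        2 - Clean the article text and split it into chunks
        3 - Compute embeddings for the text chunks
        4 - Store the chunks and embeddings in a Delta table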


In [0]:
 %pip install -qq -U llama-index pydantic wikipedia-api requests beautifulsoup4
 %pip install transformers[torch]
 
# Restart the Python process so the packages installed above are picked up
# by the rest of the notebook.
dbutils.library.restartPython()

In [0]:
%run ./9-Common-Code

# Task 1: Fetch Wikipedia articles and load them into a DataFrame

To start, you need to fetch Wikipedia articles and load them into a DataFrame.

Steps:
1. Define a list of Wikipedia article titles to fetch
2. Create a function to fetch Wikipedia content
3. Use Spark to create a DataFrame with the articles
4. Ensure that each article is represented as a separate record in the DataFrame

In [0]:
## Import required libraries
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document
from llama_index.core.utils import set_global_tokenizer
from transformers import AutoTokenizer
from typing import Iterator
from pyspark.sql.functions import col, udf, length, pandas_udf, explode
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import os
import pandas as pd 
import io
import requests
import re
from datetime import datetime

In [0]:
# Configure Spark for processing
# Cap Arrow record batches at 10 rows, which bounds how many rows are handed
# to a pandas UDF per batch.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 10)
# CATALOG_NAME / SCHEMA_NAME are presumably defined by the %run'd common
# notebook -- verify. NOTE(review): table_name is not referenced anywhere
# else in the visible part of this notebook.
table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.lab_wikipedia_raw_text"

## Define Wikipedia articles to fetch (you can modify this list)
# Titles use the underscore form that appears in Wikipedia article URLs.
WIKIPEDIA_TOPICS = [
    "Artificial_intelligence",
    "Machine_learning", 
    "Deep_learning",
    "Natural_language_processing",
    "Computer_vision",
    "Reinforcement_learning",
    "Neural_network",
    "Large_language_model",
    "Transformer_(machine_learning_model)",
    "Generative_artificial_intelligence"
]

In [0]:
def fetch_wikipedia_article(title, timeout=30):
    """
    Fetch the full plain-text content of an English Wikipedia article.

    The text is retrieved through the MediaWiki Action API
    (``action=query`` with ``prop=extracts``).

    Args:
        title: Article title in URL form, i.e. with underscores
            (for example "Machine_learning").
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys 'title', 'content', 'url' and 'fetch_time',
        or None when the article does not exist, the HTTP request fails,
        or the response cannot be interpreted. Errors are printed, never
        raised, so a caller looping over many titles can simply skip failures.
    """
    content_url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title.replace('_', ' '),
        'prop': 'extracts',
        # NOTE: 'exintro' is deliberately omitted. MediaWiki treats a boolean
        # parameter as true whenever it is present, whatever its value, so
        # sending exintro=False would still restrict the extract to the intro.
        'explaintext': 1,
        'exsectionformat': 'plain',
        # Resolve redirect titles to the target article instead of returning
        # the (empty) redirect page.
        'redirects': 1,
    }
    # Identify the client instead of using the generic library default.
    # TODO: add contact information as requested by the Wikimedia User-Agent policy.
    headers = {'User-Agent': 'LakehouseRagLab/1.0 (educational notebook)'}

    try:
        content_response = requests.get(
            content_url, params=params, headers=headers, timeout=timeout
        )
        if content_response.status_code != 200:
            return None

        content_data = content_response.json()
        pages = content_data.get('query', {}).get('pages', {})
        if not pages:
            return None

        # A single title was requested, so only the first page entry matters.
        page_id = next(iter(pages))
        if page_id == '-1':  # The API reports a missing article under id -1
            return None

        return {
            'title': title,
            'content': pages[page_id].get('extract', ''),
            'url': f"https://en.wikipedia.org/wiki/{title}",
            'fetch_time': datetime.now()
        }
    except Exception as e:
        # Best effort by design: report the problem and let the caller move on.
        print(f"Error fetching {title}: {str(e)}")
        return None

In [0]:
## Fetch Wikipedia articles
# Download every configured topic, keeping only those that came back with
# non-empty content; failures are reported and skipped.
print("Fetching Wikipedia articles...")
articles_data = []

for topic in WIKIPEDIA_TOPICS:
    print(f"Fetching: {topic}")
    article_data = fetch_wikipedia_article(topic)

    # Guard clause: nothing returned, or an article with an empty body.
    if not article_data or not article_data['content']:
        print(f"✗ Failed to fetch {topic}")
        continue

    articles_data.append(article_data)
    print(f"✓ Successfully fetched {topic} ({len(article_data['content'])} characters)")

print(f"\nSuccessfully fetched {len(articles_data)} articles")

In [0]:
## Create DataFrame from fetched articles
# Spark schema for the fetched articles: one (column name, type) pair per
# field, every field nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("article_title", StringType()),
        ("content", StringType()),
        ("url", StringType()),
        ("fetch_time", TimestampType()),
    )
])

In [0]:
# Convert to pandas DataFrame first, then to Spark DataFrame
# Fail early with a clear message: without any fetched article there is
# nothing to build the DataFrame from.
if not articles_data:
    raise ValueError(
        "No Wikipedia articles were fetched; cannot build the articles DataFrame"
    )

# Select and rename columns by name rather than by position, so the result
# does not depend on the key order of the fetched article dicts.
pandas_df = pd.DataFrame(
    articles_data, columns=['title', 'content', 'url', 'fetch_time']
).rename(columns={'title': 'article_title'})

df = spark.createDataFrame(pandas_df, schema)
df.display()

# Task 2: Extract and clean the text content and split it into manageable chunks

Next, clean and split the text content into manageable chunks.

Steps:
1. Define a function to clean Wikipedia text
2. Define a function to split the text content into chunks
3. Apply the functions to create a new DataFrame with the text chunks

In [0]:
## Define a function to clean Wikipedia text
def clean_wikipedia_text(text):
    """
    Clean Wikipedia text by removing section headers and reference markers.

    Args:
        text: Raw article text; may be None or empty.

    Returns:
        The cleaned text as a single line with every run of whitespace
        collapsed to one space and no leading/trailing whitespace.
        Returns "" for None or empty input.
    """
    if not text:
        return ""

    # Remove section headers such as "== History ==" first, while the line
    # breaks are still present. Anchoring the pattern to a whole line keeps
    # inline text like "a == b and c == d" intact.
    text = re.sub(
        r'^[ \t]*={2,}[^=\n][^\n]*?={2,}[ \t]*\r?$',
        ' ',
        text,
        flags=re.MULTILINE,
    )

    # Remove references like [1], [2], etc.
    text = re.sub(r'\[\d+\]', '', text)

    # Collapse whitespace last so the removals above leave no double spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [0]:
## Define a function to split the text content into chunks
@pandas_udf("array<string>")
def read_as_chunk(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    Pandas UDF that cleans article text and splits it into chunks.

    Args:
        batch_iter: Iterator over batches, each a pandas Series of raw
            article text (entries may be empty or null).

    Yields:
        For every input batch, a Series of the same length in which each
        entry is the list of chunks for that row. Chunks of 100 characters
        or fewer (after stripping) are dropped; empty/null input gives [].
    """
    # Use the Llama tokenizer as the global llama_index tokenizer. This is
    # set up once per invocation of the iterator, not once per row.
    set_global_tokenizer(
        AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    )
    # Sentence splitter from llama_index to split on sentences
    splitter = SentenceSplitter(chunk_size=500, chunk_overlap=50)

    def chunk_text(content):
        """Return the list of sufficiently long chunks for one article."""
        if not content:
            return []
        cleaned_content = clean_wikipedia_text(content)
        # The splitter works on the plain string directly, so no llama_index
        # Document object needs to be built here.
        return [
            chunk
            for chunk in splitter.split_text(cleaned_content)
            if len(chunk.strip()) > 100  # Filter out very short chunks
        ]

    for batch in batch_iter:
        yield pd.Series([chunk_text(content) for content in batch])

## Apply the chunking function
# 1) attach the list of chunks to every article,
# 2) turn each chunk into its own row (replacing the full-article "content"),
# 3) drop chunks of 100 characters or fewer.
articles_with_chunks = df.withColumn("chunks", read_as_chunk(col("content")))
exploded_chunks = articles_with_chunks.select(
    "article_title",
    "url",
    explode(col("chunks")).alias("content"),
)
df_chunks = exploded_chunks.where(length(col("content")) > 100)

df_chunks.display()
chunk_total = df_chunks.count()
print(f"Created {chunk_total} text chunks from Wikipedia articles")

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType

def get_embedding_udf(batch_size=10):
    """
    Build a Spark UDF that embeds one text value per call.

    The UDF sends the text to the "databricks-bge-large-en" serving endpoint
    through the MLflow deployments client and returns the embedding from the
    first entry of the response.

    Args:
        batch_size: Accepted but not used by the current implementation.

    Returns:
        A UDF with return type array<float>. For empty/blank input, or when
        the endpoint call raises, the UDF returns a vector of 1024 zeros
        (presumably the endpoint's embedding size -- verify) and, on error,
        prints the exception.
    """
    import mlflow.deployments

    client = mlflow.deployments.get_deploy_client("databricks")

    def _zero_vector():
        # Placeholder returned whenever no real embedding is available.
        return [0.0] * 1024

    def embed(text):
        try:
            is_blank = (not text) or (not str(text).strip())
            if is_blank:
                return _zero_vector()

            payload = {"input": [str(text)]}
            result = client.predict(
                endpoint="databricks-bge-large-en",
                inputs=payload,
            )
            first_item = result.data[0]
            return first_item['embedding']
        except Exception as e:
            print(f"Error embedding text: {e}")
            return _zero_vector()

    return udf(embed, ArrayType(FloatType()))


In [0]:
# Add an "embedding" column by applying the embedding UDF to every chunk.
print("Computing embeddings for text chunks...")
embedding_udf = get_embedding_udf()
df_chunk_emd = df_chunks.withColumn("embedding", embedding_udf(col("content")))
df_chunk_emd.display()
embedded_total = df_chunk_emd.count()
print(f"Computed embeddings for {embedded_total} text chunks")

Finally, create a Delta table to store the computed embeddings.

Steps:
1. Create the Delta table schema
2. Save the DataFrame containing the computed embeddings as a Delta table

In [0]:
# Create the Delta table
# Fully qualified target table; CATALOG_NAME / SCHEMA_NAME are presumably
# provided by the %run'd common notebook -- verify.
embedding_table_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.lab_wikipedia_text_embeddings"

# SQL command to create the table
# - id is an identity column, so it is generated rather than supplied by the
#   DataFrame written below.
# - Change Data Feed is enabled on the table (presumably so a downstream
#   index can sync incrementally -- confirm).
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {embedding_table_name} (
  id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  article_title STRING,
  url STRING,
  content STRING,
  embedding ARRAY<FLOAT>
) TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""

spark.sql(create_table_sql)
# NOTE(review): because of IF NOT EXISTS this message is printed even when
# the table already existed and nothing was created.
print(f"Created Delta table: {embedding_table_name}")

## Save the DataFrame as a Delta table
# NOTE(review): append mode adds rows to whatever the table already holds,
# so re-running this cell writes the chunks again on top of earlier runs.
df_chunk_emd.write.mode("append").saveAsTable(embedding_table_name)
print(f"Saved {df_chunk_emd.count()} records to Delta table")

## Verify the data was saved correctly
# Counts all rows in the table, including rows written by previous runs.
verification_df = spark.sql(f"SELECT COUNT(*) as total_records FROM {embedding_table_name}")
verification_df.display()


In [0]:
%sql
-- Inspect the stored chunks and their embeddings.
-- NOTE(review): catalog and schema are hard-coded here, whereas the Python
-- cells build the table name from CATALOG_NAME / SCHEMA_NAME -- confirm they match.
SELECT * FROM book_ai_ml_lakehouse.rag.lab_wikipedia_text_embeddings