## Generative AI Assignment
Extract text from a PDF file stored in Azure Blob Storage and generate embeddings using an open-source LLM in an Azure Databricks environment.


In [0]:
%pip install PyPDF2 sentence-transformers

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Restart the Python process so the packages installed by %pip above can be imported
dbutils.library.restartPython()

In [0]:
# Uncomment to remove an existing mount before mounting the container again:
#dbutils.fs.unmount("/mnt/gen-ai-container/")

In [0]:
#Step 1: Mount Azure Blob Storage


# Define the storage account configurations
storage_account_name = "genaicertificationsa"
container_name = "gen-ai-container"
mount_point = "/mnt/gen-ai-container"

# Secret scope / key name under which the storage account access key is stored.
# NOTE(review): these two names are placeholders — create the secret scope and
# secret (Databricks-backed or Azure Key Vault-backed) and adjust them to match.
# The access key that used to be pasted into this cell has been exposed and
# should be rotated in the Azure portal.
secret_scope = "gen-ai-secrets"
secret_key_name = "genaicertificationsa-account-key"

# Configure the storage account access without embedding the key in the notebook
configs = {
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net":
        dbutils.secrets.get(scope=secret_scope, key=secret_key_name)
}

# Mount the storage only when the mount point is not in use yet, so that
# re-running this cell (or Run All) does not fail on an existing mount
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    print(f"{mount_point} is already mounted")
else:
    dbutils.fs.mount(
        source = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
        mount_point = mount_point,
        extra_configs = configs
    )

True

In [0]:
# List all active mounts to verify that /mnt/gen-ai-container is among them

dbutils.fs.mounts()


[MountInfo(mountPoint='/databricks-datasets', source='databricks-datasets', encryptionType=''),
 MountInfo(mountPoint='/Volumes', source='UnityCatalogVolumes', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-tracking', source='databricks/mlflow-tracking', encryptionType=''),
 MountInfo(mountPoint='/mnt/gen-ai-container', source='wasbs://gen-ai-container@genaicertificationsa.blob.core.windows.net/', encryptionType=''),
 MountInfo(mountPoint='/databricks-results', source='databricks-results', encryptionType=''),
 MountInfo(mountPoint='/databricks/mlflow-registry', source='databricks/mlflow-registry', encryptionType=''),
 MountInfo(mountPoint='/Volume', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/volumes', source='DbfsReserved', encryptionType=''),
 MountInfo(mountPoint='/', source='DatabricksRoot', encryptionType=''),
 MountInfo(mountPoint='/volume', source='DbfsReserved', encryptionType='')]

In [0]:
#2. Text Extraction with Validation

import PyPDF2
import os

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Local filesystem path of the PDF to read.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the PDF yields no text other than whitespace.
        RuntimeError: If the file cannot be opened or parsed; the underlying
            error is attached as ``__cause__``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found at path: {file_path}")

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # join() avoids repeated string re-allocation on large documents;
            # `or ""` is defensive in case a page yields None instead of a string
            text_content = "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
    except Exception as e:
        # Chain the original error so the real traceback is not lost
        raise RuntimeError(f"Error extracting text from PDF: {str(e)}") from e

    # Checked outside the try block so the ValueError keeps its type
    if not text_content.strip():
        raise ValueError("No text content extracted from PDF")

    print(f"Successfully extracted {len(text_content)} characters")
    return text_content

In [0]:
#3. Embedding Generation with Validation

from sentence_transformers import SentenceTransformer
import numpy as np

# Loaded models keyed by model name, so repeated calls do not reload the weights
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def generate_embeddings(text, model_name='all-MiniLM-L6-v2'):
    """Encode a string into an embedding vector with a sentence-transformers model.

    Args:
        text: Non-empty string to embed.
        model_name: Name of the sentence-transformers model to use. The model is
            loaded on first use and reused on later calls.

    Returns:
        The ``numpy.ndarray`` returned by ``model.encode(text)``.

    Raises:
        ValueError: If ``text`` is not a string, is empty/whitespace-only, or the
            model does not return a numpy array.
        RuntimeError: If loading the model or encoding fails; the underlying
            error is attached as ``__cause__``.
    """
    if not isinstance(text, str):
        raise ValueError("Input must be a string")

    if not text.strip():
        raise ValueError("Input text is empty")

    try:
        model = _get_embedding_model(model_name)
        # NOTE(review): the whole text is encoded as a single input. Models of
        # this kind presumably truncate input beyond their maximum sequence
        # length, so for a long document the vector may only reflect its
        # beginning — confirm against the model card and consider chunking.
        embeddings = model.encode(text)
    except Exception as e:
        # Chain the original error so the real traceback is not lost
        raise RuntimeError(f"Error generating embeddings: {str(e)}") from e

    # Checked outside the try block so the ValueError keeps its type
    if not isinstance(embeddings, np.ndarray):
        raise ValueError("Failed to generate embeddings")

    print(f"Successfully generated embeddings with shape: {embeddings.shape}")
    return embeddings

In [0]:
# validate the mount

def validate_mount(mount_point="/mnt/gen-ai-container"):
    """Check that a DBFS mount point can be listed.

    Args:
        mount_point: DBFS path to check. Defaults to the container mount point
            used in this notebook.

    Returns:
        True when ``dbutils.fs.ls`` succeeds for ``mount_point``; False when it
        raises (the error is printed rather than propagated).
    """
    try:
        files = dbutils.fs.ls(mount_point)
        print(f"Mount validation successful. Found {len(files)} files/directories")
        return True
    except Exception as e:
        print(f"Mount validation failed: {e}")
        return False

In [0]:
#4. Main Processing Function

def process_pdf_and_generate_embeddings(pdf_path="/dbfs/mnt/gen-ai-container/gen-ai-sample.pdf"):
    """Run the pipeline: validate the mount, extract the PDF text, embed it.

    Args:
        pdf_path: Path of the PDF to process. Defaults to the sample document
            on the mounted container.

    Returns:
        ``(text, embeddings)`` on success. ``(None, None)`` when the mount
        validation fails or any step raises; the error is printed instead of
        being propagated, so callers must check for ``None``.
    """
    # Validate mount first
    if not validate_mount():
        return None, None

    try:
        # Extract text with validation
        text = extract_text_from_pdf(pdf_path)
        print(f"Text extraction successful: {len(text)} characters")

        # Generate embeddings with validation
        embeddings = generate_embeddings(text)
        print(f"Embedding generation successful: {embeddings.shape}")

        return text, embeddings

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it via None
        print(f"Error in processing: {str(e)}")
        return None, None

In [0]:
#5. Main execution
try:
    text, embeddings = process_pdf_and_generate_embeddings()

    # The pipeline reports failure by returning None for its results
    if text is None or embeddings is None:
        print("Processing failed. Please check the error messages above.")
    else:
        # Short summary of the extracted text and the embedding vector
        print("\nValidation Results:")
        print(f"- Text length: {len(text)} characters")
        print(f"- First 100 characters: {text[:100]}...")
        print(f"- Embedding shape: {embeddings.shape}")
        print(f"- First 5 embedding values: {embeddings[:5]}")

except Exception as error:
    print(f"Execution failed: {str(error)}")

Mount validation successful. Found 1 files/directories
Successfully extracted 95387 characters
Text extraction successful: 95387 characters
Successfully generated embeddings with shape: (384,)
Embedding generation successful: (384,)

Validation Results:
- Text length: 95387 characters
- First 100 characters: See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate....
- Embedding shape: (384,)
- First 5 embedding values: [-0.07057493  0.02643592 -0.05840965  0.04737046  0.05478492]
