# **STEP 3 : VECTORISER**

**Text Vectorization**
Using Sentence-BERT:

A popular choice for generating embeddings due to its efficiency and accuracy in capturing semantic meaning.
Converts text chunks into dense vectors that can be used for semantic search and other NLP tasks.


- The script reads the markdown file and converts it into plain text.
Recursive Chunking:

- The recursive_split function splits the text using multiple separators to create manageable and semantically meaningful chunks.
This method ensures chunks are of appropriate size while maintaining semantic context.

- Each chunk is vectorized using the Sentence-BERT model to create embeddings.

- The chunks and their embeddings are prepared and sent to Elasticsearch for bulk indexing.

In [4]:
import os
import requests
import markdown
from sentence_transformers import SentenceTransformer
import json
import re
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from openai import OpenAI

# Load environment variables
load_dotenv()

# Elasticsearch configurations
es_client = Elasticsearch(
    "https://371e52c2ddc94eeda8d2dbeb8acc5645.us-central1.gcp.cloud.es.io:443",
    api_key=os.environ["elastic_host"]
)

# OpenAI configuration
openai_client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

# Path to markdown file
markdown_file_path = 'parsed_result_gpt.md'

# Read the markdown file
with open(markdown_file_path, 'r', encoding='utf-8') as file:
    markdown_text = file.read()

# Convert markdown to plain text
plain_text = markdown.markdown(markdown_text)

# Initialize the Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Recursive function for splitting text
def recursive_split(text: str, separators: list, chunk_size: int, overlap: int) -> list:
    if not separators:
        return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    
    chunks = re.split(separators[0], text)
    if len(chunks) == 1:
        return recursive_split(text, separators[1:], chunk_size, overlap)
    
    final_chunks = []
    for chunk in chunks:
        sub_chunks = recursive_split(chunk, separators[1:], chunk_size, overlap)
        final_chunks.extend(sub_chunks)
    return final_chunks

# Define chunking parameters
separators = ['\n\n', '\n', r'(?<=\.)\s', ' ']
chunk_size = 256
chunk_overlap = 50

# Chunk the text using recursive splitting
chunks = recursive_split(plain_text, separators, chunk_size, chunk_overlap)

# Generate embeddings for each chunk
embeddings = [model.encode(chunk).tolist() for chunk in chunks]

# Prepare documents for Elasticsearch bulk indexing
actions = []
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    action = {
        "_index": "documents",
        "_id": i + 1,
        "_source": {
            "content": chunk,
            "embedding": embedding,
            "metadata": {
                "author": "Author Name",
                "date": "2024-07-21"
            }
        }
    }
    actions.append(action)

# Bulk indexing in Elasticsearch
bulk_url = f'https://371e52c2ddc94eeda8d2dbeb8acc5645.us-central1.gcp.cloud.es.io:443/_bulk'
headers = {"Content-Type": "application/x-ndjson"}
bulk_data = '\n'.join(json.dumps(action) for action in actions) + '\n'

response = requests.post(bulk_url, headers=headers, data=bulk_data, auth=(os.environ["elastic_host"], ''))

# Check the response from Elasticsearch
if response.status_code in [200, 201]:
    print("Documents indexed successfully.")
else:
    print(f"Indexing error: {response.status_code}\n{response.text}")


SSLError: HTTPSConnectionPool(host='371e52c2ddc94eeda8d2dbeb8acc5645.us-central1.gcp.cloud.es.io', port=443): Max retries exceeded with url: /_bulk (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2406)')))