In [1]:
import io
import requests
import docx

In [2]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    
    response = requests.get(url)
    response.raise_for_status()
    
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    
    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''
     
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)
    
        if len(p_text) == 0:
            continue
    
        if style == section_heading_style:
            section_title = p_text
            continue
    
        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''
    
            question_title = p_text
            continue
        
        answer_text_so_far += '\n' + p_text
    
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [6]:
faq_documents = {
    'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',
}

In [7]:
documents = []

for course, file_id in faq_documents.items():
    print(course)
    course_documents = read_faq(file_id)
    documents.append({'course': course, 'documents': course_documents})

llm-zoomcamp


In [12]:
type(documents)

list

In [17]:
len(documents)

1

In [18]:
documents

[{'course': 'llm-zoomcamp',
  'documents': [{'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
    'section': 'General course-related questions',
    'question': 'I just discovered the course. Can I still join?'},
   {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework (while the form is Open) without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
    'section': 'General course-related questions',
    'question': 'Course - I have registered for the [insert-zoomcamp-name]. When can I expect to receive the confirmation email?'},
   {'text': 'The zoom link is only published to instructors/presenters/TAs.\nStudents participate via Youtube Live and submit questions to Slido (link would be pinned in the chat when Alexey goes Live). The video URL should be posted in the announcements chan

In [26]:
dict_doc = documents[0]

In [27]:
import pandas as pd
df = pd.DataFrame(dict_doc)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   course     86 non-null     object
 1   documents  86 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [30]:
len(df)

86

In [32]:
import re
from typing import Any, Dict, List



def chunk_documents(data: List[Dict[str, Any]], *args, **kwargs):
    documents = []
    
    for idx, item in enumerate(data):
        course = item['course']
        
        for info in item['documents']:
            section = info['section']
            question = info['question']
            answer = info['text']
            
            # Generate a unique document ID
            document_id = ':'.join([re.sub(r'\W', '_', part) 
	            for part in [course, section, question]]).lower()
            
            # Format the document string
            chunk = '\n'.join([
                f'course:\n{course}\n',
                f'section:\n{section}\n',
                f'question:\n{question}\n',
                f'answer:\n{answer}\n',
            ])
            
            documents.append(dict(
                chunk=chunk,
                document=info,
	            document_id=document_id,
            ))

    print(f'Documents:', len(documents))
            
    return [documents]

In [33]:
doc_chunks = chunk_documents(documents)

Documents: 86


In [38]:
type(doc_chunks)

list

In [39]:
doc_chunks

[[{'chunk': 'course:\nllm-zoomcamp\n\nsection:\nGeneral course-related questions\n\nquestion:\nI just discovered the course. Can I still join?\n\nanswer:\nYes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.\n',
   'document': {'text': 'Yes, but if you want to receive a certificate, you need to submit your project while we’re still accepting submissions.',
    'section': 'General course-related questions',
    'question': 'I just discovered the course. Can I still join?'},
   'document_id': 'llm_zoomcamp:general_course_related_questions:i_just_discovered_the_course__can_i_still_join_'},
  {'chunk': "course:\nllm-zoomcamp\n\nsection:\nGeneral course-related questions\n\nquestion:\nCourse - I have registered for the [insert-zoomcamp-name]. When can I expect to receive the confirmation email?\n\nanswer:\nYou don't need it. You're accepted. You can also just start learning and submitting homework (while the form is Open) wi

In [36]:
import json
from typing import Dict, List, Tuple, Union
from datetime import datetime

import numpy as np
from elasticsearch import Elasticsearch


#@data_exporter
def elasticsearch(documents: List[Dict[str, Union[Dict, List[int], str]]], *args, **kwargs):
    connection_string = kwargs.get('connection_string', 'http://localhost:9200')
    #index_name = kwargs.get('index_name', 'documents')
    index_name_prefix = kwargs.get('index_name', 'documents')
    current_time = datetime.now().strftime("%Y%m%d_%M%S")
    index_name = f"{index_name_prefix}_{current_time}"
    print("index name:", index_name)
    number_of_shards = kwargs.get('number_of_shards', 1)
    number_of_replicas = kwargs.get('number_of_replicas', 0)
    dimensions = kwargs.get('dimensions')

    if dimensions is None and len(documents) > 0:
        document = documents[0]
        dimensions = len(document.get('embedding') or [])

    es_client = Elasticsearch(connection_string)

    print(f'Connecting to Elasticsearch at {connection_string}')

    index_settings = {
        "settings": {
            "number_of_shards": number_of_shards,
            "number_of_replicas": number_of_replicas,
        },
        "mappings": {
            "properties": {
                "chunk": {"type": "text"},
                "document_id": {"type": "text"},
                "embedding": {"type": "dense_vector", "dims": dimensions}
            }
        }
    }

    # Recreate the index by deleting if it exists and then creating with new settings
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)
        print(f'Index {index_name} deleted')

    es_client.indices.create(index=index_name, body=index_settings)
    print('Index created with properties:')
    print(json.dumps(index_settings, indent=2))
    print('Embedding dimensions:', dimensions)

    count = len(documents)
    print(f'Indexing {count} documents to Elasticsearch index {index_name}')
    for idx, document in enumerate(documents):
        if idx % 100 == 0:
		        print(f'{idx + 1}/{count}')

        if isinstance(document['embedding'], np.ndarray):
            document['embedding'] = document['embedding'].tolist()

        es_client.index(index=index_name, document=document)

    return [[d['embedding'] for d in documents[:10]]]

In [40]:
doc_exp = elasticsearch(dict_doc)

index name: documents_20240822_5858


KeyError: 0