In [9]:
import io

import requests
import docx

In [8]:
# !pip uninstall docx -y
!pip install python-docx 

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    
    response = requests.get(url)
    response.raise_for_status()
    
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    
    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''
     
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)
    
        if len(p_text) == 0:
            continue
    
        if style == section_heading_style:
            section_title = p_text
            continue
    
        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''
    
            question_title = p_text
            continue
        
        answer_text_so_far += '\n' + p_text
    
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [25]:
faq_documents = {
    'data-engineering-zoomcamp': '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw',
    'machine-learning-zoomcamp': '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8',
    'mlops-zoomcamp': '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0',
}

In [27]:
documents = []

for course, file_id in faq_documents.items():
    print(course)
    course_documents = read_faq(file_id)
    documents.append({'course': course, 'documents': course_documents})

data-engineering-zoomcamp
machine-learning-zoomcamp
mlops-zoomcamp


In [29]:
import json

In [32]:
with open('documents.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [33]:
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {


In [10]:
faq_documents = {
    'data-engineering-zoomcamp': '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw',
    # 'machine-learning-zoomcamp': '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8',
    # 'mlops-zoomcamp': '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0',
}

In [26]:
faq_documents

{'data-engineering-zoomcamp': '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw'}

In [27]:
file_id = faq_documents['data-engineering-zoomcamp']
file_id

'19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw'

In [28]:
course_document = read_faq(file_id)

In [30]:
len(course_document)

437

In [31]:
import pprint 
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(course_document[0])

{   'question': 'Course - When will the course start?',
    'section': 'General course-related questions',
    'text': 'The purpose of this document is to capture frequently asked '
            'technical questions\n'
            'The exact day and hour of the course will be 15th Jan 2024 at '
            "17h00. The course will start with the first  “Office Hours'' "
            'live.1\n'
            'Subscribe to course public Google Calendar (it works from Desktop '
            'only).\n'
            'Register before the course starts using this link.\n'
            'Join the course Telegram channel with announcements.\n'
            "Don’t forget to register in DataTalks.Club's Slack and join the "
            'channel.'}


In [22]:
!pip install termcolor

Collecting termcolor
  Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)
Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)
Installing collected packages: termcolor
Successfully installed termcolor-2.4.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [25]:
import pprint
from termcolor import colored

def pretty_print_dict(d, indent=0):
    for key, value in d.items():
        print('  ' * indent + colored(f"{key}:", 'red'), end=' ')
        if isinstance(value, dict):
            print()
            pretty_print_dict(value, indent+1)
        else:
            print(colored(f"{value}", 'blue'))

# Example dictionary
example_dict = {
    'name': 'John Doe',
    'age': 30,
    'address': {
        'street': '123 Elm Street',
        'city': 'Somewhere',
        'state': 'CA'
    },
    'hobbies': ['reading', 'hiking', 'coding']
}

# Pretty print the dictionary
# pretty_print_dict(example_dict)


In [24]:
pretty_print_dict(course_document[0])

[31mtext:[0m [34mThe purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.[0m
[31msection:[0m [34mGeneral course-related questions[0m
[31mquestion:[0m [34mCourse - When will the course start?[0m
