In [1]:
# pip install markitdown

In [2]:
from markitdown import MarkItDown

def ConvertToMarkdown(fileName = 'Unilife/Bargate_house.docx') ->str:
    """Convert a document to Markdown with MarkItDown, echo it, and return it.

    Args:
        fileName: Path of the document handed to ``MarkItDown().convert``.
            Defaults to the sample file 'Unilife/Bargate_house.docx'.

    Returns:
        The ``text_content`` attribute of the conversion result.

    Side effects:
        Prints the result's ``markdown`` attribute to stdout before returning.
    """
    conversion = MarkItDown().convert(fileName)
    # Echo the converted document so it is visible in the notebook output.
    print(conversion.markdown)
    return conversion.text_content




In [3]:
# Smoke test on the default sample document. ConvertToMarkdown prints the
# Markdown and also returns it, so the text appears twice below: once from
# stdout and once as the echoed cell result.
ConvertToMarkdown()

### **General Information**

1. **What is Unilife Bargate House?**Unilife Bargate House is a newly renovated student accommodation property in Southampton, opening in September 2024. It combines modern living with a rich history, offering self-contained studios and exceptional communal spaces.
2. **What types of student accommodation does Unilife Bargate House offer?**Unilife Bargate House offers six types of self-contained studios: Classic, Classic Plus, Premium, Premium Plus, VIP, and VIP Plus, catering to different preferences and budgets.
3. **Is Unilife Bargate House suitable for both undergraduate and postgraduate students?**Yes, Unilife Bargate House welcomes both undergraduate and postgraduate students, offering a supportive environment for all academic levels.
4. **How many rooms are available in Bargate House?**Bargate House has 122 self-contained studios, providing a variety of options to suit individual preferences.
5. **How many floors are there in Bargate House?**The prop

'### **General Information**\n\n1. **What is Unilife Bargate House?**Unilife Bargate House is a newly renovated student accommodation property in Southampton, opening in September 2024. It combines modern living with a rich history, offering self-contained studios and exceptional communal spaces.\n2. **What types of student accommodation does Unilife Bargate House offer?**Unilife Bargate House offers six types of self-contained studios: Classic, Classic Plus, Premium, Premium Plus, VIP, and VIP Plus, catering to different preferences and budgets.\n3. **Is Unilife Bargate House suitable for both undergraduate and postgraduate students?**Yes, Unilife Bargate House welcomes both undergraduate and postgraduate students, offering a supportive environment for all academic levels.\n4. **How many rooms are available in Bargate House?**Bargate House has 122 self-contained studios, providing a variety of options to suit individual preferences.\n5. **How many floors are there in Bargate House?**T

In [None]:


# Models for FAQ Pipeline request and response
# request (file path, user_id)
# response (status,csv_path,message)



In [None]:
import re
import csv
import pandas as pd
import os
import json
from io import StringIO
from typing import Tuple
from pydantic import BaseModel
from typing import List, Optional
from enum import Enum
# Status of an FAQ pipeline run; per the original author's note it is meant to
# be saved in the db and shown in the UI.
class Status(Enum):
    """Possible states of an FAQ pipeline run.

    Each member's value is the human-readable label for that state.
    Of these, only ``COMPLETED`` is produced by the code visible in this
    notebook (see ``create_faq_dataset``).
    """
    PENDING = "Pending"
    IN_PROGRESS = "In Progress"
    COMPLETED = "Completed"
    FAILED = "Failed"
    CANCELLED = "Cancelled"
class FAQPipelineRequest(BaseModel):
    """Input model for the FAQ pipeline entry point ``faq_pipline``."""
    # Path of the source document. faq_pipline passes it to ConvertToMarkdown
    # and derives the output CSV file name from it.
    file_path: str
    # Identifier of the requesting user. Not read by any code visible in this
    # notebook -- presumably consumed by a caller/db layer; TODO confirm.
    user_id: str

class FAQPipelineResponse(BaseModel):
    """Result model returned by ``create_faq_dataset`` / ``faq_pipline``."""
    # Outcome of the run, as a Status member.
    status: Status
    # Path of the CSV file that was written; None when not provided.
    csv_path: Optional[str] = None
    # Free-text note describing the outcome; None when not provided.
    message: Optional[str] = None
def extract_sections_and_faqs(markdown_text)->Tuple[List[str], List[dict]]:
    """
    Extract sections, questions, and answers from markdown text.

    The text is scanned line by line (each line is stripped first) and every
    line is classified as one of:

    * a section header -- either ``### **Title**`` (an optional trailing colon
      inside the bold markers is dropped) or a line that is entirely bold and
      has an empty line (or the start/end of the text) on both sides;
    * a numbered question -- ``N. **Question**``, optionally followed by
      answer text on the same line;
    * anything else -- appended to the answer of the question in progress, or
      ignored when no question is open.

    A question is stored once the next question, the next section header, or
    the end of the text is reached, and only if at least one answer line was
    collected for it. The open question is always closed at a section header,
    so it can never pick up lines (or the name) of the following section.

    Args:
        markdown_text: Markdown source as a single string.

    Returns:
        A ``(sections, faqs)`` tuple. ``sections`` lists the section titles in
        order of appearance (a leading ``"Name: "`` is removed; duplicates are
        kept). ``faqs`` is a list of dicts with the keys ``'section'``,
        ``'question'`` and ``'answer'``; ``'section'`` is ``None`` for
        questions that appear before the first section header.
    """
    section_re = re.compile(r'^### \*\*(.*?)(:?)\*\*$')
    bold_line_re = re.compile(r'^\*\*(.*?)\*\*$')
    question_re = re.compile(r'^\d+\. \*\*(.*?)\*\*')

    lines = markdown_text.split('\n')

    sections = []
    faqs = []
    current_section = None
    current_question = None
    current_answer = []

    def flush():
        """Store the question in progress (if it has an answer) and reset state."""
        nonlocal current_question, current_answer
        if current_question and current_answer:
            faqs.append({
                'section': current_section,
                'question': current_question,
                'answer': '\n'.join(current_answer).strip()
            })
        # Always reset, even when nothing was stored, so a question without an
        # answer cannot leak into whatever comes next.
        current_question = None
        current_answer = []

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()

        # Section header with ### prefix
        section_match = section_re.match(line)

        # Section header without ### prefix: a fully bold line that stands
        # alone, i.e. is surrounded by empty lines or the text boundaries.
        if not section_match and line.startswith('**') and line.endswith('**'):
            prev_line_empty = (i == 0 or not lines[i - 1].strip())
            next_line_empty = (i == len(lines) - 1 or not lines[i + 1].strip())
            if prev_line_empty and next_line_empty:
                section_match = bold_line_re.match(line)

        if section_match:
            # Close the previous question under the section it belongs to
            # before switching sections.
            flush()
            current_section = section_match.group(1).strip()
            if current_section.startswith("Name: "):
                current_section = current_section[6:].strip()  # Remove "Name: " prefix
            sections.append(current_section)
            continue

        question_match = question_re.match(line)
        if question_match:
            # A new numbered question ends the previous one.
            flush()
            current_question = question_match.group(1).strip()

            # Answer text may follow the question on the same line.
            remaining = line[question_match.end():].strip()
            if remaining:
                current_answer.append(remaining)
        elif current_question is not None:
            # Neither section nor question: part of the current answer.
            current_answer.append(line)

    # Store the last FAQ, with the same rule as every other one (a missing
    # section does not cause it to be dropped).
    flush()

    return sections, faqs

def create_faq_dataset(markdown_text, output_file='Bargate_house.csv') ->FAQPipelineResponse:
    """
    Process markdown text and create a dataset suitable for LLM fine-tuning.

    The text is parsed with ``extract_sections_and_faqs``; each answer is
    cleaned (blank lines and lines starting with ``####`` are removed), every
    FAQ gets a 1-based ``id``, and the result is written to ``output_file`` as
    a fully quoted CSV with the columns ``id, section, question, answer``.
    When no FAQ is found, a CSV containing only the header row is written.

    Args:
        markdown_text: Markdown source as a single string.
        output_file: Path of the CSV file to write.

    Returns:
        A ``FAQPipelineResponse`` with status ``Status.COMPLETED`` and
        ``csv_path`` set to ``output_file``.

    Side effects:
        Writes ``output_file`` and prints a summary line (plus one line per
        answer in which an inline ``####`` marker was found).
    """
    # Extract the data
    sections, faqs = extract_sections_and_faqs(markdown_text)

    for faq_id, faq in enumerate(faqs, start=1):
        # Collapse runs of blank lines, then drop blank lines and lines that
        # start with a '####' sub-section marker.
        answer = re.sub(r'\n\s*\n+', '\n\n', faq['answer']).strip()
        kept_lines = [
            line for line in answer.split('\n')
            if line.strip() and not line.startswith('####')
        ]
        answer = '\n'.join(kept_lines)

        # A '####' that is not at the start of a line survives the filter above.
        # NOTE(review): everything up to the end of that line is discarded, as
        # in the original code -- confirm this truncation is intended.
        subsection = re.search(r'####.*?\n', answer)
        if subsection:
            # Report the FAQ's own section. Indexing `sections` by the FAQ
            # index was wrong and raised IndexError when there were more FAQs
            # than sections.
            print(f"section: {faq['section']}, subsection: {subsection.group()}")
            answer = answer[subsection.end():].strip()

        faq['answer'] = answer
        faq['id'] = faq_id

    cols = ['id', 'section', 'question', 'answer']
    # Passing `columns` fixes the column order and also yields the right
    # (empty) frame when no FAQs were extracted, instead of a KeyError.
    df = pd.DataFrame(faqs, columns=cols)
    df.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL)
    print(f"Created CSV dataset '{output_file}' with {len(faqs)} FAQ entries from {len(sections)} unique sections.")
    # FAQPipelineResponse
    return FAQPipelineResponse(status = Status.COMPLETED,csv_path = output_file,message = "FAQ Dataset created successfully")

def faq_pipline(request: FAQPipelineRequest) -> FAQPipelineResponse:
    """
    Run the FAQ pipeline for one document: convert it to Markdown, then build
    the FAQ CSV dataset from that Markdown.

    The CSV is named after the source file (same base name, ``.csv``
    extension) and is written relative to the current working directory,
    e.g. ``'Unilife/Bargate_house.docx'`` -> ``'Bargate_house.csv'``.

    Args:
        request: Pipeline request; only ``request.file_path`` is read here.

    Returns:
        The ``FAQPipelineResponse`` produced by ``create_faq_dataset``.
    """
    # Convert the source document to Markdown text
    text = ConvertToMarkdown(fileName=request.file_path)
    # Derive the CSV name from the file's base name. This works for bare file
    # names and arbitrarily nested paths, and for any source extension
    # (the previous split('/')[1] only handled exactly one directory level).
    stem, _ = os.path.splitext(os.path.basename(request.file_path))
    output_file = stem + '.csv'
    # Create FAQ dataset
    return create_faq_dataset(text, output_file = output_file)

In [9]:
# Dummy end-to-end pipeline test on the sample document.
file_path = 'Unilife/Bargate_house.docx'
# Build the request model; user_id is a hard-coded value for this test.
request_object = FAQPipelineRequest(file_path=file_path,user_id = 'ishaan')
# Run the pipeline; the returned FAQPipelineResponse is echoed as the cell result.
faq_pipline(request_object)


### **General Information**

1. **What is Unilife Bargate House?**Unilife Bargate House is a newly renovated student accommodation property in Southampton, opening in September 2024. It combines modern living with a rich history, offering self-contained studios and exceptional communal spaces.
2. **What types of student accommodation does Unilife Bargate House offer?**Unilife Bargate House offers six types of self-contained studios: Classic, Classic Plus, Premium, Premium Plus, VIP, and VIP Plus, catering to different preferences and budgets.
3. **Is Unilife Bargate House suitable for both undergraduate and postgraduate students?**Yes, Unilife Bargate House welcomes both undergraduate and postgraduate students, offering a supportive environment for all academic levels.
4. **How many rooms are available in Bargate House?**Bargate House has 122 self-contained studios, providing a variety of options to suit individual preferences.
5. **How many floors are there in Bargate House?**The prop

FAQPipelineResponse(status=<Status.COMPLETED: 'Completed'>, csv_path='Bargate_house.csv', message='FAQ Dataset created successfully')