In [1]:
import sys
# Make modules in the notebook's directory and its parent importable
# (needed for the `component` package used further down).
sys.path.extend(["./", "../"])

In [5]:
import pprint as pp

In [3]:
# Location of the resume markdown that this cell displays.
md_file = "/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data/resume.md"

# Load the entire document into memory as UTF-8 text.
with open(md_file, encoding="utf-8", mode="r") as file:
    content = file.read()

# Show the full document; print a slice such as content[:200] for a short preview.
print(content)


## Vivi Liu（劉米婷 Mi-Ting Liu）

**Phone:** (+886) 919139582
**Email:** allnicevivi@gmail.com
**LinkedIn:** https://www.linkedin.com/in/vivi-liu-47a151177/

---

## Summary

Results-driven AI engineer with over 3 years of experience in developing software system using various technologies. Passionate about creating efficient and scalable solutions. Strong problem-solving skills and ability to adapt quickly to new challenges.

---

## Professional Experience

### AI Engineer, NUWA Robotics 
*May 2024 - Present*

- Built a RAG system from scratch, optimizing retrieval with sparse embeddings, weighted search, and metadata enrichment; improved recall rate by 30% and reduced latency by 50% (16s → 8s).
- Developed an LLM evaluation module to log runtime, prompt variations, and test outcomes, enabling systematic experimentation and eliminating redundant tests through data-driven iteration.
- Enhanced development efficiency by 30% using vibe coding methodology, reducing feature build/debug time f

In [41]:
import imp
from typing import Any, List, Tuple, Optional, Dict
import re
from pathlib import Path
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem

from component.base import Node

class MarkdownReader():
    """Markdown parser that splits a document into header-delimited sections.

    Each section is a ``(layer, header, text)`` tuple: ``layer`` is the number
    of leading ``#`` characters of the header (0 for text that precedes the
    first header), ``header`` is the cleaned header text (``None`` for that
    leading text) and ``text`` is the body found under the header.
    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Store the clean-up options.

        Args:
            *args: Forwarded to the next ``__init__`` in the MRO.
            remove_hyperlinks: Replace ``[text](url)`` links with ``text``.
            remove_images: Drop image embeds before parsing.
            **kwargs: Forwarded to the next ``__init__`` in the MRO.
        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    def markdown_to_tups(
        self, markdown_text: str
    ) -> List[Tuple[int, Optional[str], str]]:
        """Split markdown text into ``(layer, header, text)`` tuples.

        Lines starting with one or more ``#`` open a new section unless they
        sit inside a fenced code block. Anything that looks like an HTML/XML
        tag (``<...>``) is stripped from the section text.

        Args:
            markdown_text: Raw markdown document.

        Returns:
            Sections in document order. Text before the first header is
            returned with ``layer == 0`` and ``header is None``.
        """
        markdown_tups: List[Tuple[int, Optional[str], str]] = []

        current_header: Optional[str] = None
        current_layer = 0
        current_lines: List[str] = []
        in_code_block = False

        for line in markdown_text.split("\n"):
            if line.startswith("```"):
                # A fence line toggles the state: it opens a block or closes one.
                in_code_block = not in_code_block

            # Headers are recognised with or without a space after the hashes.
            header_match = None if in_code_block else re.match(r"^#+", line)
            if header_match:
                # Upon the first header, skip the leading chunk if it is empty.
                if current_header is not None or current_lines:
                    markdown_tups.append(
                        (current_layer, current_header, "\n".join(current_lines))
                    )
                current_header = line
                current_layer = len(header_match.group())
                current_lines = []
            else:
                current_lines.append(line)

        # Append the final text chunk.
        markdown_tups.append((current_layer, current_header, "\n".join(current_lines)))

        return [
            (
                layer,
                None if header is None else self._clean_header(header),
                re.sub(r"<.*?>", "", text),
            )
            for layer, header, text in markdown_tups
        ]

    @staticmethod
    def _clean_header(header_line: str) -> str:
        """Strip the leading ``#`` marker (and an optional closing one) from a header line.

        Only the marker is removed, so ``#`` characters that belong to the
        title itself (for example ``C#``) are kept.
        """
        return re.sub(r"^#+\s*|\s+#+\s*$", "", header_line).strip()

    def remove_images(self, content: str) -> str:
        """Remove image embeds from markdown content.

        Handles both wiki-style ``![[file]]`` embeds and standard
        ``![alt](url)`` images. Matching is non-greedy so that text between
        two embeds on the same line is preserved.
        """
        pattern = r"!\[\[.*?\]\]|!\[[^\]]*\]\([^)]*\)"
        return re.sub(pattern, "", content)

    def remove_hyperlinks(self, content: str) -> str:
        """Replace ``[text](url)`` hyperlinks with their visible text.

        Image syntax (``![alt](url)``) is left untouched; images are the
        responsibility of :meth:`remove_images`.
        """
        pattern = r"(?<!!)\[(.*?)\]\((.*?)\)"
        return re.sub(pattern, r"\1", content)

    def _clean_content(self, content: str) -> str:
        """Apply the configured image/hyperlink clean-up to raw markdown."""
        # Images go first so that linked images ("[![alt](img)](url)") vanish entirely.
        if self._remove_images:
            content = self.remove_images(content)
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        return content

    def parse_tups(
        self,
        filepath: Path,
        content: str = "",
        errors: str = "ignore",
        fs: "Optional[AbstractFileSystem]" = None,
    ) -> List[Tuple[int, Optional[str], str]]:
        """Parse markdown into ``(layer, header, text)`` tuples.

        Args:
            filepath: File to read; only used when ``content`` is empty.
            content: Markdown text to parse instead of reading ``filepath``.
            errors: Error handler used when decoding the file as UTF-8.
            fs: Filesystem used to open ``filepath``; defaults to the local one.

        Returns:
            The sections produced by :meth:`markdown_to_tups`, after the
            configured hyperlink/image clean-up has been applied. The clean-up
            is applied to caller-supplied ``content`` as well as to file data.
        """
        if not content:
            fs = fs or LocalFileSystem()
            with fs.open(filepath, encoding="utf-8") as f:
                raw = f.read()
            # The file is opened in the filesystem's default mode; decode only
            # when bytes come back.
            content = raw.decode("utf-8", errors=errors) if isinstance(raw, bytes) else raw
        return self.markdown_to_tups(self._clean_content(content))

    @staticmethod
    def _build_sections(
        tups: List[Tuple[int, Optional[str], str]],
        base_info: Dict,
    ) -> List[Tuple[str, Dict]]:
        """Turn parsed tuples into ``(text, metadata)`` pairs.

        The text of each section is prefixed with its chain of ancestor
        headers, and the same chain is stored in the metadata under
        ``Header_<layer>`` keys. Every pair gets its own metadata dict and
        ``base_info`` is never modified.
        """
        sections: List[Tuple[str, Dict]] = []
        header_path: Dict[int, str] = {}
        for layer, header, value in tups:
            if header is None:
                # Text that precedes the first header carries no header chain.
                sections.append((value, dict(base_info)))
                continue

            # Entering a header invalidates every header at the same or a
            # deeper level, even when intermediate levels were skipped.
            header_path = {lvl: text for lvl, text in header_path.items() if lvl < layer}
            header_path[layer] = header
            if not value:
                # Header without a body: remember it for its children only.
                continue

            ordered_levels = sorted(header_path)
            headers = "\n".join(header_path[lvl] for lvl in ordered_levels)
            metadata = dict(base_info)
            for lvl in ordered_levels:
                metadata[f"Header_{lvl}"] = header_path[lvl]
            sections.append((f"{headers}\n{value}", metadata))
        return sections

    def load_data(
        self,
        file: Path,
        content: str = "",
        extra_info: Optional[Dict] = None,
        fs: "Optional[AbstractFileSystem]" = None,
    ) -> "List[Node]":
        """Parse a markdown file (or given content) into one ``Node`` per section.

        Args:
            file: Path of the markdown file; its name is stored as ``filename``
                in every node's metadata.
            content: Markdown text to parse instead of reading ``file``.
            extra_info: Extra metadata copied into every node. The caller's
                dict is not modified.
            fs: Filesystem used to read ``file``; defaults to the local one.

        Returns:
            Nodes in document order. Sections whose body is empty are skipped.
        """
        file = Path(file)
        tups = self.parse_tups(file, content, fs=fs)

        # Work on a copy so the caller's dict is never mutated.
        base_info = dict(extra_info or {})
        base_info["filename"] = file.name

        return [
            Node(text=text, metadata=metadata)
            for text, metadata in self._build_sections(tups, base_info)
        ]



In [42]:
# Resume markdown to split into header-scoped nodes.
md_file = Path("/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data/resume.md")

# Only the file is supplied here; `content` and `extra_info` keep their defaults.
reader = MarkdownReader()
documents = reader.load_data(file=md_file)

pp.pprint(documents)

[Node(id_='b969b0ca-cccb-45f4-ba24-eaa48a35a955', text='Vivi Liu（劉米婷 Mi-Ting Liu）\n\n**Phone:** (+886) 919139582\n**Email:** allnicevivi@gmail.com\n**LinkedIn:** https://www.linkedin.com/in/vivi-liu-47a151177/\n\n---\n', embedding=None, sparse_embedding=None, node_type='text', metadata={'filename': 'resume.md', 'Header_2': 'Vivi Liu（劉米婷 Mi-Ting Liu）'}),
 Node(id_='3129c529-c0e0-4106-a362-c4316bbb4cac', text='Summary\n\nResults-driven AI engineer with over 3 years of experience in developing software system using various technologies. Passionate about creating efficient and scalable solutions. Strong problem-solving skills and ability to adapt quickly to new challenges.\n\n---\n', embedding=None, sparse_embedding=None, node_type='text', metadata={'filename': 'resume.md', 'Header_2': 'Summary'}),
 Node(id_='97fb6f30-94b3-49ee-9ef8-1244ba508f03', text='Professional Experience\nAI Engineer, NUWA Robotics\n*May 2024 - Present*\n\n- Built a RAG system from scratch, optimizing retrieval with 

In [19]:
import re

def extract_answer(text):
    """Return the text inside the ``<answer>`` tag of a model response.

    The closing ``</answer>`` tag is optional: when it is missing, everything
    after ``<answer>`` is taken; when it is present, it is not included in
    the result.

    Args:
        text: Full model response, possibly containing other tagged sections.

    Returns:
        The stripped answer text, or ``None`` when there is no ``<answer>``
        tag or the answer is one of the placeholders "None" / "Empty Response".
    """
    match = re.search(r'<answer>(.*?)(?:</answer>|\Z)', text, re.DOTALL)
    if match is None:
        return None
    answer = match.group(1).strip()
    if answer in ("None", "Empty Response"):
        return None
    return answer


response_content = """<thinking>
Step 1: The question asks if I am currently employed.
Step 2: The context shows that I am employed as an AI Engineer at NUWA Robotics from May 2024 to present ("至今").
Step 3: Based on this, I can confirm that I am currently working there.
Step 4: I will answer clearly and warmly, and offer to share more about my current role if interested.
</thinking>

<answer>
是的，我目前仍在 NUWA Robotics 擔任 AI 工程師，從今年五月開始至今。我主要負責從零打造檢索增強生成系統，並與產品及業務團隊密切合作。如果您想了解更多我的工作內容，隨時可以問我！"""

final_answer = extract_answer(response_content)
print(final_answer)

是的，我目前仍在 NUWA Robotics 擔任 AI 工程師，從今年五月開始至今。我主要負責從零打造檢索增強生成系統，並與產品及業務團隊密切合作。如果您想了解更多我的工作內容，隨時可以問我！


In [None]:
from pathlib import Path

# Root data directory; each immediate sub-directory is listed below.
directory_path = Path("/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data")
lang_folder = list(filter(Path.is_dir, directory_path.iterdir()))

# Print every sub-directory name followed by the paths it contains.
for folder in lang_folder:
    print(folder.name)
    files = [*folder.iterdir()]
    print(files)


en
[PosixPath('/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data/en/resume.md'), PosixPath('/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data/en/resume_detail.md')]
zhtw
[PosixPath('/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data/zhtw/resume_detail_zhtw.md')]


In [6]:
# Echo the data directory path as this cell's displayed value.
directory_path

PosixPath('/Users/viviliu/Documents/10_Vivi/ChatMyCV/backend/data')

In [5]:
# Last path component of the first entry in `files`, i.e. of whatever list the
# most recent iteration of the folder-listing loop assigned to that name.
# NOTE(review): the recorded output 'en' suggests `files` held directory paths
# when this cell last ran, which the current listing cell would not produce;
# re-run after the listing cell to confirm the result.
files[0].parts[-1]

'en'