In [1]:
import re
from typing import List
from langdetect import detect, LangDetectException

import spacy

# Load spaCy model once
nlp = spacy.load("en_core_web_sm")

def is_heading(line: str) -> bool:
    if len(line) > 80 or line.endswith((".", ":", ";", "?", "!")):
        return False
    if line.isupper() or re.match(r"^[A-Z][A-Za-z0-9\s\-]+$", line):
        return True
    doc = nlp(line)
    if sum(tok.pos_ in ("VERB", "AUX") for tok in doc) == 0:
        return True
    return False

def is_imperative(sentence: str) -> bool:
    doc = nlp(sentence.strip())
    if not doc:
        return False
    first = doc[0]
    return (
        first.pos_ == "VERB"
        and "VerbForm=Inf" in first.morph
        and ("Mood=Imp" in first.morph or first.tag_ in {"VB", "VBP"})
    )

def classify_line(line: str) -> str:
    stripped = line.rstrip()
    if not stripped:
        return "blank"
    lower = stripped.lower()

    if re.search(r"\b(todo|to[- ]do|pending|in\s*progress)\b", lower):
        return "checkbox_unchecked"
    if re.search(r"\b(done|completed|finished|✓|✔)\b", lower):
        return "checkbox_checked"
    if re.match(r"^\s*[-*+]\s+", stripped):
        return "bullet"
    if re.match(r"^\s*\d+[\).\s]", stripped):
        return "bullet"
    if is_imperative(stripped):
        return "checkbox_unchecked"
    if is_heading(stripped):
        return "heading"
    return "paragraph"

def transform_to_markdown(text: str) -> str:
    out_lines: List[str] = []
    for raw in text.splitlines():
        kind = classify_line(raw)
        content = raw.strip()

        if kind == "blank":
            out_lines.append("")
        elif kind == "heading":
            out_lines.append(f"## {content.title()}")
        elif kind == "checkbox_unchecked":
            cleaned = re.sub(r"\b(todo|to[- ]do|pending|in\s*progress)\b[:\-]?\s*", "", content, flags=re.I)
            out_lines.append(f"- [ ] {cleaned.strip()}")
        elif kind == "checkbox_checked":
            cleaned = re.sub(r"\b(done|completed|finished|✓|✔)\b[:\-]?\s*", "", content, flags=re.I)
            out_lines.append(f"- [x] {cleaned.strip()}")
        elif kind == "bullet":
            cleaned = re.sub(r"^\s*([-*+]|\d+[\).\s])\s*", "", content)
            out_lines.append(f"- {cleaned}")
        else:
            out_lines.append(content)
            out_lines.append("")  # paragraph separation

    markdown = re.sub(r"\n{3,}", "\n\n", "\n".join(out_lines)).rstrip() + "\n"
    return markdown


In [2]:
sample_input = """
PROJECT ROADMAP
Kickoff meeting scheduled

Todo  : allocate tasks
Design UI
Implement API
DONE   integrate CI/CD

1) gather feedback
2) iterate

Future Ideas
Machine‑learning recommendation engine
"""

markdown_output = transform_to_markdown(sample_input)
print(markdown_output)



## Project Roadmap
## Kickoff Meeting Scheduled

- [ ] : allocate tasks
## Design Ui
- [ ] Implement API
- [x] integrate CI/CD

- gather feedback
- iterate

## Future Ideas
Machine‑learning recommendation engine



In [3]:
import re
import html
from typing import List
import emoji
import spacy

NLP = spacy.load("en_core_web_sm")

INLINE_EMPH_RE = [
    (re.compile(r"\*\*([^*\n]{1,200})\*\*"), r"**\1**"),
    (re.compile(r"(?<!\*)\*([^*\n]{1,200})\*(?!\*)"), r"*\1*"),
    (re.compile(r"_(?!_)([^_\n]{1,200})_(?!_)"), r"_\1_"),
    (re.compile(r"~~([^~\n]{1,200})~~"), r"~~\1~~"),
]

URL_RE = re.compile(r"(?P<url>https?://[^\s<>{}|\]^`]+)", re.I)
IMAGE_LINE_RE = re.compile(r"^\s*image\s*:\s*(?P<alt>[^|]+?)\s*\|\s*(?P<url>\S+)", re.I)

def is_heading(line: str) -> tuple[bool, int]:
    stripped = line.strip()
    if not stripped or len(stripped) > 120 or stripped.endswith("."):
        return False, 0
    
    # Check for markdown headings first
    m = re.match(r"^(#{1,6})\s+(.+)", stripped)
    if m:
        return True, len(m.group(1))
    
    # Check for all uppercase (but not single words or very short)
    if stripped.isupper() and len(stripped) > 3:
        return True, 2
    
    # Check for title case without verbs and not simple task-like phrases
    if re.match(r"^[A-Z][\w\s]{2,50}$", stripped):
        doc = NLP(stripped)
        # Exclude if it starts with a verb (likely a task)
        if doc and len(doc) > 0 and doc[0].pos_ == "VERB":
            return False, 0
        # Include if it has no verbs (likely a heading)
        if sum(t.pos_ in ("VERB", "AUX") for t in doc) == 0:
            return True, 2
    
    return False, 0

def is_task(line: str) -> tuple[bool, bool]:
    stripped = line.strip()
    lower = stripped.lower()
    
    # Check for completion markers
    if re.search(r"\b(done|completed|finished|✓|✔)\b", lower):
        return True, True
    
    # Check for todo markers (but not standalone todo headers)
    if re.search(r"\b(todo|to[- ]do|pending|in\s*progress)\b", lower):
        # If it's just "Todo:" or "Todo:-", it's a header, not a task
        if re.match(r"^\s*(todo|to[- ]do)[:\-]?\s*$", lower):
            return False, False
        return True, False
    
    # Check for imperative verbs (infinitive form)
    doc = NLP(stripped)
    if doc and len(doc) > 0:
        # Check if it starts with a verb in imperative form
        first_token = doc[0]
        if first_token.pos_ == "VERB":
            # Common imperative verb tags: VB (base form), VBP (present)
            if first_token.tag_ in ("VB", "VBP"):
                return True, False
            # Also check for specific imperative patterns
            if first_token.lemma_.lower() in ("allocate", "design", "implement", "create", "build", "develop", "test", "deploy", "review", "update", "fix", "add", "remove", "configure", "setup", "install"):
                return True, False
    
    return False, False

def emphasise_inline(text: str) -> str:
    def _url_sub(m):
        url = m.group("url")
        return f"[{url}]({url})"
    
    text = URL_RE.sub(_url_sub, text)
    text = emoji.emojize(text, language="alias")
    
    for rex, rep in INLINE_EMPH_RE:
        text = rex.sub(rep, text)
    
    return text

def clean_table_line(line: str) -> str:
    cells = [c.strip() for c in line.strip().strip("|").split("|")]
    return "| " + " | ".join(cells) + " |"

def mdify(raw: str) -> str:
    lines = raw.strip().split("\n")  # Strip leading/trailing whitespace first
    md_lines: List[str] = []
    i = 0
    title = author = None

    # Handle YAML frontmatter
    if len(lines) >= 3 and lines[0].strip() == "---":
        i = 1
        # Parse YAML frontmatter
        while i < len(lines) and lines[i].strip() != "---":
            line = lines[i].strip()
            if line.lower().startswith("title:"):
                title = line.split(":", 1)[1].strip().strip('"\'')
            elif line.lower().startswith("author:"):
                author = line.split(":", 1)[1].strip().strip('"\'')
            i += 1
        
        # Skip the closing --- if found
        if i < len(lines) and lines[i].strip() == "---":
            i += 1
        
        # Add title and author in proper format
        if title:
            md_lines.append(f"# **{title}**")
            md_lines.append("")
        if author:
            md_lines.append(f"*{author}*")
            md_lines.append("")
        if title or author:
            md_lines.append("---")
            md_lines.append("")

    while i < len(lines):
        l = lines[i]

        # Image
        m_img = IMAGE_LINE_RE.match(l)
        if m_img:
            md_lines.append(f"![{m_img.group('alt').strip()}]({m_img.group('url').strip()})")
            md_lines.append("")
            i += 1
            continue

        # Horizontal rule
        if re.match(r"^\s*(-{3,}|\*{3,}|_{3,})\s*$", l):
            md_lines.append("---")
            md_lines.append("")
            i += 1
            continue

        # Fenced code block or math block
        if re.match(r"^\s*```", l) or re.match(r"^\s*\$\$", l):
            md_lines.append(l)
            i += 1
            # For math blocks, look for closing $
            if re.match(r"^\s*\$\$", l):
                while i < len(lines) and not re.match(r"^\s*\$\$", lines[i]):
                    md_lines.append(lines[i])
                    i += 1
                if i < len(lines):
                    md_lines.append(lines[i])
                    i += 1
            else:
                # For code blocks, look for closing ```
                while i < len(lines) and not re.match(r"^\s*```", lines[i]):
                    md_lines.append(lines[i])
                    i += 1
                if i < len(lines):
                    md_lines.append(lines[i])
                    i += 1
            md_lines.append("")
            continue

        # Blockquote
        if l.strip().startswith(">"):
            md_lines.append(l)
            i += 1
            continue

        # Tables
        if "|" in l and not is_heading(l)[0]:
            table_rows = []
            while i < len(lines) and "|" in lines[i]:
                table_rows.append(clean_table_line(lines[i]))
                i += 1
            if len(table_rows) > 1 and not re.match(r"^\s*\|?[\s:-]+\|", table_rows[1]):
                cols = table_rows[0].count("|") - 1
                divider = "|" + "|".join([" --- "] * cols) + "|"
                table_rows.insert(1, divider)
            md_lines.extend(table_rows)
            md_lines.append("")
            continue

        # Todo headers (like "Todo:-")
        if re.match(r"^\s*(todo|to[- ]do)[:\-]?\s*$", l.strip(), re.I):
            todo_text = l.strip()
            # Remove trailing colon/dash and make it a heading
            todo_text = re.sub(r"[:\-]\s*$", "", todo_text)
            md_lines.append(f"### {todo_text.title()}")
            md_lines.append("")
            i += 1
            continue

        # TASKS — check before lists and headings
        is_task_flag, done_flag = is_task(l)
        if is_task_flag:
            # Remove task markers and clean up
            body = re.sub(
                r"\b(todo|to[- ]do|pending|in\s*progress|done|completed|finished|✓|✔)\b[:\-]?\s*",
                "",
                l,
                flags=re.I,
            )
            body = emphasise_inline(body.strip())
            body = body.strip("- ").strip()
            if body:  # Only add if there's actual content
                md_lines.append(f"- [{'x' if done_flag else ' '}] {body}")
            i += 1
            continue

        # Check if this line follows a todo header and treat as task
        prev_line_idx = i - 1
        while prev_line_idx >= 0 and not lines[prev_line_idx].strip():
            prev_line_idx -= 1
        
        if (prev_line_idx >= 0 and 
            re.match(r"^\s*(todo|to[- ]do)[:\-]?\s*$", lines[prev_line_idx].strip(), re.I)):
            # This line follows a todo header, treat it as a task
            body = emphasise_inline(l.strip())
            if body:
                md_lines.append(f"- [ ] {body}")
            i += 1
            continue

        # Lists
        m_bullet = re.match(r"^\s*(?:[-*+]|(\d+)[.)])\s+(.*)", l)
        if m_bullet:
            marker_num = m_bullet.group(1)
            content = emphasise_inline(m_bullet.group(2))
            md_lines.append(f"{marker_num+'.' if marker_num else '-'} {content}")
            i += 1
            continue

        # Headings
        is_head, level = is_heading(l)
        if is_head:
            heading_text = l.strip()
            # Remove any existing markdown heading markers
            heading_text = re.sub(r"^#+\s*", "", heading_text)
            md_lines.append("#" * level + " " + emphasise_inline(heading_text.title()))
            md_lines.append("")
            i += 1
            continue

        # Footnotes
        if re.match(r"^\[\^.+\]:", l.strip()):
            md_lines.append(l.strip())
            i += 1
            continue

        # Raw HTML
        if l.lstrip().startswith("<"):
            md_lines.append(l)
            i += 1
            continue

        # Blank line
        if not l.strip():
            md_lines.append("")
            i += 1
            continue

        # Paragraph
        md_lines.append(emphasise_inline(html.escape(l.strip(), quote=False)))
        md_lines.append("")
        i += 1

    # Clean up multiple blank lines and ensure proper spacing
    result = []
    prev_blank = False
    for line in md_lines:
        if line == "":
            if not prev_blank:
                result.append(line)
            prev_blank = True
        else:
            result.append(line)
            prev_blank = False
    
    # Join and clean up final output
    md = "\n".join(result).strip() + "\n"
    return md


if __name__ == "__main__":
    sample = """---
title: Project X
author: Debrup
---

PROJECT ROADMAP
Kickoff meeting scheduled

Todo:-
Implement API
DONE integrate CI/CD

1) gather feedback
2) iterate

Image: mascot | https://example.com/img/mascot.png

> "Code is like humor. When you have to explain it, it's bad."

Here's a link: https://github.com/spaCy/spaCy

Some *italic*, some **bold**, some ~~obsolete~~.

Inline math $E = mc^2$ and display:

$
\\int_{a}^{b} f(x)dx
$

| Stage | Owner | ETA |
|-------|-------|-----|
| Alpha | @alice | 2025‑08‑01 |

Future Ideas
Machine‑learning recommendation engine

[^1]: We'll revisit after MVP.

---
"""
    print(mdify(sample))

# **Project X**

*Debrup*

---

## Project Roadmap

Kickoff meeting scheduled

- [ ] Implement API
- [x] integrate CI/CD

1. gather feedback
2. iterate

![mascot](https://example.com/img/mascot.png)

> "Code is like humor. When you have to explain it, it's bad."

Here's a link: [https://github.com/spaCy/spaCy](https://github.com/spaCy/spaCy)

Some *italic*, some **bold**, some ~~obsolete~~.

Inline math $E = mc^2$ and display:

$

\int_{a}^{b} f(x)dx

$

| Stage | Owner | ETA |
| ------- | ------- | ----- |
| Alpha | @alice | 2025‑08‑01 |

## Future Ideas

Machine‑learning recommendation engine

[^1]: We'll revisit after MVP.

---

