In [None]:
#| default_exp utils

# Utils
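Utility functions for working with markdown content: front matter, links, images, headings, phone numbers, and text similarity.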

In [None]:
#|hide
from nbdev.showdoc import *

In [None]:
#|export
import re
import yaml
from pathlib import Path
import glob
from typing import List, Dict, Any, Tuple
from difflib import SequenceMatcher
from urllib.parse import urlparse


In [None]:
#|export
def exclude_front_matter(content: str) -> str:
    """Remove a leading front matter section from markdown content.

    Front matter is only recognised when `content` starts with a `---`
    delimiter and a later line starts with the closing `---`.

    Args:
        content: Raw markdown text, with or without front matter.

    Returns:
        The text after the closing delimiter with surrounding whitespace
        stripped, or `content` unchanged when no complete front matter
        section is found.
    """
    # Without this guard a `---` horizontal rule further down the document
    # would be mistaken for a delimiter and everything above it dropped.
    if not content.startswith('---'):
        return content
    # Searching for '\n---' means only a `---` at the start of a line closes
    # the section, not one that appears inside a front matter value.
    front_matter_end = content.find('\n---', 3)
    if front_matter_end == -1:
        return content
    # +4 skips the newline plus the three dashes of the closing delimiter.
    return content[front_matter_end + 4:].strip()


In [None]:
#| test
# Markdown sample: a front matter section fenced by '---' lines, then the body
test_content = """---
title: Test Title
date: 2024-01-27
---
# Main Content
This is the actual content."""

# Only the body should be left, with surrounding whitespace stripped
result = exclude_front_matter(test_content)
assert result == "# Main Content\nThis is the actual content."

# Content without any front matter must come back unchanged
test_content_no_frontmatter = "# Just content\nNo front matter here"
assert exclude_front_matter(test_content_no_frontmatter) == test_content_no_frontmatter


In [None]:

#|export
def normalize_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends"""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()


In [None]:

#|export
def extract_frontmatter(md_content: str) -> Tuple[str, str, str, str, List[str]]:
    """Extract metadata from the YAML front matter of a markdown document.

    Args:
        md_content: Markdown text that is expected to begin with a front
            matter section fenced by `---` lines.

    Returns:
        A tuple `(title, publishDate, excerpt, image, tags)`. A key that is
        missing from the front matter yields `None` (`[]` for tags). When
        there is no complete front matter section, or it does not parse to
        a mapping, the result is `(None, None, None, None, [])`.
        Values are passed through exactly as the YAML loader produced them,
        so they are not guaranteed to be strings.
    """
    no_metadata = (None, None, None, None, [])

    # Front matter must open the document; a stray '---' further down
    # (e.g. a horizontal rule) is not a metadata section.
    if not md_content.startswith("---"):
        return no_metadata

    # The closing delimiter has to start a line, so a '---' inside a value
    # does not cut the YAML short.
    closing = md_content.find("\n---", 3)
    if closing == -1:
        return no_metadata

    front_matter_data = yaml.safe_load(md_content[3:closing])
    # An empty section loads as None and a bare scalar as a str; neither
    # supports the key lookups below.
    if not isinstance(front_matter_data, dict):
        return no_metadata

    return (
        front_matter_data.get("title"),
        front_matter_data.get("publishDate"),
        front_matter_data.get("excerpt"),
        front_matter_data.get('image'),
        # `tags:` with no value loads as None; callers get a list either way.
        front_matter_data.get("tags") or [],
    )


In [None]:
#|test
# Markdown sample whose front matter carries every field the function returns
test_md = """---
publishDate: 2024-01-27
title: Test Article
excerpt: Test excerpt
image: ~/assets/images/test.png
tags:
  - Python
  - Testing
---
# Content here
"""

# The values come back in the order: title, publishDate, excerpt, image, tags
title, pub_date, excerpt, image, tags = extract_frontmatter(test_md)
assert title == "Test Article"
assert str(pub_date) == "2024-01-27"  # str() so the check does not depend on the type the YAML loader gives the date
assert excerpt == "Test excerpt"
assert image == "~/assets/images/test.png"
assert tags == ["Python", "Testing"]


In [None]:

#| export
def extract_markdown_urls_with_tags(md_content: str) -> Dict[str, Dict]:
    """Collect every `[title](url)` occurrence in markdown content.

    Returns a dict keyed by URL. Each value holds two parallel lists:
    'titles' (the bracketed text of each occurrence) and 'lines' (the
    1-based line number where that occurrence was found).
    """
    link_pattern = re.compile(r'\[(.*?)\]\((.*?)\)')
    found: Dict[str, Dict] = {}
    for line_no, text in enumerate(md_content.split('\n'), start=1):
        for link_title, link_url in link_pattern.findall(text):
            # First sighting of a URL creates its entry; later ones extend it
            entry = found.setdefault(link_url, {'titles': [], 'lines': []})
            entry['titles'].append(link_title)
            entry['lines'].append(line_no)
    return found


In [None]:
#|test
# Load the sample markdown once; other test cells in this notebook reuse `test_content`.
# The encoding is explicit so the read does not depend on the platform's default.
with open('../sample/example.md', 'r', encoding='utf-8') as f:
    test_content = f.read()

urls_data = extract_markdown_urls_with_tags(test_content)

# Test specific URLs we know are in the file
assert 'https://emdadelgaz.com' in urls_data
assert 'https://awazly.com/' in urls_data

# Every entry must expose both bookkeeping lists
for url, data in urls_data.items():
    assert 'titles' in data
    assert 'lines' in data


In [None]:

#| export
def extract_markdown_images(file_content: str) -> List[Dict[str, str]]:
    """Collect every `![alt](url)` image reference in markdown content.

    Returns one dict per occurrence, in document order, with the keys
    'alt_text' and 'url'.
    """
    images = []
    for found in re.finditer(r'\!\[(.*?)\]\((.*?)\)', file_content):
        images.append({'alt_text': found.group(1), 'url': found.group(2)})
    return images


In [None]:
#|test    
# `test_content` is expected to hold the sample markdown read by an earlier cell
images = extract_markdown_images(test_content)

# At least one image must be found, including this known alt text and URL
assert len(images) > 0
assert any('Iron man photo' == img['alt_text'] for img in images)
assert any('~/assets/images/28.png' == img['url'] for img in images)


In [None]:
#| export
def detect_numbers(text: str) -> List[str]:
    """Find nine-digit numbers in text, optionally prefixed with +420.

    The digits are matched as three groups of three that may be separated
    by whitespace. Each result is the matched groups joined together, so
    the separators are dropped and the prefix is kept when present.
    """
    pattern = r"(\+420)?\s*?(\d{3})\s*?(\d{3})\s*?(\d{3})"
    # default="" makes a missing +420 prefix contribute nothing to the join
    return ["".join(hit.groups(default="")) for hit in re.finditer(pattern, text)]



In [None]:
# Preview the numbers detected in the sample markdown held in `test_content`
detect_numbers(test_content)

['010136468', '966503139', '010051346']

In [None]:
#|test
# Run the detector on the sample markdown held in `test_content`
numbers = detect_numbers(test_content)
assert '966503139' in numbers  # A number known to appear in the sample
assert len(numbers) > 0  # Should find at least one number


In [None]:

#|export
def calculate_similarity(text1: str, text2: str) -> float:
    """Return difflib's similarity ratio (0.0 to 1.0) for the two texts"""
    matcher = SequenceMatcher(None, text1, text2)
    return matcher.ratio()



In [None]:
#|test
# Identical strings give the maximum ratio
assert calculate_similarity("hello", "hello") == 1.0

# Strings sharing few characters score below one half
assert calculate_similarity("hello", "world") < 0.5

# A shared prefix gives a ratio strictly between one half and one
assert 0.5 < calculate_similarity("hello world", "hello there") < 1.0

# The comparison is case-sensitive, so differing case lowers the ratio
text1 = "Hello World"
text2 = "hello world"
assert calculate_similarity(text1, text2) < 1.0


In [None]:

#|export
def get_file_paths(file_path: str) -> List[str]:
    """
    Expand a glob pattern into the list of paths that match it.

    `file_path` is handed to `glob.glob` as the pattern; the result is an
    empty list when nothing matches.
    """
    matched_paths = glob.glob(file_path)
    return matched_paths

In [None]:
#|export
def get_internal_urls(urls, target_domain):
    """
    Keep only the URLs whose host is exactly `target_domain`.

    The comparison is an exact match on the parsed network location, so
    subdomains and look-alike hosts are not treated as internal. The
    original order of `urls` is preserved.
    """
    return [link for link in urls if urlparse(link).netloc == target_domain]


In [None]:
#|test
# Mix of on-domain URLs, other domains, and a look-alike host without the TLD
test_urls = [
    'https://emdadelgaz.com/about',
    'https://emdadelgaz.com/contact',
    'https://awazly.com/',
    'https://example.com/test',
    'https://emdadelgaz/about.com'
]

# Only the two URLs whose host is exactly emdadelgaz.com count as internal
internal_urls = get_internal_urls(test_urls, 'emdadelgaz.com')
assert len(internal_urls) == 2
assert 'https://emdadelgaz.com/about' in internal_urls
assert 'https://emdadelgaz.com/contact' in internal_urls
assert 'https://awazly.com/' not in internal_urls

# Same check against the links extracted from the sample markdown in `test_content`
urls_data = extract_markdown_urls_with_tags(test_content)
internal = get_internal_urls(urls_data.keys(), 'emdadelgaz.com')
assert len(internal) > 0


In [None]:
#|export
def get_file_name(file_path):
    """Return the file name from a path, without its directory or extension.

    Only the final suffix is removed, whatever its length, so
    `posts/hello.md` and `posts/hello.markdown` both give `hello`, and a
    name without an extension is returned whole.

    Args:
        file_path: Path to the file, as a string or path object.

    Returns:
        The final path component with its last suffix removed.
    """
    # `stem` drops the real suffix instead of blindly cutting three characters
    return Path(file_path).stem

In [None]:
#|export
def get_external_urls(urls, target_domain):
    """
    Return the URLs that point outside `target_domain`, skipping image links.

    A URL is dropped when its host equals `target_domain`, when its host
    equals the first dot-separated label of `target_domain`, or when its
    path ends with a common image extension (case-insensitive). Everything
    else is returned in the original order.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp")

    def is_external(candidate):
        parts = urlparse(candidate)
        if parts.netloc == target_domain:
            return False
        # Host written without its TLD, e.g. "example" for "example.com"
        if parts.netloc == target_domain.split(".")[0]:
            return False
        return not parts.path.lower().endswith(image_suffixes)

    return [candidate for candidate in urls if is_external(candidate)]

In [None]:
#|test
# Mix of on-domain pages, other domains, and image links on both kinds of host
test_urls = [
    'https://emdadelgaz.com/about',
    'https://example.com/page',
    'https://test.com/doc.pdf',
    'https://images.com/test.png',
    'https://awazly.com/',
    'https://emdadelgaz.com/logo.jpg'
]

# Treat emdadelgaz.com as the site's own domain
external_urls = get_external_urls(test_urls, 'emdadelgaz.com')

# Debug print: shows the filtered list in the cell output
print("Found external URLs:", external_urls)

# Only the three non-image URLs on other domains should remain
assert len(external_urls) == 3  # example.com, test.com, and awazly.com
assert 'https://example.com/page' in external_urls
assert 'https://test.com/doc.pdf' in external_urls
assert 'https://awazly.com/' in external_urls
assert 'https://images.com/test.png' not in external_urls  # Should exclude image URLs
assert 'https://emdadelgaz.com/about' not in external_urls  # Should exclude internal URLs


Found external URLs: ['https://example.com/page', 'https://test.com/doc.pdf', 'https://awazly.com/']


In [None]:
#|export
def get_heads_info(file_path):
    """
    Collect every markdown heading (`#` through `######`) found in a file.

    A heading is a line that, once surrounding whitespace is removed,
    starts with one to six `#` characters followed by a space.

    Args:
        file_path: Path of the markdown file to scan (read as UTF-8).

    Returns:
        A list of dicts in file order, each with the keys:
        "type" ("h1" .. "h6"), "line_number" (1-based), "content" (the
        heading text) and "length" (number of characters in "content").
    """
    heading_pattern = re.compile(r"(#{1,6}) (.*)")
    # A closing run of '#' is only decoration when whitespace sets it apart
    # from the text, so a heading such as "Learn C#" keeps its final '#'.
    closing_hashes = re.compile(r"(?:^|\s+)#+$")
    headings = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            match = heading_pattern.match(line.strip())
            if match is None:
                continue
            marker, text = match.groups()
            content = closing_hashes.sub("", text.strip()).strip()
            headings.append(
                {
                    # The number of leading '#' characters is the heading level
                    "type": f"h{len(marker)}",
                    "line_number": line_number,
                    "content": content,
                    "length": len(content),
                }
            )

    return headings

In [None]:
#|hide
def get_num_heads(h_elements):
    """
    Return a list holding the length of every value stored in `h_elements`.

    `h_elements` must offer `.values()` (i.e. be dict-like).
    NOTE(review): the `get_heads_info` defined in this notebook returns a
    list, which has no `.values()`, so its result cannot be passed here
    as-is.
    """
    #! Update this to work with the new dict structure
    return [len(group) for group in h_elements.values()]

In [None]:
#|test

# Scan the sample markdown file for headings
headings = get_heads_info('../sample/example.md')

# Debug print: shows the collected headings in the cell output
print("Found headings:", headings)
# Every entry must be a known heading type and carry all bookkeeping keys
assert len(headings) > 0  # Should find some headings
assert all(h['type'] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] for h in headings)
assert all('line_number' in h for h in headings)
assert all('content' in h for h in headings)
assert all('length' in h for h in headings)

# Test specific headings we know are in the file
h1_headings = [h for h in headings if h['type'] == 'h1']
h2_headings = [h for h in headings if h['type'] == 'h2']
assert any('This is me Kareem' in h['content'] for h in h1_headings)
assert any('How do you know me!' in h['content'] for h in h2_headings)


Found headings: [{'type': 'h1', 'line_number': 15, 'content': 'This is me Kareem', 'length': 17}, {'type': 'h1', 'line_number': 17, 'content': 'This is Kareem Also', 'length': 19}, {'type': 'h2', 'line_number': 21, 'content': 'How do you know me!', 'length': 19}, {'type': 'h2', 'line_number': 25, 'content': 'oh no! iron man!', 'length': 16}, {'type': 'h2', 'line_number': 34, 'content': 'References', 'length': 10}, {'type': 'h3', 'line_number': 42, 'content': 'Books', 'length': 5}, {'type': 'h4', 'line_number': 48, 'content': 'nbdev is super cool!', 'length': 20}, {'type': 'h5', 'line_number': 50, 'content': 'Test Deriven Developement is a life changing!', 'length': 45}, {'type': 'h6', 'line_number': 52, 'content': 'I am an Love with best girl in the whole world!', 'length': 47}]


In [None]:
#|export
def main_keyword_in_start(file_content, keyword, percent=10):
    """
    Tell whether `keyword` occurs in the opening part of `file_content`.

    The opening part is the first `percent` percent of the content
    (10% by default), with the cut-off index rounded down by `int()`.
    The match is case-sensitive.
    """
    cutoff = int(len(file_content) * percent / 100)
    return keyword in file_content[:cutoff]