# Read

> Read X for llm context

In [None]:
#| default_exp read

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from httpx import get

In [None]:
#| export
def read_gist(s, user):
    "Fetch `https://gist.githubusercontent.com/{user}/{s}` and return the response text"
    # This should just be copy/paste url instead
    gist_url = f'https://gist.githubusercontent.com/{user}/{s}'
    response = get(gist_url)
    return response.text

Todo ideas:
- [X] read_url
- [X] read_gist
- [X] read_gh_file
- [ ] read_ghurl
- [X] read_file
- [ ] read_dir
- [ ] read_path
- [ ] read_yt
- [ ] read_gdoc
- [ ] read_gsheet

One possible interface:

```
read_thing(s)
```

Where the function would be smart enough to look at s and determine if
it is:

- a Github URL
- a YT URL
- a Google Doc URL
- a "plain" URL (not identified as more specific)
- a path to a file on disk
- etc...

But this is a convenience interface.

The library should also expose the separate, dedicated `read_*` functions.

Ideally, these should also "just work" when given a single positional argument, with further arguments being optional keyword args that request more specific behavior when necessary (for example, an output format other than a string).

They should also return the same thing, where that thing is whatever
is easiest to spit into context. str? dict?

To start for now let us suppose:
- each read_ function MUST work with one positional arg and MUST return a string.
- later: optional args, maybe controlling other output formats, such as a dictionary, a Claude-optimized bit of XML, etc.



## Defining read_ functions

In [None]:
%%ai
Tell me about the python function or package html2md or html2text


There are two popular Python packages for converting HTML to Markdown:

1. html2text:
   - Widely used and mature library
   - Converts HTML to Markdown-formatted plain text
   - Available via pip: `pip install html2text`
   - Usage:
     ```python
     import html2text
     h = html2text.HTML2Text()
     markdown = h.handle("<h1>Hello World</h1>")
     ```

2. html2markdown:
   - Newer alternative
   - Aims to produce cleaner Markdown output
   - Available via pip: `pip install html2markdown`
   - Usage:
     ```python
     from html2markdown import convert
     markdown = convert("<h1>Hello World</h1>")
     ```

Both libraries are useful for tasks like web scraping, content processing, or converting HTML emails to plain text.

In [None]:
#def read_url_with_playwright(url):
#    "NOT THE REAL FN NAME"
#    from playwrightnb import url2md
#    return url2md(url)

def read_url(url):
    "Fetch `url` with httpx and return the response text passed through `html2text.html2text`"
    # Imports are kept function-local, as in the original cell.
    import html2text
    import httpx
    response = httpx.get(url)
    return html2text.html2text(response.text)
   

In [None]:
def read_gist(s, user):
    "Return the text served at `https://gist.githubusercontent.com/{user}/{s}`"
    # `user` comes first in the URL path, then `s`.
    target = f'https://gist.githubusercontent.com/{user}/{s}'
    return get(target).text

In [None]:
# A gist.github.com URL of the form /{user}/{id}; used below as sample input.
sample_gist_url = "https://gist.github.com/algal/a490024ad088de1b857531c83abef0a0"
# A gist.githubusercontent.com URL for the same user/id with a /raw/{hash}/{filename} suffix.
raw_gist_url = "https://gist.githubusercontent.com/algal/a490024ad088de1b857531c83abef0a0/raw/d8b04e5b7c11d5b753b9225978e0216098295e9a/iterm2-url.source"
# The same gist.githubusercontent.com URL with only /raw appended; the target form for the conversion below.
simpleraw_gist_url = "https://gist.githubusercontent.com/algal/a490024ad088de1b857531c83abef0a0/raw"

In [None]:
print(sample_gist_url)

https://gist.github.com/algal/a490024ad088de1b857531c83abef0a0


In [None]:
print(simpleraw_gist_url)

https://gist.githubusercontent.com/algal/a490024ad088de1b857531c83abef0a0/raw


In [None]:
%%aip
generate python code which uses regexes to go from a URL like
sample_gist_url to the URL in simpleraw_gist_url

In [None]:
import re

def gist_url_to_raw(url):
    "Convert a `https://gist.github.com/{user}/{id}` URL into `https://gist.githubusercontent.com/{user}/{id}/raw`, or return None if `url` does not match"
    # `?` and `#` are excluded from both path segments so that a query string
    # or fragment (e.g. `#file-foo-py`) is not copied into the raw URL's path.
    pattern = r'https://gist\.github\.com/([^/?#]+)/([^/?#]+)'
    match = re.match(pattern, url)
    if match is None:
        # Comment: If the URL doesn't match the expected pattern, we return None
        # You may want to raise an exception or handle this case differently
        return None
    user, gist_id = match.groups()
    return f'https://gist.githubusercontent.com/{user}/{gist_id}/raw'

# Test the function
result = gist_url_to_raw(sample_gist_url)
print(result)

In [None]:
def read_gist(url):
    "Returns raw gist content, or None"
    import re
    # `?` and `#` are excluded so a query string or fragment on the gist page
    # URL does not end up inside the raw URL's path.
    pattern = r'https://gist\.github\.com/([^/?#]+)/([^/?#]+)'
    match = re.match(pattern, url)
    if match is None:
        return None
    # Imported locally (like `re` above): the visible module-level code only
    # binds `get` from httpx, so the bare name `httpx` would otherwise be undefined.
    import httpx
    user, gist_id = match.groups()
    raw_url = f'https://gist.githubusercontent.com/{user}/{gist_id}/raw'
    return httpx.get(raw_url).text
    
    

In [None]:
# A github.com URL of the form /{owner}/{repo}/blob/{ref}/{path}; sample input for the conversion below.
gh_file_url="https://github.com/hamelsmu/getrich-fasthtml/blob/main/.gitignore"
# The raw.githubusercontent.com form for the same file, with refs/heads/ before the ref; the target of the conversion.
gh_raw_file_url="https://raw.githubusercontent.com/hamelsmu/getrich-fasthtml/refs/heads/main/.gitignore"
# Show both URLs for side-by-side comparison.
print(gh_file_url)
print(gh_raw_file_url)

https://github.com/hamelsmu/getrich-fasthtml/blob/main/.gitignore
https://raw.githubusercontent.com/hamelsmu/getrich-fasthtml/refs/heads/main/.gitignore


In [None]:
%%aip
Please generate regex code to transform gh_file_url-like URLs to gh_raw_file_url-like URLs

In [None]:
import re

def github_url_to_raw(url):
    "Rewrite any `https://github.com/.../blob/...` URL found in `url` to its `raw.githubusercontent.com/.../refs/heads/...` form; text that does not match is returned unchanged"
    blob_re = re.compile(r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)')
    # The four captured path pieces are copied across in order; the third is
    # placed after `refs/heads/`.
    raw_template = r'https://raw.githubusercontent.com/\1/\2/refs/heads/\3/\4'
    return blob_re.sub(raw_template, url)

# Test the function
result = github_url_to_raw(gh_file_url)
print(result)

# Comment: This function assumes the URL structure remains consistent.
# It may need adjustment if GitHub changes their URL format.
# Also, it doesn't handle cases where the URL doesn't match the expected pattern.
# You might want to add error handling for such cases.

In [None]:
def read_gh_file(url):
    "Fetch and return the text of a GitHub file given its `https://github.com/{owner}/{repo}/blob/{ref}/{path}` URL"
    import re
    # Imported locally (like `re`): the visible module-level code only binds
    # `get` from httpx, so the bare name `httpx` would otherwise be undefined.
    import httpx
    pattern = r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)'
    replacement = r'https://raw.githubusercontent.com/\1/\2/refs/heads/\3/\4'
    # re.sub returns `url` unchanged when the pattern does not match, so such
    # URLs are fetched exactly as given.
    raw_url = re.sub(pattern, replacement, url)
    return httpx.get(raw_url).text

    

In [None]:
s = read_gh_file(gh_file_url)

In [None]:
def read_file(path):
    "Return the full text content of the file at `path`"
    # Use a context manager so the file handle is closed promptly instead of
    # being left for the garbage collector.
    with open(path, 'r') as f:
        return f.read()


In [None]:
%%aip
Generate Python code which reads all files below a certain path,
concatenating their contents into a single string, adding within
the string delimiter lines which communicate the paths of the
individual files. Use glob patterns, please.

In [None]:
import os
import glob

def read_dir(path,
             exclude_binaries=True,
             excluded_patterns=[".git/**"],
             verbose=True):
    "Concatenate the contents of every file found under `path` into one string, wrapping each file in delimiter lines that name it"
    # NOTE: `exclude_binaries` and `excluded_patterns` are accepted but not yet
    # applied in this version (see the TODOs below). The list default is never
    # mutated here, so sharing it between calls is harmless.
    pattern = '**/*'
    result = []
    for file_path in glob.glob(os.path.join(path, pattern), recursive=True):
        # TODO: conditional here to verify file_path does NOT match any excluded patterns
        if not os.path.isfile(file_path):
            continue  # glob also yields directories; only files have content
        # TODO: conditional here to skip files which are binaries not text
        if verbose:
            print(f"Including {file_path}")
        result.append(f"--- File: {file_path} ---")
        # errors='ignore' drops undecodable bytes rather than raising.
        with open(file_path, 'r', errors='ignore') as f:
            result.append(f.read())
        result.append(f"--- End of {file_path} ---")
    return '\n'.join(result)

# Usage example:
# content = read_dir('/path/to/directory')
# print(content)

# Comment: This function hard-codes '**/*' as the glob pattern, which includes all files.
# To restrict it (e.g. to '**/*.py' for Python files only), change `pattern` inside the function.
# The 'errors="ignore"' in open() helps handle potential encoding issues,
# but you might want to adjust this based on your specific needs.

In [None]:
%%aip
Observe the TODO comments in the code above. Please generate a new
function which fills those TODOs, using only stdlib modules.

In [None]:
import os
import glob
import fnmatch
import mimetypes

def is_binary(file_path):
    "Return True if `file_path` looks like a binary (non-text) file"
    mime = mimetypes.guess_type(file_path)[0]
    # A `text/*` MIME type guessed from the extension is trusted as text.
    if mime is not None and mime.startswith('text'):
        return False
    # The extension alone is not reliable otherwise: textual formats such as
    # `.json` map to `application/*`, and extensionless files have no guess at
    # all. Sniff the first bytes for a NUL, which text files rarely contain.
    try:
        with open(file_path, 'rb') as f:
            return b'\0' in f.read(8192)
    except OSError:
        # Unreadable or missing file: fall back to the extension-only rule
        # (any known non-text MIME type counts as binary).
        return mime is not None

def read_dir(path,
             exclude_binaries=True,
             excluded_patterns=[".git/**"],
             verbose=True):
    "Concatenate the contents of the files under `path` into one string, wrapping each file in delimiter lines that name it"
    # `excluded_patterns`: fnmatch patterns, tested against each file's path
    #   relative to `path` (and also against the globbed path as-is, which is
    #   what earlier code matched on). The list default is only read, never mutated.
    # `exclude_binaries`: when true, files for which `is_binary` returns True are skipped.
    # `verbose`: when true, print each included file path.
    pattern = '**/*'
    result = []
    # Sorted so the output order does not depend on the file system's listing order.
    for file_path in sorted(glob.glob(os.path.join(path, pattern), recursive=True)):
        # Patterns such as ".git/**" are written relative to `path`; matching
        # them only against the full globbed path would never succeed.
        rel_path = os.path.relpath(file_path, path)
        if any(fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(file_path, pat)
               for pat in excluded_patterns):
            continue
        if not os.path.isfile(file_path):
            continue  # glob also yields directories; only files have content
        if exclude_binaries and is_binary(file_path):
            continue
        if verbose:
            print(f"Including {file_path}")
        result.append(f"--- File: {file_path} ---")
        # errors='ignore' drops undecodable bytes rather than raising.
        with open(file_path, 'r', errors='ignore') as f:
            result.append(f.read())
        result.append(f"--- End of {file_path} ---")
    return '\n'.join(result)

# Comment: This implementation uses fnmatch for pattern matching and
# mimetypes for binary file detection. Note that mimetypes is not 100% accurate
# for binary detection, so you might want to implement a more robust method
# if accuracy is crucial.