# Read

> Read X for LLM context

In [1]:
#| default_exp read

In [2]:
#| hide
from nbdev.showdoc import *

## Goals

Todo ideas:
- [X] read_url
  - [ ] should enhance to use the existing version which takes a css selector
- [ ] read_nb (smart enough to use a nbdev_clean etc..)
- [X] read_gist
- [X] read_gh_file
- [ ] read_ghurl
- [X] read_file
- [X] read_dir
  - [ ] should enhance not just to read unicode text, but to adapt to PDFs, ipynbs, etc.
- [X] read_pdf
- [ ] read_msword
- [ ] read_gdoc
- [ ] read_yt
- [X] read_yt_transcript
- [X] read_gsheet

One possible interface:

```
read_thing(s)
```

Where the function would be smart enough to look at s and determine if
it is:

- a Github URL
- a YT URL
- a Google Doc URL
- a "plain" URL (not identified as more specific)
- a path to a file on disk
- etc...

But this is a convenience interface.

The library should also expose the separate, dedicated `read_*` functions.

Ideally, these should also "just work" when given a single positional argument, with further arguments being optional keyword args that request more specific behavior when necessary (for example, an output format other than a string).

They should also return the same thing, where that thing is whatever
is easiest to spit into context. str? dict?

To start for now let us suppose:
- each read_ function MUST work with one positional arg and MUST return a string.
- later: optional args, maybe controlling other output formats, such as a dictionary, a Claude-optimized bit of XML, etc.



requirements.txt

```
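PyPDF2
httpx
requests
html2text
fastcore
toolslm
youtube_transcript_api
pytube
playwrightnb  # optional: only needed for read_url(..., heavy=True)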
```

## Imports

In [3]:
#| export
import httpx 
import html2text
from fastcore.all import delegates, ifnone

import re, os, glob, string
import requests
import fnmatch, mimetypes

from PyPDF2 import PdfReader
from toolslm.download import html2md, read_html

In [4]:
# from aimagic import create_magic,models
# create_magic(models[1])

## Defining read_ functions

### URL

In [5]:
#| export
def read_url(url, # url to read
             heavy=False, # Use playwrightnb's browser-based `url2md` instead of `read_html`
             sel=None, # CSS selector used on the heavy path (defaults to 'body')
             **kwargs): # Extra keyword args forwarded to `url2md` / `read_html`
    "Reads a url and converts to markdown"
    if heavy: 
        # Imported here so playwrightnb is only required when the heavy path is used.
        from playwrightnb import url2md
        # The result must be returned: previously it was computed and dropped,
        # and the function fell through to the `read_html` fetch below.
        return url2md(url,sel=ifnone(sel,'body'), **kwargs)
    # NOTE(review): `sel` is not forwarded on this path, so a selector passed
    # with heavy=False has no effect; confirm whether `read_html` should receive it.
    return read_html(url,**kwargs)

In [6]:
# httpx.get 
read_url('https://docs.fastht.ml/')[:200]

'  * Home\n  * Learn\n\n  * __\n  * __\n\n__\n\n  1. Get Started\n\n  * Get Started\n\n  * Tutorials __\n\n    * FastHTML By Example\n\n    * Web Devs Quickstart\n\n    * JS App Walkthrough\n\n    * Using Jupyter to write'

In [None]:
#| eval: false
# contactless browser w/ playwrightnb
read_url('https://docs.fastht.ml/', True)[:200]

### Github

#### Gist

In [8]:
#| export
def read_gist(url):
    "Returns raw gist content, or None if `url` is not a gist.github.com URL"
    # The id group stops at `?` and `#` so that share links carrying a query
    # string or a `#file-...` anchor still produce a clean `/raw` URL.
    pattern = r'https://gist\.github\.com/([^/]+)/([^/?#]+)'
    match = re.match(pattern, url)
    if not match:
        return None
    user, gist_id = match.groups()
    raw_url = f'https://gist.githubusercontent.com/{user}/{gist_id}/raw'
    return httpx.get(raw_url).text

In [9]:
sample_gist_url = "https://gist.github.com/algal/a490024ad088de1b857531c83abef0a0"
read_gist("https://gist.github.com/algal/a490024ad088de1b857531c83abef0a0")[:200]

"#!/usr/bin/env python3\nimport os, os.path, sys, urllib.parse, base64, subprocess\n\ndef on_iterm2(): return 'ITERM_SESSION_ID' in os.environ or os.environ.get('LC_TERMINAL','') == 'iTerm2'\n\ndef on_macOS"

#### URL

In [10]:
%%aip 0
Please generate regex code to transform gh_file_url-like URLs to gh_raw_file_url-like URLs

In [11]:
def github_url_to_raw(url):
    "Rewrites a github.com `blob` file URL found in `url` to its raw.githubusercontent.com form"
    blob_url = re.compile(r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)')
    raw_template = r'https://raw.githubusercontent.com/\1/\2/refs/heads/\3/\4'
    # Groups are user, repo, ref and file path; text that does not match is returned as-is.
    return blob_url.sub(lambda m: m.expand(raw_template), url)

In [12]:
github_url_to_raw("https://github.com/hamelsmu/getrich-fasthtml/blob/main/.gitignore")

'https://raw.githubusercontent.com/hamelsmu/getrich-fasthtml/refs/heads/main/.gitignore'

#### File

In [13]:
#| export
def read_gh_file(url):
    "Returns the response text for the raw version of a github.com `blob` file URL"
    blob_url = re.compile(r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)')
    raw_template = r'https://raw.githubusercontent.com/\1/\2/refs/heads/\3/\4'
    # NOTE(review): the ref is always placed under `refs/heads/`, which presumably
    # only resolves for branches (not tags or commit SHAs); confirm.
    # The HTTP status is not checked, so an error body is returned like file content.
    return httpx.get(blob_url.sub(lambda m: m.expand(raw_template), url)).text

In [14]:
read_gh_file("https://github.com/hamelsmu/getrich-fasthtml/blob/main/.gitignore")[:200]

'404: Not Found'

### Local Files

####  Files

In [15]:
#| export
def read_file(path):
    "Returns the full text content of the file at `path`"
    # The context manager closes the handle as soon as the read finishes,
    # instead of leaving an open file for the garbage collector.
    with open(path,'r') as f:
        return f.read()

In [16]:
%%aip 0
Generate Python code which reads all files below a certain path,
concatenating their contents into a single string, adding within
the string delimiter lines which communicate the paths of the
individual files. Use glob patterns, please.

In [17]:
%%aip 0
Observe the TODO comments in the code above. Please generate a new
function which fills those TODOs, using only stdlib modules.

In [18]:
#|export
def is_unicode(filepath, sample_size=1024):
    "True if reading up to `sample_size` characters of `filepath` in text mode raises no `UnicodeDecodeError`"
    try:
        # Only the decoding matters here; the text that was read is discarded.
        with open(filepath, 'r') as fh:
            fh.read(sample_size)
    except UnicodeDecodeError:
        return False
    else:
        return True

In [19]:
assert is_unicode('_quarto.yml')

#### Directory

In [20]:
#| export
def read_dir(path, # Root directory to read recursively
             exclude_non_unicode=True, # Skip files that `is_unicode` rejects
             included_patterns=["*"], # fnmatch patterns; a file is kept only if it matches at least one
             excluded_patterns=[".git/**"], # fnmatch patterns; a file matching any of them is skipped
             verbose=True, # Print the path of every included file
             as_string=True): # Return one delimited string instead of a `{path: text}` dict
    "Reads the text of all files below `path`, as one delimited string or a dict keyed by file path"
    # The list defaults are only iterated, never mutated, so sharing them between calls is safe.
    def _matches_any(names, patterns):
        "True if any of `names` matches any fnmatch pattern in `patterns`"
        return any(fnmatch.fnmatch(name, pat) for name in names for pat in patterns)

    result = {}
    for file_path in glob.glob(os.path.join(path, '**/*'), recursive=True):
        # fnmatch must match the whole string, and the globbed path starts with
        # `path` itself, so root-relative patterns such as ".git/**" or "sub/*"
        # could never match it. Test the root-relative path as well, and keep
        # the full path so patterns written against it continue to work.
        names = (file_path, os.path.relpath(file_path, path))
        if _matches_any(names, excluded_patterns):
            continue
        if not _matches_any(names, included_patterns):
            continue
        if not os.path.isfile(file_path):
            continue
        if exclude_non_unicode and not is_unicode(file_path):
            continue
        if verbose:
            print(f"Including {file_path}")
        # errors='ignore' drops undecodable bytes rather than raising mid-file.
        with open(file_path, 'r', errors='ignore') as f:
            result[file_path] = f.read()
    if not as_string:
        return result
    return '\n'.join(f"--- File: {fp} ---\n{text}\n--- End of {fp} ---" for fp, text in result.items())

In [21]:
read_dir('.',verbose=False)[:200]

'--- File: ./_quarto.yml ---\nproject:\n  type: website\n\nformat:\n  html:\n    theme: cosmo\n    css: styles.css\n    toc: true\n    keep-md: true\n  commonmark: default\n\nwebsite:\n  twitter-card: true\n  open-g'

### PDF reader

In [None]:
#| export
def read_pdf(file_path: str) -> str:
    "Returns the extracted text of every page of the PDF at `file_path`, joined with single spaces"
    with open(file_path, 'rb') as pdf_file:
        # Extract while the file is still open; the reader works on the open handle.
        page_texts = [pg.extract_text() for pg in PdfReader(pdf_file).pages]
    return ' '.join(page_texts)

In [None]:
read_pdf('./test_dir/test.pdf')

' \n  \n   \nThis is a test PDF document. \nIf you can read this, you have Adobe Acrobat Reader installed on your computer. '

### YT Transcript

In [None]:
#| export
def read_yt_transcript(yt_url):
    "Returns the transcript of the video at `yt_url` as one space-joined string, or None if the URL cannot be parsed"
    # Imported here so these packages are only required when this reader is used.
    from pytube import YouTube
    from youtube_transcript_api import YouTubeTranscriptApi
    try:
        # pytube is used only to obtain the video id from the URL.
        video_id = YouTube(yt_url).video_id
    except Exception as e:
        print(f"An error occurred parsing yt url: {e}")
        return None
    # Errors while fetching the transcript itself are not caught and propagate to the caller.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return ' '.join(entry['text'] for entry in transcript)

In [None]:
yt_url = "https://youtu.be/MRtg6A1f2Ko?si=C7YZU6FFLdi6v9rk"
s = read_yt_transcript(yt_url)
s[:200]

'- [Tim] A widescreen\niPod with touch controls, a revolutionary mobile phone, and a breakthrough internet\ncommunications device. (energetic music) (phone vibrating) Profound new intelligence capabiliti'

### Google Sheet

In [None]:
#| export
def read_google_sheet(orig_url):
    "Returns the CSV export (as bytes) of the sheet tab referenced by a Google Sheets URL"
    sheet_id = orig_url.split('/d/')[1].split('/')[0]
    # Use the tab selected in the URL (`?gid=...` or `#gid=...`). The export was
    # previously pinned to gid=0, so a link to any other tab returned the gid=0 tab.
    gid_match = re.search(r'[?&#]gid=(\d+)', orig_url)
    gid = gid_match.group(1) if gid_match else '0'
    csv_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid={gid}'
    res = requests.get(url=csv_url)
    # NOTE(review): this returns bytes (`res.content`) while the other readers
    # return str; kept as-is for existing callers. Confirm the intended type.
    return res.content

In [None]:
read_google_sheet('https://docs.google.com/spreadsheets/d/17Q3LzRCyM4md28IBxzSSERpaafLgOH8MjH5r6UkyVz8/edit?gid=0#gid=0')

b'Band Pull Around/Aparts\r\nShoulder Dislocations Straight\r\nShoulder Dislocations Side\r\nSuperman Dislocation\r\nScorpion Chest Stretch\r\nLatt Pulldown\r\nTwisty Shoulders\r\nRotator Cuff Pull\r\nWide bent over row'

### Google Doc

In [None]:
#| export
def gdoc_url_to_parseable(url):
    "Rewrites a Google Docs `/edit` URL to its `export?format=html` URL"
    # Also consume any `?query` or `#fragment` following `/edit` (for example
    # `?usp=sharing`); otherwise it would be left dangling after the export
    # query string and yield `...export?format=html?usp=sharing`.
    pattern = r'(https://docs\.google\.com/document/d/[^/]+)/edit(?:[?#]\S*)?'
    replacement = r'\1/export?format=html'
    return re.sub(pattern, replacement, url)

In [None]:
# Test the function
result = gdoc_url_to_parseable("https://docs.google.com/document/d/13g-IDyuJyk5wE60bOH1YhhFgW8rlh2LnSXccBS0CQd0/edit")
print(result)

https://docs.google.com/document/d/13g-IDyuJyk5wE60bOH1YhhFgW8rlh2LnSXccBS0CQd0/export?format=html


In [None]:
#| export
def read_gdoc(url):
    "Fetches the HTML export of the Google Doc at `url` and returns it converted by `html2text`"
    import html2text
    # The document id is the path segment that follows `/d/`.
    doc_id = url.split('/d/')[1].split('/')[0]
    html = requests.get(f'https://docs.google.com/document/d/{doc_id}/export?format=html').text
    return html2text.html2text(html)

In [None]:
read_gdoc("https://docs.google.com/document/d/13g-IDyuJyk5wE60bOH1YhhFgW8rlh2LnSXccBS0CQd0/edit")[:200]

'# Top heading\n\nHello this is a context reading test\n\n## Heading 2\n\nBolded text is here as well as italisized\n\n  * I have bullets\n  * Of things\n\n## Heading 3\n\nAnd ordered\n\n  1. Lists\n  2. Of\n  3. Thing'

## Next:

### GitHub Repo

In [None]:
#| export
import tempfile, subprocess, os, re, shutil
from pathlib import Path

In [None]:
#| export

def gh_ssh_from_gh_url(gh_repo_address):
    "Given a GH URL or SSH remote address, returns a GH SSH remote address or None"
    if gh_repo_address.startswith("git@github.com:"):
        # Already an SSH remote address; pass it through untouched.
        return gh_repo_address
    # The repo segment stops at `/`, `?` or `#`, so deeper paths, query strings
    # and fragments do not end up in the repository name.
    pattern = r'https://github\.com/([^/]+)/([^/?#]+)'
    if match := re.match(pattern, gh_repo_address):
        user, repo = match.groups()
        # HTTPS clone URLs already end in `.git`; drop it so the suffix added
        # below is not doubled into `repo.git.git`.
        if repo.endswith('.git'):
            repo = repo[:-len('.git')]
        return f'git@github.com:{user}/{repo}.git'
    # Not a GitHub URL or a GitHub SSH remote address
    return None

def get_default_branch(repo_path):
    "Name of the branch `origin/HEAD` points to in the repo at `repo_path`, or 'main' if it cannot be determined"
    prefix = 'refs/remotes/origin/'
    try:
        result = subprocess.run(['git', 'symbolic-ref', 'refs/remotes/origin/HEAD'], 
                                cwd=repo_path, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return 'main'  # Default to 'main' if we can't determine the branch
    ref = result.stdout.strip()
    # Strip the remote prefix instead of keeping only the last path segment, so
    # branch names that contain `/` (e.g. `release/1.0`) are returned in full.
    if ref.startswith(prefix):
        return ref[len(prefix):]
    return ref.split('/')[-1]

def get_git_repo(gh_ssh):
    "Fetches from a GH SSH address, returns the path of the cached clone, or None if cloning fails"
    repo_name = gh_ssh.split('/')[-1]
    # Strip only a trailing `.git`. Replacing every occurrence mangled names
    # such as `my.github.io` into `myhub.io`, which no longer matched the
    # directory `git clone` creates and made the move below fail.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    cache_dir = Path(os.environ.get('XDG_CACHE_HOME', Path.home() / '.cache')) / 'contextkit_git_clones'
    cache_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the cache entry is keyed by repository name only, so two
    # repositories with the same name under different owners share one entry.
    repo_dir = cache_dir / repo_name

    if repo_dir.exists():
        print("Repo already cached. Updating.")
        try:
            subprocess.run(['git', 'fetch'], cwd=repo_dir, check=True, capture_output=True)
            default_branch = get_default_branch(repo_dir)
            # Discards any local changes in the cached clone.
            subprocess.run(['git', 'reset', '--hard', f'origin/{default_branch}'], 
                           cwd=repo_dir, check=True, capture_output=True)
            return str(repo_dir)
        except subprocess.CalledProcessError:
            shutil.rmtree(repo_dir)  # Remove the cached directory if update fails

    # Clone into a scratch directory first so a failed clone never leaves a
    # partial checkout in the cache, then move the result into place.
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            print("Cloning repo.")
            subprocess.run(['git', 'clone', gh_ssh], cwd=temp_dir, check=True, capture_output=False)
            cloned_dir = Path(temp_dir) / repo_name
            shutil.move(str(cloned_dir), str(repo_dir))
            return str(repo_dir)
        except subprocess.CalledProcessError as e:
            print(f"Error cloning repo from cwd {temp_dir} with error {e}")
            return None

def read_git_path(path):
    "Contents of the repo checkout at `path`; currently read as a plain directory via `read_dir`"
    # TODO: ?enhance to read repos more specifically than directories
    contents = read_dir(path)
    return contents

In [None]:
#| export
def read_gh_repo(path_or_url):
    "Repo contents from path, GH URL, or GH SSH address"
    gh_ssh = gh_ssh_from_gh_url(path_or_url)
    # Anything that is not a GitHub address is treated as a local path and read
    # directly. `get_git_repo` must not be called with None, which used to crash.
    if not gh_ssh:
        return read_git_path(path_or_url)
    # Clone or update exactly once; the earlier debug print triggered a second
    # fetch of the same repository.
    path = get_git_repo(gh_ssh)
    if path is None:
        # `get_git_repo` returns None when cloning fails; fail with a clear message.
        raise RuntimeError(f"Could not clone or update {gh_ssh}")
    return read_git_path(path)

How to use it:

In [None]:
ghurl="https://github.com/AnswerDotAI/claudette"

In [None]:
# s = read_gh_repo(ghurl)