In [None]:
from pathlib import Path
import re

import pandas as pd
import altair as alt
from parse import parse, search, findall
from lark import Lark
from lark.lexer import Token
from lark.tree import Tree

# Read a Text File

In order to parse some text, you need to get it into your program. Sure, you could hard code it in as a variable, but that's not as useful. So we'll start by looking at ways to use native functions to read text.

Function | Use
---|---
`f.read()` | Read the **entire** file in at once
`f.readlines()` | Read the **entire** file in, split on each line. (`f.read().split('\n')`, but preserves `\n` character)
`f.readline()` | Read **one line** in at a time. Good for very long files that may not fit into memory. (preserves `\n` character)

In [None]:
# Whole file in a single string
with open('data/couch.txt') as handle:
    # read() with no argument returns everything up to end of file
    text = handle.read()
print(text)

In [None]:
# Walk the file line by line with a running index.
# Each line still ends with its own \n, so print is told not to add a second one.
with open('data/couch.txt') as src:
    numbered = enumerate(src.readlines())
    for line_no, content in numbered:
        print(line_no, content, end='')

In [None]:
# Pull a single line per iteration so a huge file never has to fit in RAM all at once
with open('data/couch.txt') as f:
    # readline() returns '' only at end of file; iter() stops on that sentinel
    for line in iter(f.readline, ''):
        print(line, end='')

Python's `pathlib` module can help make this a little cleaner.

In [None]:
# Whole-file read via pathlib: read_text() opens, reads and closes the file in one call
couch_path = Path('data/couch.txt')
txt = couch_path.read_text()
print(txt)

In [None]:
# One list entry per line; splitting on '\n' removes the newline characters themselves
raw_text = Path('data/couch.txt').read_text()
lines = raw_text.split('\n')
lines

# Parsing Info From Text File With String Methods
https://docs.python.org/3/library/stdtypes.html#string-methods

In [None]:
# Drop blank lines by relying on the empty string being falsy.
# That is only valid because the split above already removed the \n from every line.
for non_blank in filter(None, lines):
    print(non_blank)

In [None]:
# Every line containing a `:` becomes a name -> value entry, split at the first colon only
variables = {}
for line in lines:
    name, colon, value = line.partition(':')
    if not colon:
        # no colon on this line, so it is not a "name: value" pair
        continue
    variables[name.strip()] = value.strip()
variables

In [None]:
# Keep only the "name: value" lines whose value consists purely of numeric characters
for line in lines:
    _, colon, value = line.partition(':')
    # lines without a colon have nothing to test and are skipped
    if colon and value.strip().isnumeric():
        print(line)

In [None]:
# isnumeric() rejects values containing a `.` (e.g. the MPI version), so let float() decide instead
for line in lines:
    _, colon, value = line.partition(':')
    if not colon:
        # no "name: value" structure on this line
        continue
    try:
        float(value)
    except ValueError:
        # the value is not something float() can parse
        continue
    print(line)

# Parse using Regular Expressions
https://docs.python.org/3/library/re.html

https://regex101.com/

Function | Use
---|---
`re.search(pattern, text)` | See if `pattern` is **in** `text`, return first
`re.match(pattern, text)` | See if `text` **starts** with `pattern`, return first
`re.findall(pattern, text)` | Find **all** occurrences of `pattern` in `text`, returns a list of **strings**
`re.finditer(pattern, text)` | Find **all** occurrences of `pattern` in `text`, returns an iterator of **match objects**
`re.split(pattern, text, maxsplit)`| Split `text` on `pattern`, at most `maxsplit` times

Pattern | Meaning
---|---
`.` | Match **anything** other than a new line `\n`
`^` | Match at **start** of text
`$` | Match at **end** of text
`*` | Pattern appears **0 or more** Times
`+` | Pattern appears **1 or more** Times
`?` | Pattern appears **0 or 1** Times
`{m}` | Pattern appears **m** number of Times
`{m,n}` | Pattern appears **between `m` and `n`** Times
`[]` | Define a **set** of characters to match
`()` | Define a **group** of characters to match

Sequence | Meaning | Sequence | Meaning
---|---|---|---
`\d` | Any digit 0-9 | `\D` | Anything but a digit 0-9
`\s` | Any whitespace `[ \t\n\r\f\v]` | `\S` | Anything but whitespace
`\w` | Any word character `[a-zA-Z0-9_]` | `\W` | Anything but a word character

In [None]:
# Raw string: keeps the backslash in \w for the regex engine and avoids Python's
# "invalid escape sequence" warning for a plain string literal.
pattern = r'\w+ Discord'
text = "The Python Discord is cool, but the PyRVA Discord is better!"
print(re.match(pattern, text))    # None: the text does not *start* with the pattern
print(re.search(pattern, text))   # match object for the first occurrence
print(re.findall(pattern, text))  # list with every matching substring

In [None]:
# `search` hands back a match object, from which the matched text and any sub-groups can be pulled.
# Index 0 is always the whole match; sub-groups are numbered from 1.
# `match` would have produced the same kind of object had the text started with the pattern.
result = re.search(pattern, text)
result[0]

In [None]:
# () defines a sub-group, in this case, the names of the servers.
# Raw string so \w is passed to the regex engine as written (no invalid-escape warning).
pattern = r'(\w+) Discord'
# group(1) is the text captured by the first (and only) pair of parentheses
[r.group(1) for r in re.finditer(pattern, text)]

In [None]:
# Let's look at a small section of a file.
# `txt` is the full content of data/couch.txt that was read with pathlib earlier.
print(txt)

In [None]:
# Let's have a date! We can contrive a pattern that will match the dates. Note how the developers were annoying and used a bunch of different formats
# Raw string so \s and \d reach the regex engine untouched (and Python does not warn about invalid escapes).
pattern = r'[a-zA-Z]+\s+\d+(, \d+)?\s+\d+:\d+:\d+'
for line in lines:
    if match := re.search(pattern, line):  # I am the walrus!
        print(line)
        print(match)
        print(match.group(0))  # the whole matched date
        print(match.group(1))  # the optional ", <year>" part; None when the year is absent
        print()

In [None]:
# First date! search will return the first match it finds.
# The result is a single match object, or None when nothing in `txt` matches.
re.search(pattern, txt)

In [None]:
# All dates! findall and finditer will find multiple matches.
# finditer yields match objects lazily, so list() is used to show all of them at once.
list(re.finditer(pattern, txt))

In [None]:
# Only the matched text: group() with no argument is group 0, i.e. the whole match
[found.group() for found in re.finditer(pattern, txt)]

In [None]:
# You could make things a *little* easier to read by breaking it up, but this really isn't exactly easier to read either.
# Raw strings keep the regex escapes (\s, \d) intact and warning-free.
date = r'[a-zA-Z]+\s+\d+'   # month name followed by the day of month
year = r'(, \d+)?'          # optional ", <year>"
time = r'\s+\d+:\d+:\d+'    # hh:mm:ss
pattern = date + year + time
list(re.finditer(pattern, txt))

In [None]:
# What if we want to see how the model progressed over time. We can extract data and look at the simulation time vs wall clock time.
# Here is the info we want to parse. The wall clock time is on one line while the total time is on another.
# Time Step       1   March  7, 2021  22:29:32
# Step Size:    0.102E+00 s, Total Time:       0.10 s

# Raw strings keep the regex escapes (\s, \d, \.) intact and avoid invalid-escape warnings.
pattern = (
    r'Time Step\s+(?P<timestep>\d+)'
    r'\s+(?P<date>[a-zA-Z]+\s+\d+,\s+\d+\s+\d+:\d+:\d+)'
    r'\s+Step Size:\s+(?P<stepsize>[0-9\.E+-]+)\s+s,'
    r'\s+Total Time:\s+(?P<simtime>[0-9\.]+)'
)  # Python concatenates adjacent string literals that are not separated by a comma

out = Path('data/couch/couch.out').read_text()
# One row per match; the named groups become the column names
df = pd.DataFrame([r.groupdict() for r in re.finditer(pattern, out)])
df['date'] = pd.to_datetime(df.date)
# Regex captures are always strings; convert the numeric columns so they sort and plot as numbers
numeric_columns = ['timestep', 'stepsize', 'simtime']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
df

In [None]:
# Line chart of simulation time (y, declared quantitative with `:Q`) against the wall clock date (x)
alt.Chart(df).mark_line().encode(x='date', y='simtime:Q')

# Parse Stuff with Parse
https://pypi.org/project/parse/

>`parse()` is the opposite of `format()`

Function | Use | Regular Expression
---|---|---
`parse(ptn, txt)` | See if the **whole** of `txt` matches `ptn` exactly | `re.fullmatch`
`search(ptn, txt)` | See if `ptn` is **in** `txt`, return first | `re.search`
`findall(ptn, txt)` | Find **all** occurrences of `ptn` in `txt` | `re.finditer`


There is support for regex, but you can discover that on your own.

In [None]:
# Here is the snippet of text again
# Time Step       1   March  7, 2021  22:29:32
# Step Size:    0.102E+00 s, Total Time:       0.10 s

# This pattern is MUUUCH more readable
# The total time is a decimal number such as 0.10, so it is parsed with `g` (general number, converted to float).
# An integer `d` field would stop at the decimal point and return 0 instead of 0.10.
pattern = "Time Step {:>d} {:^} Step Size: {:>} s, Total Time: {:>g}"
result = search(pattern, out)
result

In [None]:
# Get positional arguments
# `.fixed` holds the values captured by the unnamed `{}` fields, in pattern order
result.fixed

In [None]:
# You can also supply names to the different parameters
# `simtime` uses `g` (general number, converted to float) because the total time is a decimal such as 0.10;
# an integer `d` field would stop at the decimal point.
pattern = "Time Step {timestep:>d} {date:^} Step Size: {stepsize:>} s, Total Time: {simtime:>g}"
result = search(pattern, out)
result

In [None]:
# Extract the named variables
# `.named` is a dict keyed by the field names used in the pattern
result.named

In [None]:
# Let's load the list of dictionaries into pandas to have a look.
# findall yields one result per occurrence of the pattern in `out`; each `.named` dict becomes a row.
pd.DataFrame([r.named for r in findall(pattern, out)])

# Parse Stuff with Pandas
- https://pandas.pydata.org/
  - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
  - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_html.html

In [None]:
# Parse csv files
# header=1 takes the column names from the second line of the file (0-indexed) and skips the line above it
hrr = pd.read_csv('data/couch/couch_hrr.csv', header=1)
hrr

In [None]:
# Line chart of the `HRR` column (y) against the `Time` column (x)
alt.Chart(hrr).mark_line().encode(x='Time', y='HRR')

In [None]:
# Read a file formatted in fixed width font
# With no colspecs/widths given, pandas infers the column boundaries from the data
pd.read_fwf('data/fwf.txt')

In [None]:
# Read html tables on a web page (requires network access)
url = 'http://toscrape.com/'
# read_html returns a list of DataFrames, one per <table> found on the page
html_tables = pd.read_html(url)
# Index 1 selects the second table in that list
html_tables[1]

In [None]:
# Read from a JSON api (requires network access)
url = 'https://data.virginia.gov/resource/bre9-aqqr.json'
# read_json accepts a URL and turns the JSON payload into a DataFrame
pd.read_json(url)

# Parse using Grammar Parser
https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form

https://github.com/lark-parser/lark

In [None]:
# This gets a bit complex, but is a very powerful tool
# Lark grammar (EBNF-style) for namelist-like records of the form
#   &NAME PARAM=VALUE, PARAM=VALUE, VALUE ... /
# Rules (lowercase) become tree nodes:
#   start      : one or more records
#   record     : "&", a NAMELIST of exactly 4 uppercase letters, any number of
#                separator + key/value pairs, then a closing "/"
#   keyval     : PARAM "=" followed by a single VALUE or a value_list
#   value_list : two or more VALUEs joined by separators
# Terminals (uppercase) become tokens; names starting with "_" (e.g. _SEP) are
# filtered out of the resulting tree by Lark.
# NOTE(review): WS is both an alternative of _SEP and globally %ignore'd, and NEWLINE is
# imported but not referenced by any rule here -- confirm both are intentional.
grammar = '''
start: record+
record: "&" NAMELIST (_SEP keyval)* "/"
keyval: PARAM "=" (VALUE | value_list)
value_list: VALUE (_SEP VALUE)+

NAMELIST: UCASE_LETTER~4
PARAM: CHAR+
VALUE: SIGNED_NUMBER | QUOTED_STRING | ".FALSE." | ".TRUE."
CHAR: UCASE_LETTER | DIGIT | "_"
QUOTED_STRING: "'" _STRING_INNER "'"
_SEP: WS | ","

%import common._STRING_INNER
%import common.DIGIT
%import common.SIGNED_NUMBER
%import common.UCASE_LETTER
%import common.NEWLINE
%import common.WS
%ignore WS
'''
# Build the parser with Lark's default options; parsing begins at the `start` rule
parser = Lark(grammar)

In [None]:
INDENT = 4


def display(node, indent: int = INDENT):
    """Print a parse tree, one node per line, indented by depth.

    A ``Token`` (leaf) is printed with its line, column, type and value.
    Any other node is printed by its ``data`` label, after which each of
    its children is printed one ``INDENT`` step deeper.
    """
    padding = " " * indent
    if not isinstance(node, Token):
        # Branch node: show its label, then recurse into the children
        print(padding, node.data)
        deeper = indent + INDENT
        for sub_node in node.children:
            display(sub_node, deeper)
        return
    # Leaf token: position in the source text followed by what was matched
    print(padding, node.line, node.column, node.type, node.value)

In [None]:
# Load the FDS input file, run it through the grammar parser and print the resulting tree
fds_source = Path('data/couch/couch.fds').read_text()
tree = parser.parse(fds_source)
display(tree)

# The Future of Parsing in Python

- [PEP 622 - Structural Pattern Matching](https://www.python.org/dev/peps/pep-0622/)
- [PEP 634 - Structural Pattern Matching: Specification](https://www.python.org/dev/peps/pep-0634/)
- [PEP 635 - Structural Pattern Matching: Motivation and Rationale](https://www.python.org/dev/peps/pep-0635/)
- [PEP 636 - Structural Pattern Matching: Tutorial](https://www.python.org/dev/peps/pep-0636/)
- [Python 3.10 Pattern Matching in Action](https://www.youtube.com/watch?v=SYTVSeTgL3s)
- [Pattern matching tutorial for Pythonic code](https://mathspp.com/blog/pydonts/pattern-matching-tutorial-for-pythonic-code)