In [1]:
import parsimonious

ModuleNotFoundError: No module named 'parsimonious'

# Reading SPWLA file

The 3 methods for extracting data are:

- Looping over the file and applying string methods, etc.
- Using regex to extract everything at once.
- Using a parser.


## Looping SPWLA file

Let's start by just trying to read the file.

In [2]:
# Peek at the first 24 lines of the raw file (IPython shell escape; relies on a `head` command being available).
!head -24 ../data/core_analysis_example.spwla

10     2                                                                                                                       
    9999/9-9                                Norway                                            9Sep99
    Weatherford-Labs
15    10   10
          1507      1602  2031   0Weatherford-Labs    Nitrogen Permeability, Hor.
          1512      1602  2031   0Weatherford-Labs    Klinkenberg corrected gas perm, Hor.
          1510      1602  2031   0Weatherford-Labs    Nitrogen Permeability, Vert.
          1515      1602  2031   0Weatherford-Labs    Klinkenberg corrected gas perm, Vert.
          1402      1211  3084   0Weatherford-Labs    Porosity, Horizontal PLUG
          1403      1211  3084   0Weatherford-Labs    Porosity, Vertical PLUG
          1401      1212  3084   0Weatherford-Labs    Porosity, Summation
          1302      1103  3085   0Weatherford-Labs    CORE Oil Saturation
          1301      1103  3085   0Weatherford-Labs    CORE Water Satur

### Observations

- Some lines are 128 characters wide
- Some of the data is unidentifiable
- This is probably a job for striplog
- The numbers after the record type (10, 15, 20, 30, etc.) seem to be the number of lines (and perhaps the number of fields per line) in that record. This is redundant information, because we can just read until the next record type flag.

In [5]:
# Field names carried by each kind of record, keyed by the record's name.
# Several of the names are single-letter placeholders.
record_fields = dict(
    header=[['well', 'country', 'date'], ['company']],  # Occurs on 2 lines
    features=['a', 'b', 'c', 'd', 'company', 'feature'],
    range=['w', 'x', 'start', 'stop', 'y', 'z'],
    depth=['depth', 'alpha', 'beta'],
    descr=['description'],
    data=['data'],  # Capture as array
)

In [6]:
# Numeric record code (the number that starts a record line) -> record name.
record_type = dict([
    (10, 'header'),
    (15, 'features'),
    (20, 'range'),
    (30, 'depth'),
    (36, 'descr'),
    (40, 'data'),
])

In [172]:
fname = "../data/core_analysis_example.spwla"

# Read the whole file into one string; it is split into lines further down.
with open(fname, mode='r') as spwla_file:
    data = spwla_file.read()

def get_blocks(data):
    """
    Pair every content line with the code of the record it belongs to.

    A line whose first two characters are decimal digits starts a new record;
    it sets the current code and is not yielded itself. Every other line is
    yielded together with the most recent code.

    Args:
        data (iterable of str): The lines of the file, one string per line.

    Yields:
        tuple: (code, line), where code is the two-character record code as a
            string and line is the unmodified content line.
    """
    code = None
    for line in data:
        # isdecimal() rather than isnumeric(): it only accepts characters that
        # int() can convert, which is what callers do with the code.
        if line[:2].isdecimal():
            code = line[:2]
            continue
        if code is None:
            # Nothing to pair this line with yet (e.g. a blank or stray line
            # before the first record marker), so skip it instead of failing
            # on an unbound `code`.
            continue
        yield code, line

# Walk every (code, line) pair and look up the record's name and field names.
for code, line in get_blocks(data.split('\n')):
    rec_type = record_type[int(code)]  # KeyError if the code is not in record_type
    fields = record_fields[rec_type]  # NOTE(review): looked up but not used yet
    
    # NOTE(review): `features` is re-created on every iteration, so nothing
    # accumulates across lines, and only a None placeholder is appended.
    # This looks like unfinished work; confirm the intent.
    features = []
    if rec_type == 'features':
        features.append(None)

## Regex SPWLA file

This approach is adapted from [this StackOverflow question](https://stackoverflow.com/questions/47982949/how-to-parse-complex-text-files-using-python).

In [144]:
s = """10     2                                                                                                                       
    9999/9-9                                Norway                                            9Sep99
    Weatherford-Labs
15    10   10
          1507      1602  2031   0Weatherford-Labs    Nitrogen Permeability, Hor.
          1512      1602  2031   0Weatherford-Labs    Klinkenberg corrected gas perm, Hor.
          1510      1602  2031   0Weatherford-Labs    Nitrogen Permeability, Vert.
          1515      1602  2031   0Weatherford-Labs    Klinkenberg corrected gas perm, Vert.
          1402      1211  3084   0Weatherford-Labs    Porosity, Horizontal PLUG
          1403      1211  3084   0Weatherford-Labs    Porosity, Vertical PLUG
          1401      1212  3084   0Weatherford-Labs    Porosity, Summation
          1302      1103  3085   0Weatherford-Labs    CORE Oil Saturation
          1301      1103  3085   0Weatherford-Labs    CORE Water Saturation
          2451      1201  1086   0Weatherford-Labs    Grain Density, Hor.
20     1
        0.00     0.00  1918.00  1983.72  0.0  1
30     1
     1918.95     0.00   1.11
40     1   10
     -1002.00000 -1002.00000 -1002.00000 -1002.00000 -1002.00000    18.44722 -1002.00000    14.78718 -1002.00000 -1002.00000
30     1
     1919.95     0.00   2.11
40     1   10
     -1002.00000 -1002.00000 -1002.00000 -1002.00000 -1002.00000    17.06246 -1002.00000    18.06427 -1002.00000 -1002.00000
     """

In [133]:
# TESTING
# Try the field-name pattern on a single line of the features record.
# `re` is imported here because the notebook's other `import re` sits in a
# later cell; without it a top-to-bottom run fails with a NameError.
import re

# NOTE(review): this rebinds `s`, which the previous cell set to the full sample text.
s = "     1401      1212  3084   0Weatherford-Labs    Porosity, Summation\n"

rx_field = re.compile(r'''
    ^
    \s+?\d+?\s+?\d+?\s+?\d+?\s+?0\S+?\s+?(?P<field>[\w\d][- ,.\w\d]+?)\n
''', re.MULTILINE | re.VERBOSE)

[f.group('field') for f in rx_field.finditer(s)]

['Porosity, Summation']

In [200]:
import re

# Not using this one.
# rx_fields = re.compile(r'''
#     ^
#     15\s+?\d+?\s+?\d+?\n
#     (?P<fields>[\s\S]+?)
#     (?=^[^1]|\Z)
# ''', re.MULTILINE | re.VERBOSE)

# Captures the name at the end of each feature line: three integer columns,
# then a `0` immediately followed by non-space text, then the name itself
# (group `field`), which must be followed by a newline.
rx_fields = re.compile(
    r'''
    ^
    \s+?\d+?\s+?\d+?\s+?\d+?\s+?0\S+?\s+?(?P<field>[\w\d][- ,.\w\d]+?)\n
''',
    flags=re.VERBOSE | re.MULTILINE,
)

# Matches one type-30 record plus everything that follows it, up to the next
# line starting with `30` or the end of the text.
#   depth  - first number on the line after the `30` marker
#   seq    - third number on that line
#   record - all the text between that line and the next `30` marker
rx_depth = re.compile(
    r'''
    ^
    30\s+?1\n
    \s+?(?P<depth>[.\d]+?)[ \t]+?[.\d]+?[ \t]+?(?P<seq>[.\d]+?)\n
    (?P<record>[\s\S]+?)
    (?=^30|\Z)
''',
    flags=re.VERBOSE | re.MULTILINE,
)

rx_data = re.compile(r'''
    ^
    (?:36\s+?1\s+?1\n
    \s+?(?P<descr>.+?)\n)?
    40\s+?1\s+?\d+?\n
    \s+?(?P<data>[- .\d]+?)\n
''', re.MULTILINE | re.VERBOSE)


In [201]:
import pandas as pd
import numpy as np

def parse(s, null=-999.25):
    """
    Turn the text of an SPWLA file into a DataFrame.

    Column names come from `rx_fields`. Each `rx_depth` match supplies a depth
    and a sequence value, and every `rx_data` match inside that depth record
    becomes one row.

    Args:
        s (str): The text to parse.
        null (float): Value to replace with NaN throughout the result.

    Returns:
        pandas.DataFrame: Columns `depth` (float), `seq` (kept as the matched
            string), `descr` (string or None), then one float column per
            field name found.
    """
    field_names = [match.group('field') for match in rx_fields.finditer(s)]

    rows = []
    for depth_match in rx_depth.finditer(s):
        for data_match in rx_data.finditer(depth_match.group('record')):
            values = [float(x) for x in data_match.group('data').split()]
            rows.append((
                float(depth_match.group('depth')),
                depth_match.group('seq'),
                data_match.group('descr'),
                *values,
            ))

    header = ['depth', 'seq', 'descr'] + field_names
    return pd.DataFrame(rows, columns=header).replace(null, np.nan)

In [203]:
# The sample data shows -1002 where values are missing, so pass that as the
# null instead of relying on parse()'s default of -999.25.
parse(data, null=-1002)

Unnamed: 0,depth,seq,descr,"Nitrogen Permeability, Hor.","Klinkenberg corrected gas perm, Hor.","Nitrogen Permeability, Vert.","Klinkenberg corrected gas perm, Vert.","Porosity, Horizontal PLUG","Porosity, Vertical PLUG","Porosity, Summation",CORE Oil Saturation,CORE Water Saturation,"Grain Density, Hor."
0,1918.95,1.11,,,,,,,18.44722,,14.78718,,
1,1919.95,2.11,,,,,,,17.06246,,18.06427,,
2,1920.95,3.11,,,,,,,19.58139,,19.01052,,
3,1921.95,4.11,,,,,,,18.32791,,17.92610,,
4,1922.95,5.11,,,,,,,16.67100,,20.77644,,
5,1923.95,6.11,,,,,,,15.49629,,24.60543,,
6,1924.95,7.11,,,,,,,18.28268,,12.88050,,
7,1925.95,8.11,,,,,,,18.43782,,13.75609,,
8,1926.95,9.11,,,,,,,17.31157,,14.66162,,
9,1927.95,10.11,,,,,,,19.34322,,13.08335,,


## Parsing SPWLA file

Let's try using `parsimonious`. Example from [the docs](https://github.com/erikrose/parsimonious):

In [8]:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor

class EntryParser(NodeVisitor):
    """Parse one piece of text with a grammar and collect the parts in `self.entry`.

    The `visit_<rule>` methods store the matched text of the `name`, `gender`
    and `age` rules; every other node goes to `generic_visit` and is ignored.
    """

    def __init__(self, grammar, text):
        """Build a Grammar from `grammar`, parse `text` and visit the resulting tree."""
        self.entry = {}
        tree = Grammar(grammar).parse(text)
        self.visit(tree)

    def visit_name(self, n, vc):
        """Store the text of a `name` node."""
        self.entry['name'] = n.text

    def visit_gender(self, n, vc):
        """Store the text of a `gender` node."""
        self.entry['gender'] = n.text

    def visit_age(self, n, vc):
        """Store the text of an `age` node (kept as a string)."""
        self.entry['age'] = n.text

    def generic_visit(self, n, vc):
        """Fallback for every other node: do nothing."""
        return None

# Grammar for one comma-separated entry: a name, optionally followed by a
# gender and/or an age.
# The name class is [A-Za-z]; the range [A-z] would also match the
# punctuation characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
grammar = """\
entry = name sep gender? (sep age)?
sep = ws "," ws
ws = " "*
name = ~"[A-Za-z]*"
gender = "male" / "female"
age = ~"[0-9]*"
"""

# Three sample entries, with and without spaces and with a missing age.
text = """\
Bob, male, 26
Kim,female,30
Joe,male
"""

# Parse each line on its own and show the fields that were collected.
for line in text.splitlines():
    print(EntryParser(grammar, line).entry)

ModuleNotFoundError: No module named 'parsimonious'

In [2]:
# Look at the first 24 lines of the raw file again before writing a grammar for it.
!head -24 ../data/core_analysis_example.spwla

10     2                                                                                                                       
    9999/9-9                                Norway                                            9Sep99
    Weatherford-Labs
15    10   10
          1507      1602  2031   0Weatherford-Labs    Nitrogen Permeability, Hor.
          1512      1602  2031   0Weatherford-Labs    Klinkenberg corrected gas perm, Hor.
          1510      1602  2031   0Weatherford-Labs    Nitrogen Permeability, Vert.
          1515      1602  2031   0Weatherford-Labs    Klinkenberg corrected gas perm, Vert.
          1402      1211  3084   0Weatherford-Labs    Porosity, Horizontal PLUG
          1403      1211  3084   0Weatherford-Labs    Porosity, Vertical PLUG
          1401      1212  3084   0Weatherford-Labs    Porosity, Summation
          1302      1103  3085   0Weatherford-Labs    CORE Oil Saturation
          1301      1103  3085   0Weatherford-Labs    CORE Water Satur

In [None]:
# Define a context-free grammar.
# NOTE(review): this grammar looks only partly adapted to the SPWLA format.
#   - `school_block`, `data_line`, `name_header`, `score_header`, `index`,
#     `comma` and `score` are referenced but not defined in this grammar.
#   - In `description`, the `,- ` inside the character class reads as a range
#     from ',' to ' ', which is reversed; presumably a literal '-' was meant
#     (e.g. `[-., A-Za-z]`). Confirm before running.
#   - This rebinds `grammar`, which an earlier cell set to a plain string.
grammar = Grammar(
    r"""
    schools         = (school_block / ws)+

    data_block    = depth_header ws data_line 
    descr_block   = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws? 

    school_header   = ~"^School = (.*)"m
    grade_header    = ~"^Grade = (\d+)"m
    depth_header    = "30 lines"
    data_header     = "40" ws "1" ws lines

    number_name     = index comma name ws
    number_score    = index comma score ws

    type            = number+
    lines           = number+

    number          = ~"\d+"
    name            = ~"[A-Z]\w+"
    depth           = ~"[.0-9]\w+"
    description     = ~"[.,- A-Za-z]+"
    ws              = ~"\s*"
    """
)

In [None]:
from parsimonious.nodes import NodeVisitor

class DataVisitor(NodeVisitor):
    """Visitor that folds `expr`, `entry`, `section` and `pair` nodes into a nested dict."""

    def visit_expr(self, node, visited_children):
        """ Merges the first item of every visited child into one dict. """
        merged = {}
        for child in visited_children:
            merged.update(child[0])
        return merged

    def visit_entry(self, node, visited_children):
        """ Builds {section: {key: value, ...}} from a (section, pairs) couple. """
        section_name, pairs = visited_children
        return {section_name: dict(pairs)}

    def visit_section(self, node, visited_children):
        """ Returns the text of the second visited child (the section name). """
        _, name_node, *_ = visited_children
        return name_node.text

    def visit_pair(self, node, visited_children):
        """ Returns (key text, value text), read from the node's own children. """
        key_node, _, value_node, *_ = node.children
        return key_node.text, value_node.text

    def generic_visit(self, node, visited_children):
        """ Fallback: the visited children if there are any, else the node itself. """
        if visited_children:
            return visited_children
        return node

# Run the visitor over a parse tree and show what it collected.
# NOTE(review): `tree` is not assigned in any visible cell. Presumably it
# should be the result of parsing the file text with the grammar; confirm.
dv = DataVisitor()
out = dv.visit(tree)

print(out)