# Reading data from files

In [1]:
import glob
import os
import re
from collections import namedtuple

import numpy as np
import pandas as pd
import pint
ureg = pint.UnitRegistry()

In [2]:
# List the input files: paths matching `UWI*`, every path in the folder,
# and the paths that do not contain 'UWI' anywhere in them.
uwis = glob.glob('../../data/reading_files/UWI*')
wells = glob.glob('../../data/reading_files/*')
txts = [path for path in glob.glob('../../data/reading_files/*') if 'UWI' not in path]

In [3]:
def make_lines_from_file(fname, data_dir='../../data/reading_files'):
    """Return every line of a data file as a list of strings.

    Parameters
    ----------
    fname : str
        File name (or relative path) appended to `data_dir` with a `/`.
    data_dir : str, optional
        Directory prefix for the file. The default is the folder that used
        to be hard-coded, so single-argument calls behave exactly as before.

    Returns
    -------
    list of str
        The lines of the file, as returned by `readlines()` (trailing
        newlines are kept).

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The path is built by plain string formatting, exactly as the original
    # hard-coded version did, so existing callers resolve to the same files.
    with open(f'{data_dir}/{fname}', 'r') as f:
        return f.readlines()

In [4]:
def find_idxs(lines, comment='#'):
    """Locate the `Formations` and `Porosity` header lines.

    Each header is expected to be exactly `<comment> <Title>` followed by a
    newline. Returns the pair of positions `(formations, porosity)` within
    `lines`; `list.index` raises ValueError if either header is absent.
    """
    headers = (f'{comment} Formations\n', f'{comment} Porosity\n')
    # Looked up in order, so a missing Formations header is reported first.
    top_form, top_pors = (lines.index(header) for header in headers)
    return top_form, top_pors

In [5]:
def get_units(lines):
    """Return the depth units declared in `lines`.

    Lines are scanned in order, case-insensitively, and the first one that
    contains `units: m` or `units: f` decides the result.

    Parameters
    ----------
    lines : iterable of str
        Lines of a well file.

    Returns
    -------
    str or None
        'm' or 'ft', or None when no line declares units (including when
        `lines` is empty, which previously raised UnboundLocalError).
    """
    for line in lines:
        lowered = line.lower()
        if 'units: m' in lowered:
            return 'm'
        if 'units: f' in lowered:
            return 'ft'
    return None

In [6]:
def find_kb(lines, comment='#'):
    """Extract the numbers from the first line containing `kb:`.

    Parameters
    ----------
    lines : iterable of str
        Lines of a well file.
    comment : str, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    list of str or None
        Every unsigned number (integer or decimal) found on the first line
        containing `kb:` (case-insensitive), as strings. None when no line
        contains `kb:`.
    """
    # The decimal point is escaped and the fractional part optional. The old
    # pattern used a bare `.`, which required one extra character of any kind
    # after the digits: `kb: 5` did not match and `1234ft` gave '1234f'.
    pattern = r'\d+(?:\.\d*)?'
    for line in lines:
        if 'kb:' in line.lower():
            return re.findall(pattern, line)
    return None

In [7]:
def find_comment(lines):
    """Return the comment character used in `lines`.

    The comment character is the first character of the first line that
    starts with something other than a letter, a digit or whitespace.

    Parameters
    ----------
    lines : iterable of str
        Lines of a well file.

    Returns
    -------
    str or None
        The single comment character, or None when no line qualifies
        (previously this raised UnboundLocalError).
    """
    for line in lines:
        first = line[:1]  # slice, so an empty string cannot raise IndexError
        # Blank or indented lines used to yield '\n' or ' ' as the "comment".
        if not first or first.isspace():
            continue
        if not first.isalpha() and not first.isdigit():
            return first
    return None

In [8]:
def find_sep(lines):
    """Detect the field separator used in the porosity section.

    Looks only at the line right after the Porosity header and returns its
    first character that is not a letter, a digit, a `.` or a newline.
    Raises IndexError when that line is missing or has no such character.
    """
    comment = find_comment(lines)
    _, top_pors = find_idxs(lines, comment=comment)
    first_data_line = lines[top_pors + 1]
    excluded = {'.', '\n'}
    candidates = [
        char for char in first_data_line
        if not (char.isalpha() or char.isdigit() or char in excluded)
    ]
    return candidates[0]

In [9]:
def get_xy(lines, convert_units=True):
    """Read the well location from the first line containing `loc:`.

    Coordinates must be written with a decimal point; the pattern does not
    match bare integers. Unit conversion is only ever made from ft to m.

    Parameters
    ----------
    lines : iterable of str
        Lines of a well file.
    convert_units : bool, optional
        When True and the file declares feet (see `get_units`), the
        coordinates are converted to metres.

    Returns
    -------
    tuple of float
        `(x, y)`.

    Raises
    ------
    ValueError
        If no line contains `loc:`, or that line does not hold exactly two
        decimal numbers.
    """
    pattern = r'-?\d{1,}\.\d{1,}'
    units = get_units(lines)

    for line in lines:
        if 'loc:' in line.lower():
            matches = re.findall(pattern, line)
            break
    else:
        # Previously this surfaced as an obscure TypeError from float().
        raise ValueError('no `loc:` line found')

    if len(matches) != 2:
        raise ValueError(
            f'expected 2 coordinates on the `loc:` line, found {len(matches)}'
        )

    x, y = (float(match) for match in matches)
    if convert_units and units == 'ft':
        x = (x * ureg.ft).to(ureg.m).magnitude
        y = (y * ureg.ft).to(ureg.m).magnitude
    return x, y

In [10]:
def get_pors_from_lines(lines, top_pors, depth_units, poro_to_SI=False, sep=',', tvdss_from_md=False, kb=None):
    """Parse the porosity section that follows the Porosity header.

    Parameters
    ----------
    lines : list of str
        Lines of a well file.
    top_pors : int
        Index of the Porosity header; parsing starts on the next line and
        runs to the end of `lines`.
    depth_units : str or None
        Depth units of the file, as returned by `get_units`.
    poro_to_SI : bool, optional
        When True and `depth_units` is 'm', porosities below 1 are
        multiplied by 100.
        NOTE(review): the scaling is gated on the depth units being 'm';
        confirm that this is intended.
    sep : str, optional
        Field separator between depth and porosity.
    tvdss_from_md : bool, optional
        When True and `kb` is truthy, `kb` is subtracted from each depth.
    kb : float or None, optional
        Value subtracted from depths when `tvdss_from_md` is set.

    Returns
    -------
    dict
        Maps depth (float, or None when the depth cannot be parsed) to
        porosity (float).

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields, or its
        porosity is not a number.
    """
    porosities = {}
    # Both switches are loop-invariant, so evaluate them once.
    shift_by_kb = bool(tvdss_from_md and kb)
    scale_fractions = poro_to_SI and depth_units == 'm'

    for line in lines[top_pors + 1:]:
        # Blank lines (e.g. at the end of the file) used to crash the unpack.
        if not line.strip():
            continue
        depth_str, poro_str = line.split(sep)

        try:
            depth = float(depth_str)
        except ValueError:
            depth = None
        # An unparsable depth stays None instead of raising on `None - kb`.
        if shift_by_kb and depth is not None:
            depth -= kb

        poro = float(poro_str)
        if scale_fractions and poro < 1:
            poro *= 100

        porosities[depth] = poro

    return porosities

In [11]:
def read_tops_from_file(fname,
                        null_vals=None,
                        keep_nulls=True,
                        convert_units=True,
                        poro_to_SI=False,
                        comment='#',
                        sep=',',
                        tvdss_from_md=False):
    """Read one well file and extract its formation tops and porosities.

    Parameters
    ----------
    fname : str
        File to read, passed to `make_lines_from_file`.
    null_vals : object, optional
        Value stored for a top whose depth cannot be parsed. A parsed depth
        equal to `null_vals` is treated as null too.
    keep_nulls : bool, optional
        When False, null tops are left out of the result.
    convert_units : bool, optional
        When True and the file declares feet, top depths are converted to
        metres.
    poro_to_SI : bool, optional
        Forwarded to `get_pors_from_lines`.
    comment : str, optional
        Comment character that prefixes the section headers.
    sep : str, optional
        Field separator used in the data lines.
    tvdss_from_md : bool, optional
        When True, the KB value is subtracted from every non-null top
        (before any unit conversion) and forwarded to the porosity parser.

    Returns
    -------
    tuple
        `(tops, units, porosities)`: a dict of formation name to depth, the
        units string from `get_units`, and the dict returned by
        `get_pors_from_lines`.

    Raises
    ------
    ValueError
        If a section header is missing, a non-blank data line does not split
        into exactly two fields, or `tvdss_from_md` is set but the file has
        no KB value.
    """
    lines = make_lines_from_file(fname)
    top_form, top_pors = find_idxs(lines, comment)
    units = get_units(lines)

    # The KB value is only mandatory when it is actually used, so a file
    # without one can still be read with tvdss_from_md=False.
    kb_matches = find_kb(lines, comment=comment)
    kb = float(kb_matches[0]) if kb_matches else None
    if tvdss_from_md and kb is None:
        raise ValueError(f'no KB value found in {fname!r}; cannot apply tvdss_from_md')

    tops = {}
    for line in lines[top_form + 1:top_pors]:
        # Blank lines between the sections used to crash the unpack.
        if not line.strip():
            continue
        name, depth_str = line.split(sep)

        try:
            depth = float(depth_str)
        except ValueError:
            is_null = True
        else:
            is_null = depth == null_vals

        # Nulls are settled before any arithmetic: they are neither shifted
        # by KB nor unit-converted, and the check cannot be defeated by the
        # shift having changed the value.
        if is_null:
            if keep_nulls:
                tops[name] = null_vals
            continue

        if tvdss_from_md:
            depth -= kb

        if convert_units and units == 'ft':
            depth = (depth * ureg.ft).to(ureg.m).magnitude

        tops[name] = depth

    porosities = get_pors_from_lines(lines,
                                     top_pors,
                                     units,
                                     poro_to_SI=poro_to_SI,
                                     sep=sep,
                                     tvdss_from_md=tvdss_from_md,
                                     kb=kb
                                    )

    return tops, units, porosities

In [12]:
#fnames = ['UWI_4900521053.txt', 'UWI_4900523208.txt', 'UWI_4900522918.txt']
#for fname in fnames:
#    tops, units, porosities = read_tops_from_file(fname, poro_to_SI=True)
#    print(f'{80*"_"}\n{fname}\n{18*"="}\ntops:\n\n{tops}\n{18*"="}\nunits: {units}\n{18*"="}\nporosities:\n\n{porosities}\n{18*"="}\n')

In [13]:
#fname = '4900523357.txt'
#tops, units, porosities = read_tops_from_file(fname, comment='%', sep='\t', poro_to_SI=True, tvdss_from_md=True)
#print(f'{80*"_"}\n{fname}\n{18*"="}\ntops:\n\n{tops}\n{18*"="}\nunits: {units}\n{18*"="}\nporosities:\n\n{porosities}\n{18*"="}\n')

In [14]:
def parse_file(fname):
    """Read a well file and detect its layout metadata.

    Returns a 5-tuple: the file's lines, the comment character, the field
    separator, and the x and y coordinates given by `get_xy`.
    """
    file_lines = make_lines_from_file(fname)
    comment_char = find_comment(file_lines)
    separator = find_sep(file_lines)
    x_coord, y_coord = get_xy(file_lines)
    return file_lines, comment_char, separator, x_coord, y_coord

In [15]:
def gather_all_wells(wells, verbose=False):
    """Parse every well file and collect the results in one dictionary.

    Parameters
    ----------
    wells : iterable of str
        Paths of the well files; each is passed to `parse_file` and
        `read_tops_from_file`.
    verbose : bool, optional
        When True, print the detected metadata of each well while parsing.

    Returns
    -------
    dict
        Maps each file's base name to a dict with the keys 'units', 'x',
        'y', 'tops' and 'porosities'.
    """
    all_tops = {}
    if verbose:
        print(f'Logging:\n{40*"="}')
    for well in wells:
        # basename is the same as split('/')[-1] on POSIX paths, and also
        # handles the platform's own separator.
        well_name = os.path.basename(well)
        # The lines returned by parse_file are not needed here.
        _, comment, sep, x, y = parse_file(well)
        if verbose:
            print(f'\nWell name:\t{well_name}\ncomment:\t{comment}\nsep:\t\t{sep!r}\nx:\t\t{x}\ny:\t\t{y}')
        tops, units, porosities = read_tops_from_file(well,
                                                      comment=comment,
                                                      sep=sep,
                                                      poro_to_SI=True,
                                                      tvdss_from_md=True)

        all_tops[well_name] = {'units': units,
                               'x': x,
                               'y': y,
                               'tops': tops,
                               'porosities': porosities}
    if verbose:
        print(f'\n{16*"="}\nall_tops created')
    return all_tops

In [17]:
# Parse every path listed in `wells` into one nested dictionary keyed by
# file name, then display it as the cell output.
all_tops = gather_all_wells(wells, verbose=False)
all_tops

{'UWI_4900527320.txt': {'units': 'ft',
  'x': 40629.99849599999,
  'y': -34852.999127999996,
  'tops': {'Ardmore': 920.4959999999999,
   'Sussex Upper Base': 940.0031999999999,
   'Sussex Upper Top': 934.5167999999999,
   'Cody': 929.0303999999999,
   'Niobrara': 944.2703999999999,
   'Sussex Lower Base': 943.6607999999999,
   'Sussex Lower Top': 940.9175999999999},
  'porosities': {3062.0: 7.5,
   3064.0: 7.5,
   3066.0: 12.0,
   3067.0: 13.3,
   3068.0: 12.0,
   3070.0: 9.7,
   3072.0: 10.3,
   3074.0: 11.0,
   3076.0: 12.0,
   3078.0: 11.0,
   3080.0: 10.0,
   3082.0: 9.7,
   3084.0: 9.2,
   3086.0: 10.6,
   3088.0: 5.0,
   3090.0: 5.0,
   3092.0: 12.2,
   3094.0: 13.2,
   3096.0: 8.0,
   3098.0: 5.3,
   3100.0: 4.2,
   3102.0: 6.0,
   3104.0: 6.2,
   3106.0: 6.3,
   3108.0: 6.0,
   3110.0: 5.7,
   3112.0: 5.5}},
 'UWI_4900523333.txt': {'units': 'ft',
  'x': 36110.000424,
  'y': -29575.000031999996,
  'tops': {'Ardmore': 930.2495999999999,
   'Cody': 943.0511999999999,
   'Sussex Up

In [19]:
# Show which fields are stored for a single well.
all_tops['UWI_4900527320.txt'].keys()

dict_keys(['units', 'x', 'y', 'tops', 'porosities'])