# DLISIO in a Nutshell

## Importing

In [None]:
%matplotlib inline

import os
import pandas as pd
import dlisio
import matplotlib.pyplot as plt
import numpy as np
import numpy.lib.recfunctions as rfn

### You can work with a single file using the cell below - or by adding an additional for loop to the code below, you can work through a list of files.  Another option is to use os.walk to get all .dlis files in a parent folder.  Example:

    # Walk every folder below folderpath and print the path of each DLIS file found.
    for (root, dirs, files) in os.walk(folderpath):
        for f in files:
            filepath = os.path.join(root, f)
            # Compare the extension case-insensitively so files named *.DLIS are found too.
            if filepath.lower().endswith('.dlis'):
                print(filepath)
                
### But for this example, we will work with a single .dlis file specified in the cell below.  Note that there are some .dlis file formats that are not supported by DLISIO yet - it is good to catch them in a try/except block if you are reading files en masse.

### We will load a file from the open source Volve dataset available here: https://data.equinor.com/dataset/Volve

In [None]:
# Location of the DLIS file used by every cell below. The path is specific to one
# machine, so change it to wherever your copy of the Volve download lives.
filepath = r"C:\Users\aruss\Downloads\Volve_Well_logs_pr_WELL\15_9-F-4\02.LWD_EWL\WLC_RAW_CAL-DEN-GR-NEU-REMP_MWD_1.DLIS"

## Reading a dlis file

### DLISIO will check each channel (curve metadata), which we will build a pandas dataframe for, then look in each frame for the curve values. There can be multiple frames, so the curves of each frame are stored in the list called curves_L.  You can also print the objects in the file - a variety of metadata - but be aware that printing all of them can use up a lot of memory.

In [None]:
# Open the file once and pull out two things:
#   * curve_index - one row of metadata (name, long name, units) per channel
#   * curves_L    - one array of curve values per frame
# NOTE(review): d.channels / d.frames / d.curves(fingerprint) follow the dlisio
# version this notebook was written against - confirm for the installed version.
with dlisio.load(filepath) as d:
    # Read each channel's metadata in a single pass over the channels.
    channel_rows = [(ch.name, ch.long_name, ch.units) for ch in d.channels]
    curves_name = [row[0] for row in channel_rows]
    longs = [row[1] for row in channel_rows]
    unit = [row[2] for row in channel_rows]
    curve_index = pd.DataFrame(
        {
            'Curve': curves_name,
            'Long': longs,
            'Unit': unit,
        }
    )
    # Curves are looked up per frame through the frame's fingerprint.
    curves_L = [d.curves(frame.fingerprint) for frame in d.frames]
    # To inspect the file's other metadata, iterate over d.objects here and print each one.

## Curve Index

In [None]:
# Let pandas display up to 1000 rows before truncating the output.
pd.options.display.max_rows = 1000

In [None]:
# Preview the first ten rows of the curve metadata table.
curve_index.head(10)

### Looks like we have some duplicates - we will catch these with a little bit of pandas and get a unique column name for each curve.

In [None]:
# Flag every repeat of a curve name after its first appearance.
curve_index['Dup'] = curve_index.duplicated('Curve')
# Suffix each repeat with its occurrence number (first repeat -> _1, second -> _2, ...)
# so that a name occurring three or more times still ends up with distinct names.
curve_index['Curve_Name'] = np.where(
    curve_index['Dup'],
    curve_index['Curve'] + "_" + curve_index.groupby('Curve').cumcount().astype(str),
    curve_index['Curve'],
)

In [None]:
# Preview the first ten rows again, now including the Dup and Curve_Name columns.
curve_index.head(10)

## Origin information (well name and header)

### Using the attributes of an origin object (for example origin.well_name), you can print any item individually from the header, such as the well name below.

In [None]:
# Default value so that later cells still run when the file contains no origin.
well_name = None
with dlisio.load(filepath) as d:
    for origin in d.origin:
        # Print the full origin (header) record.
        print(origin)
        # If the file has several origins, the well name of the last one is kept.
        well_name = origin.well_name

In [None]:
# Report the well name read from the origin record.
print("Well Name =", well_name)

## Curve Values

### The cell below will tell you how many frames exist in the file.  If you have multiple frames, you will probably want to loop through the frames.

In [None]:
# curves_L holds one entry per frame, so its length is the number of frames.
len(curves_L)

### We will look at one frame.  Curve values are returned as a NumPy structured array, which takes a few more steps to work with.

In [None]:
# Show the curves of the frame stored at position 1 (the second frame read).
curves_L[1]

### You can query all the curves that exist in the frame using the dtype.names:

In [None]:
# dtype.names lists the field (curve) names of the frame's structured array.
print(curves_L[1].dtype.names)

### We can look at the values for one curve - for example looking at both TDEPs for each frame.

In [None]:
# TDEP values of the frame at position 1, selected by field name.
curves_L[1]["TDEP"]

In [None]:
# TDEP values of the frame at position 0, for comparison with the other frame.
curves_L[0]["TDEP"]

### Or look at the values for a list of curves

In [None]:
# Indexing with a list of field names returns just those curves from the frame.
new_array = curves_L[1][['TDEP', 'RHOB','DRHO']]

In [None]:
# Display the selected TDEP / RHOB / DRHO sub-array.
new_array

### Let's get the units and the long descriptions then of these three curves from the curve index.  And then plot the array.  We will set the -999.25 values to nan.

In [None]:
# Index the curve metadata by the unique Curve_Name so rows can be looked up by name.
curve_reindexed = curve_index.set_index('Curve_Name')
# Metadata rows (long name, unit, ...) for the three selected curves.
curve_reindexed.loc[['TDEP','RHOB','DRHO']]

In [None]:
# Build a DataFrame from the selected curves and index it by depth (TDEP).
df = pd.DataFrame.from_records(new_array)
df = df.set_index('TDEP')
# Treat -999.25 values as missing. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
df = df.replace(-999.250000, np.nan)

In [None]:
# Line plot with one subplot per column, drawn against the DataFrame index (TDEP).
df.plot(kind='line',subplots=True,grid=True, legend='reverse')

## Compound Arrays

### Now sometimes curve values will have an array for a value at a certain index number - this gets trickier to handle.  Petrel for example won't even recognize that these curves exist in the dlis file.  These are set up as compound arrays.  Curve SAZ1 below has several samples that are actually arrays.

In [None]:
# Print the name of the field at position 44, then show that curve's values.
# The curve has to be selected by field name: indexing a structured array with an
# integer returns record (row) 44, not the 45th curve.
print(curves_L[1].dtype.names[44])
curves_L[1][curves_L[1].dtype.names[44]]

### So we convert everything to tuples, which allows the arrays to be values at index levels in a pandas dataframe.  We get the column names from the dtype.names of the NumPy structured array.

In [None]:
# Turn the structured array into a tuple of records so that array-valued samples
# can be stored as single cell values in the DataFrame.
converted_curves = tuple(curves_L[1])
# Column names are taken from the structured array's field names.
curves_df = pd.DataFrame.from_records(converted_curves, columns=curves_L[1].dtype.names)

In [None]:
# Index all curves by depth (TDEP).
curves_df = curves_df.set_index("TDEP")
# Treat -999.25 values as missing. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
curves_df = curves_df.replace(-999.25, np.nan)

In [None]:
# Show rows 300-314 (by position) of the full curve table.
curves_df.iloc[300:315]

In [None]:
# Plot the first 20 columns, one subplot each, on a tall (10 x 100) figure.
curves_df.iloc[:,0:20].plot(kind='line',subplots=True, figsize=(10,100),grid=True, legend='reverse')

### We can use the value_counts method to see how many curves actually represent multiple values per sample:

In [None]:
# Count how many columns there are of each dtype.
curves_df.dtypes.value_counts()

### And find their names and data:

In [None]:
# Keep only the object-dtype columns and show rows 300-314 (by position).
curves_df.select_dtypes(['object']).iloc[300:315]

### Taking another look at curve SAZ1, we can see that it has arrays for each index and its type is an object.

In [None]:
# Metadata row for the SAZ1 curve from the name-indexed curve table.
curve_reindexed.loc[['SAZ1']]

In [None]:
# First ten SAZ1 samples (by position).
curves_df['SAZ1'].iloc[:10]

### You can always expand out the values into their own columns using the apply method below:

In [None]:
# apply(pd.Series) expands each SAZ1 value into its own set of columns; show rows 300-314.
curves_df['SAZ1'].apply(pd.Series).iloc[300:315]

# Hopefully that is enough code to get you started working with DLISIO.  There is much more functionality which can be accessed with help(dlisio)